In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
import pickle
import joblib

In [2]:
# Load the cleaned training table; later cells build the features and the target from `df`.
train_csv_path = 'cleaned_ozon_data_TRAIN.csv'
df = pd.read_csv(train_csv_path)

### Выбираем все столбцы с текстовыми данными и векторизуем их

In [5]:
# Concatenate the four text columns into one space-separated document per row.
# Missing values become empty strings so the concatenation never yields NaN.
text_columns = ['description', 'name_rus', 'brand_name', 'CommercialTypeName4']
merged_text = df[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    merged_text = merged_text + ' ' + df[column_name].fillna('')
df['all_text'] = merged_text

# TF-IDF over unigrams and bigrams, capped at 1000 features; terms that occur in
# fewer than 2 documents are ignored.
tfidf_settings = {
    'max_features': 1000,
    'ngram_range': (1, 2),
    'min_df': 2,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# NOTE(review): the vectorizer is fitted on every row of `df`, while the train/test
# split happens only in a later cell, so held-out rows contribute to the vocabulary
# and IDF weights. Consider fitting on the training part only.
X_text = tfidf.fit_transform(df['all_text'])

### Объединяем числовые и текстовые признаки

In [6]:
# Numeric columns we would like to use as features.
requested_numeric_features = [
    'rating_1_count', 'rating_2_count', 'rating_3_count', 'rating_4_count', 'rating_5_count',
    'comments_published_count', 'photos_published_count', 'videos_published_count',
    'PriceDiscounted', 'item_time_alive', 'item_count_sales30', 'GmvTotal30'
]

# Keep only the columns that actually exist in `df`; absent ones are dropped silently.
available_columns = df.columns
numeric_features = [name for name in requested_numeric_features if name in available_columns]

# Missing numeric values are replaced with 0 before scaling.
numeric_frame = df[numeric_features].fillna(0)
X_numeric = numeric_frame.values

# NOTE(review): the scaler is fitted on all rows of `df`, i.e. before the train/test
# split in a later cell, so the held-out rows influence the mean/variance estimates.
scaler = StandardScaler()
scaler.fit(X_numeric)
X_numeric_scaled = scaler.transform(X_numeric)

# Final design matrix: TF-IDF columns first, then the scaled numeric columns.
numeric_sparse = csr_matrix(X_numeric_scaled)
X_combined = hstack([X_text, numeric_sparse])

### Разделяем данные на обучающую и тестовую выборки

In [7]:
# Target variable: the `resolution` column as a NumPy array.
y = df['resolution'].values

# Split the data: 20% is held out for evaluation, with a fixed seed for
# reproducibility and stratification on the target.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, **split_options)

### Обучение модели (самый простой способ)

In [None]:
# Random forest baseline: 100 trees of depth at most 10, a fixed seed, and
# class weights set to 'balanced'.
forest_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_settings)
model.fit(X_train, y_train)

# Evaluate on the held-out part produced by the split.
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)

print(f"F1-score: {f1:.4f}")
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

### Пайплайн модели

In [None]:
# Bundle the fitted vectorizer, scaler and model, plus the numeric column list,
# under one name so the inference cells can look everything up by key.
pipeline = dict(
    tfidf=tfidf,
    scaler=scaler,
    model=model,
    numeric_features=numeric_features,
)

### Читаем тестовые данные

In [None]:
# Load the rows to be scored; the following cells derive features from `new_data`.
new_data_csv_path = 'cleaned_ozon_data.csv'
new_data = pd.read_csv(new_data_csv_path)

### Выбираем нужные колонки из тестовых данных

In [None]:
# Build the per-row text document from the same four columns, in the same order,
# that were concatenated for the training frame; NaN is treated as an empty string.
inference_text_columns = ['description', 'name_rus', 'brand_name', 'CommercialTypeName4']
joined_text = new_data[inference_text_columns[0]].fillna('')
for text_column in inference_text_columns[1:]:
    joined_text = joined_text + ' ' + new_data[text_column].fillna('')
new_data['combined_text'] = joined_text

### Преобразование признаков

In [None]:
# Vectorize the new rows with the already-fitted TF-IDF vocabulary (transform only, no refit).
X_new_text = pipeline['tfidf'].transform(new_data['combined_text'])

# The scaler was fitted on a plain NumPy array (`df[...].fillna(0).values`), so pass an
# array here too: handing it a DataFrame makes scikit-learn warn that X has feature names
# the scaler was fitted without, and leaves train/inference preprocessing inconsistent.
X_new_numeric = pipeline['scaler'].transform(
    new_data[pipeline['numeric_features']].fillna(0).values
)

# Same column layout as in training: text features first, then scaled numeric features.
X_new_combined = hstack([X_new_text, csr_matrix(X_new_numeric)])

### Использование пайплайна

In [None]:
# Score every new row: hard labels and the per-class probabilities.
classifier = pipeline['model']
prediction = classifier.predict(X_new_combined)
prediction_proba = classifier.predict_proba(X_new_combined)

# Only the first row is printed, as a quick sanity check of the output format.
print(f"Предсказание: {'Контрафакт' if prediction[0] == 1 else 'Оригинал'}")
print(f"Вероятности: {prediction_proba[0]}")

### Файл Submission

In [None]:
# Pair each id from the scored data with its predicted label.
submission_columns = {
    'id': new_data['id'],  # ids come from the scored (test) data
    'prediction': prediction,
}
submission = pd.DataFrame(submission_columns)

# Write without the index so the file contains only the two columns above.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)

print("Файл submission.csv создан!")
print(f"Количество строк в submission: {len(submission)}")