# Базовый пайплайн для соревнования по определению контрафакта 

### 1. Загрузка данных

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Load the competition tables; the first CSV column is used as the row index.
# NOTE(review): the file names appear to spell "сounterfeit" with a non-ASCII
# first letter — keep the literals exactly as they are so they match the files
# on disk; confirm against the actual data directory.
df_train = pd.read_csv('ml_ozon_сounterfeit_train.csv', index_col=0)
df_test = pd.read_csv('ml_ozon_сounterfeit_test.csv', index_col=0)

# Quick sanity summary: table sizes and the class balance of the target.
print(
    f"Train shape: {df_train.shape}",
    f"Test shape: {df_test.shape}",
    "Target distribution in train:",
    sep="\n",
)
print(df_train['resolution'].value_counts())
print()

Train shape: (197198, 44)
Test shape: (22760, 43)
Target distribution in train:
resolution
0    184146
1     13052
Name: count, dtype: int64



### 2. Предобработка данных
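Используем 39 числовых признаков: все числовые столбцы обучающей выборки, кроме целевой переменной `resolution`. Пропуски в признаках заполняем нулями.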

In [37]:
# Feature list: every numeric-dtype column of the training frame except the
# target column 'resolution', which must not be fed to the model as an input.
numeric_columns = [
    col
    for col in df_train.select_dtypes(include=[np.number]).columns
    if col != 'resolution'
]



In [38]:
# Target vector: the 'resolution' column of the training frame.
y_train = df_train['resolution']

# Feature matrices for train and test use the same column list, and missing
# values are replaced with 0 in both so the two frames are treated identically.
X_train, X_test = (
    frame[numeric_columns].fillna(0) for frame in (df_train, df_test)
)

# Report the resulting shapes, followed by an empty line.
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    "",
    sep="\n",
)


X_train shape: (197198, 39)
X_test shape: (22760, 39)



### 3. Обучение модели

In [39]:

# One seed shared by the split and the forest, so the cell is reproducible and
# the value is changed in a single place.
RANDOM_STATE = 42

# Hold out 20% of the labelled rows for validation; stratify on the target so
# both parts keep the same class proportions.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=RANDOM_STATE, stratify=y_train
)

# n_jobs=-1 lets scikit-learn use all available CPU cores for fit/predict
# instead of the single-job default. The seed is unchanged, so the fitted
# forest is expected to be the same — TODO confirm on a re-run.
model = RandomForestClassifier(
    n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1
)
model.fit(X_train_split, y_train_split)

# Score the held-out part.
val_pred = model.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, val_pred)

# NOTE(review): the recorded target counts are heavily skewed towards class 0,
# so overall accuracy alone says little; read the per-class rows of the
# classification report (especially class 1) when judging the model.
print(f"Validation accuracy: {val_accuracy:.4f}")
print("Classification report:")
print(classification_report(y_val_split, val_pred))
print()


Validation accuracy: 0.9651
Classification report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     36830
           1       0.88      0.54      0.67      2610

    accuracy                           0.97     39440
   macro avg       0.93      0.77      0.83     39440
weighted avg       0.96      0.97      0.96     39440




### 4. Формирование submission.csv

In [40]:

# Predict a class label for every row of the test feature matrix.
test_predictions = model.predict(X_test)

# Submission table: the test frame's index goes into 'id' and the predicted
# label into 'prediction'; the CSV is written without the positional index.
submission = pd.DataFrame({'id': df_test.index}).assign(
    prediction=test_predictions
)
submission.to_csv('submission.csv', index=False)

# Summary: number of rows written and how many of each label were predicted.
print(
    f"Создан файл submission.csv с {len(submission)} предсказаниями",
    "Распределение предсказаний:",
    sep="\n",
)
print(submission['prediction'].value_counts())
print()

Создан файл submission.csv с 22760 предсказаниями
Распределение предсказаний:
prediction
0    22465
1      295
Name: count, dtype: int64

