In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier

In [2]:
DATA_DIR = Path().resolve().parent / "data"
train_csv = DATA_DIR / "raw" / "ml_ozon_сounterfeit_train.csv"
test_csv = DATA_DIR / "raw" / "ml_ozon_сounterfeit_test.csv" 

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197198 entries, 0 to 197197
Data columns (total 45 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            197198 non-null  int64  
 1   resolution                    197198 non-null  int64  
 2   brand_name                    116667 non-null  object 
 3   description                   171138 non-null  object 
 4   name_rus                      197198 non-null  object 
 5   CommercialTypeName4           197198 non-null  object 
 6   rating_1_count                47193 non-null   float64
 7   rating_2_count                47193 non-null   float64
 8   rating_3_count                47193 non-null   float64
 9   rating_4_count                47193 non-null   float64
 10  rating_5_count                47193 non-null   float64
 11  comments_published_count      47193 non-null   float64
 12  photos_published_count        47193 non-null

In [4]:
def avg_rate(row):
    if row["rating_1_count"] == None:
        return None
    rating = 0
    rate_count = 0
    for i in range(1, 6):
        rate_key = f"rating_{i}_count"
        rating += row[rate_key] * i
        rate_count += row[rate_key]
    return rating/rate_count

def merge_ratings(df):
    df["rating"] = df[[f"rating_{i}_count" for i in range(1, 6)]].apply(avg_rate, axis=1)
    df = df.drop(columns=[f"rating_{i}_count" for i in range(1, 6)])
    return df

In [5]:
# чёт скор хуже с этими изменениями
# df_test = merge_ratings(df_test)
# df_train = merge_ratings(df_train)

In [6]:
TARGET = "resolution"
ID_COL = "id"
drop_cols = [ID_COL, TARGET, "name_rus", "description", "ItemID", "SellerID", "brand_name", "CommercialTypeName4"]
feature_cols = [c for c in df_train.columns if c not in drop_cols]

In [7]:
X = df_train[feature_cols]
y = df_train[TARGET].astype(int)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier

X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

lgbm = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    class_weight="balanced",
)

lgbm.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    eval_metric="binary_logloss",
)

p_va = lgbm.predict_proba(X_va)[:, 1]

ths = np.linspace(0.05, 0.95, 19)
f1_by_thr = [(t, f1_score(y_va, (p_va >= t).astype(int))) for t in ths]
best_thr, best_f1 = max(f1_by_thr, key=lambda x: x[1])
print(f"Best F1={best_f1:.4f} @ thr={best_thr:.2f}")


[LightGBM] [Info] Number of positive: 10442, number of negative: 147316
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030833 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6781
[LightGBM] [Info] Number of data points in the train set: 157758, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Best F1=0.6579 @ thr=0.80


In [9]:
X_test = df_test[feature_cols]

p_test = lgbm.predict_proba(X_test)[:, 1]
pred_test = (p_test >= best_thr).astype(int)

submission = pd.DataFrame({
    ID_COL: df_test[ID_COL],
    "prediction": pred_test,
})

submission.to_csv("submission.csv", index=False)
print("Saved: submission.csv")


Saved: submission.csv
