# Baseline

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Return the class-weighted mean of one-vs-rest ROC AUC scores.

    Args:
        y_true: True class labels, forwarded to ``roc_auc_score``.
        y_pred: Per-class scores, forwarded to ``roc_auc_score``.
        labels: Class labels; each one must be a key of ``weights_dict``.
        weights_dict: Mapping from class label to its unnormalised weight.

    Returns:
        Sum of the per-class AUC values multiplied by the weights after the
        weights have been normalised to sum to one.
    """
    # Look up the raw weight of every label, in the order given by `labels`.
    raw_weights = np.array([weights_dict[label] for label in labels])
    normalised_weights = raw_weights / raw_weights.sum()
    # average=None makes roc_auc_score return one score per class.
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    return sum(normalised_weights * per_class_auc)

## Загрузка данных

In [3]:
# Read the train and test tables from their parquet files.
train_df, test_df = (
    pd.read_parquet(path) for path in ("train_data.pqt", "test_data.pqt")
)
# Show both shapes as the cell output.
(train_df.shape, test_df.shape)

((600000, 93), (290120, 92))

In [4]:
# Names of the categorical columns in the raw monthly tables.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

In [5]:
def m6_prepoc(data, type):
    """Join three monthly snapshots into one wide row per ``id``.

    The rows of three consecutive months are selected by their ``date`` value
    and right-joined on ``id``, so every row of the last month is kept.
    Overlapping columns of the first month get the suffix ``_m1``, those of
    the second month ``_m2``, and the last month keeps the original names.
    The three date columns are dropped from the result.

    Args:
        data: DataFrame with at least the columns ``id`` and ``date``
            (and ``end_cluster`` when ``type == 0``). It is not modified.
        type: ``0`` selects ``month_1``..``month_3`` and drops
            ``end_cluster``; ``1`` selects ``month_4``..``month_6``.

    Returns:
        The merged wide DataFrame.

    Raises:
        ValueError: If ``type`` is neither ``0`` nor ``1``.
    """
    if type == 0:
        tmp = data.drop(columns='end_cluster')
        months = ('month_1', 'month_2', 'month_3')
    elif type == 1:
        tmp = data.copy()
        months = ('month_4', 'month_5', 'month_6')
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on t1.
        raise ValueError(f"type must be 0 or 1, got {type!r}")

    # reset_index returns a new frame, so the .loc slices are never mutated
    # in place (avoids SettingWithCopyWarning).
    t1, t2, t3 = (
        tmp.loc[tmp.date == month].reset_index(drop=True) for month in months
    )

    data_m3 = t1.merge(t2, on='id', how='right', suffixes=['_m1', '_m2'])
    data_m3 = data_m3.merge(t3, on='id', how='right')
    data_m3 = data_m3.drop(columns=['date', 'date_m2', 'date_m1'])

    return data_m3


In [6]:
# Wide frames: months 1-3 for train (type 0), months 4-6 for test (type 1).
train_m3 = m6_prepoc(data=train_df, type=0)
test_m3 = m6_prepoc(data=test_df, type=1)

In [7]:
# Categorical column names of the wide frame: every base name with the
# first-month suffix, then the second-month suffix, then unsuffixed.
cat_l = [
    base_name + month_suffix
    for month_suffix in ('_m1', '_m2', '')
    for base_name in (
        'channel_code',
        'city',
        'city_type',
        'index_city_code',
        'ogrn_month',
        'ogrn_year',
        'okved',
        'segment',
        'start_cluster',
    )
]

In [8]:
# Object-dtype columns are treated as categorical features.
cat_feats = train_m3.select_dtypes(include=['object']).columns.tolist()
# Everything else, apart from the 'id' key, is treated as numeric.
non_numeric = set(cat_feats) | {'id'}
num_feats = [col for col in train_m3.columns if col not in non_numeric]
feats = [*cat_feats, *num_feats]

In [9]:
# Missing categorical values become the placeholder level 'GOD' in both frames.
for feat in cat_feats:
    for frame in (train_m3, test_m3):
        frame[feat] = frame[feat].fillna('GOD')

# Missing numeric values are filled with the train median in both frames.
# Filling train with its own median leaves that median unchanged, so it is
# computed once and reused for test.
for feat in num_feats:
    train_median = train_m3[feat].median()
    train_m3[feat] = train_m3[feat].fillna(train_median)
    test_m3[feat] = test_m3[feat].fillna(train_median)

In [10]:
# Cast the categorical features to pandas 'category' dtype.
# NOTE(review): each frame infers its own category levels here, so train and
# test may end up with different level sets.
for frame in (train_m3, test_m3):
    frame[cat_feats] = frame[cat_feats].astype("category")

In [11]:
# Attach the target taken from the month_3 rows of the raw train table.
# The assignment is positional, so first verify that train_m3 has exactly one
# row per month_3 row and that the ids are in the same order; otherwise the
# targets would be silently misaligned or filled with NaN.
month_3_rows = train_df.loc[train_df.date == 'month_3', ["id", "end_cluster"]]
if not np.array_equal(train_m3["id"].to_numpy(), month_3_rows["id"].to_numpy()):
    raise ValueError("train_m3 rows are not aligned with the month_3 rows of train_df")
train_m3["end_cluster"] = month_3_rows["end_cluster"].to_numpy()

  train_m3["end_cluster"] = train_df.loc[train_df.date =='month_3'].reset_index()["end_cluster"]


In [12]:
# Keep only the categorical features plus the target, then drop exact
# duplicate rows (numeric features and 'id' are discarded here).
kept_columns = cat_feats + ["end_cluster"]
train_m3 = train_m3.loc[:, kept_columns].drop_duplicates()

In [13]:
# cat_feats.remove('start_cluster')

In [14]:
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import normalize
# le = LabelEncoder()
# for feat in cat_feats:
#     le.fit(pd.concat([train_m3[feat], test_m3[feat]]))
#     train_m3[feat] = le.transform(train_m3[feat])
#     test_m3[feat] = le.transform(test_m3[feat])

In [15]:
# def feats_filter_nunique(df: pd.DataFrame,
#                          features: list,
#                          unique_threshold: float):
#     cols_to_leave = []
#     for feat in features:
#         if df[feat].nunique() >= 2 and df[feat].astype(str).value_counts(1).values[0] < unique_threshold:
#             cols_to_leave += [feat]
#     return cols_to_leave
# print(len(feats))
# feats = feats_filter_nunique(train_m3, feats, 0.95)
# num_feat = [feat for feat in num_feats if feat in feats]
# cat_feat = [feat for feat in cat_feats if feat in feats]
# len(feats)

# def feats_filter_corr(df: pd.DataFrame,
#                       features: list,
#                       corr_threshold: float):
#     nan_df = df[features].isna().mean(axis=0).reset_index().rename(columns={'index': 'col',
#                                                                             0: 'nan_mean'}).sort_values(by = 'nan_mean', ascending = True)
#     corr_df = train_m3[nan_df.col.values].corr().reset_index()
#     to_drop_by_corr = set()
#     for i, row in corr_df.iterrows():
#         curr_cols = row[i+2:]
#         curr_to_drop = set(curr_cols[curr_cols >= corr_threshold].index)
#         to_drop_by_corr = to_drop_by_corr.union(curr_to_drop)
#     cols_to_leave = [col for col in features if col not in to_drop_by_corr]
#     return cols_to_leave, to_drop_by_corr
# print(len(feats))
# num_feat, droped = feats_filter_corr(train_m3, num_feat, 0.95)
# feats = num_feat + cat_feat
# print(droped)
# len(feats)

# from catboost import CatBoostClassifier
# def feats_filter_random_feat(df: pd.DataFrame,
#                              features: list,
#                              cat_features: list):

#     df['random'] = np.random.uniform(0, 1, train_m3.shape[0])

#     simple_model = CatBoostClassifier(random_state = 42, verbose=False, early_stopping_rounds=50, cat_features=cat_features)
#     simple_model.fit(df[features + ['random']], train_df.loc[train_df.date =='month_3']["end_cluster"])
#     model_importance = pd.DataFrame({'col': features + ['random'],
#                       'importance': simple_model.get_feature_importance()})

#     cols_to_leave = list(model_importance[model_importance['importance'] > \
#                                           model_importance.loc[model_importance['col'] == 'random', 'importance'].values[0]]['col'])

#     return cols_to_leave

# selected_features = feats_filter_random_feat(train_m3, feats, cat_feats)
# selected_features_num = [feat for feat in num_feat if feat in selected_features]
# selected_features_cat = [feat for feat in cat_feat if feat in selected_features]
# len(selected_features)

In [16]:
# Design matrix and target for the auxiliary model that predicts
# 'start_cluster'; the target column itself is dropped from X at fit time.
X = train_m3.loc[:, cat_feats].copy()
y = train_m3["start_cluster"]

X_test = test_m3.loc[:, cat_feats].copy()

In [17]:
# Auxiliary multiclass model: predicts 'start_cluster' from the remaining
# categorical features.
model = LGBMClassifier(
    objective='multiclass',
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)
model.fit(X.drop(columns='start_cluster'), y)

In [18]:
# Overwrite the test 'start_cluster' column with the auxiliary model's
# predictions. `model` must still be the LGBM classifier when this runs.
predicted_start_cluster = model.predict(X_test.drop(columns='start_cluster'))
test_m3['start_cluster'] = predicted_start_cluster

In [19]:
# Per-cluster unnormalised weights used by weighted_roc_auc, keyed by cluster.
weights_table = pd.read_excel("cluster_weights.xlsx")
cluster_weights = weights_table.set_index("cluster")
weights_dict = cluster_weights.loc[:, "unnorm_weight"].to_dict()

In [20]:
# Hold-out validation of the end_cluster model.
# X and y are rebound here: from this cell on they refer to the end_cluster
# task, not to the auxiliary start_cluster task.
X = train_m3[cat_feats].copy()
y = train_m3["end_cluster"]

# 80/20 random split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# `model` is rebound from the LGBM classifier to this CatBoost classifier.
# task_type='GPU' is requested explicitly, so a GPU-enabled CatBoost is needed.
model = CatBoostClassifier(verbose=150, random_state=42, cat_features=cat_feats, task_type='GPU')
model.fit(x_train, y_train)

y_pred_proba = model.predict_proba(x_val)
# The last expression is the cell output: the weighted ROC AUC on the hold-out.
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

Learning rate set to 0.181621
0:	learn: 1.6541596	total: 50.7ms	remaining: 50.7s
150:	learn: 0.9128110	total: 6.88s	remaining: 38.7s
300:	learn: 0.8888888	total: 13.5s	remaining: 31.3s
450:	learn: 0.8673083	total: 20.3s	remaining: 24.7s
600:	learn: 0.8464170	total: 27s	remaining: 17.9s
750:	learn: 0.8281356	total: 33.5s	remaining: 11.1s
900:	learn: 0.8081534	total: 40.5s	remaining: 4.45s
999:	learn: 0.7966253	total: 45s	remaining: 0us


0.8586308750676405

In [21]:
# Load the sample submission file.
sample_submission_df = pd.read_csv("sample_submission.csv")

In [22]:
# Display the (rows, columns) shape of the sample submission.
sample_submission_df.shape

(100000, 18)

In [23]:
# Final model: same configuration as the validation model, fitted on the
# full X / y defined in the validation cell.
model_final = CatBoostClassifier(
    cat_features=cat_feats,
    task_type='GPU',
    random_state=42,
    verbose=150,
)
model_final.fit(X, y)

Learning rate set to 0.19008
0:	learn: 1.6146078	total: 215ms	remaining: 3m 34s
150:	learn: 0.9157127	total: 7.66s	remaining: 43.1s
300:	learn: 0.8957631	total: 15s	remaining: 34.8s
450:	learn: 0.8765635	total: 22.4s	remaining: 27.2s
600:	learn: 0.8585762	total: 30s	remaining: 19.9s
750:	learn: 0.8411154	total: 37.7s	remaining: 12.5s
900:	learn: 0.8230073	total: 46s	remaining: 5.06s
999:	learn: 0.8131872	total: 51.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x29003b48760>

In [24]:
# test_m3['start_cluster'] = test_m3['start_cluster_m2']

In [25]:
# Class probabilities for the test rows from the final model.
test_pred_proba = model_final.predict_proba(test_m3[cat_feats])
# Label the columns with the classes of the model that produced them
# (model_final), not those of the separately fitted validation model.
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=model_final.classes_)
# Reorder the columns into sorted class order.
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [26]:
# Display the (rows, columns) shape of the predicted probabilities.
test_pred_proba_df.shape

(100000, 17)

In [27]:
# Preview the first two rows of predicted class probabilities.
test_pred_proba_df.head(2)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.004693,0.129156,0.020204,0.010316,0.000941,2.2e-05,6.8e-05,4e-06,0.001943,0.002913,0.003948,4.9e-05,0.000641,4.10508e-07,0.000547,0.824434,0.000119
1,0.012453,0.263004,0.00402,0.009359,0.00146,7.8e-05,0.012816,6.9e-05,0.003734,0.018457,0.004012,0.000583,0.002538,5.381956e-08,0.000284,0.667125,8e-06


In [28]:
# Re-read the sample submission so this cell always starts from a clean template.
sample_submission_df = pd.read_csv("sample_submission.csv")

# The assignment below aligns rows by index; a row-count mismatch would
# silently leave NaN in the submission, so fail loudly instead.
if len(sample_submission_df) != len(test_pred_proba_df):
    raise ValueError(
        "sample submission has "
        f"{len(sample_submission_df)} rows but predictions have "
        f"{len(test_pred_proba_df)}"
    )

# NOTE(review): this assumes the sample submission rows are in the same order
# as the rows of test_m3 - TODO confirm.
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv("baseline_submission_123.csv", index=False)