<a href="https://colab.research.google.com/github/pqmnyx/hexagon-purpleHack-CLTV/blob/main/cltv_hex_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
!pip freeze | grep "numpy\|pandas\|lightgbm\|scikit-learn"

geopandas==0.13.2
lightgbm==4.1.0
numpy==1.25.2
pandas==1.5.3
pandas-datareader==0.10.0
pandas-gbq==0.19.2
pandas-stubs==1.5.3.230304
scikit-learn==1.2.2
sklearn-pandas==2.2.0


In [5]:
train_df = pd.read_parquet("/content/drive/MyDrive/CLTV/train_data.pqt")
test_df = pd.read_parquet("/content/drive/MyDrive/CLTV/test_data.pqt")

In [6]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

In [7]:
train_df[cat_cols] = train_df[cat_cols].astype("category")
test_df[cat_cols] = test_df[cat_cols].astype("category")

In [8]:
X = train_df.drop(["id", "date", "end_cluster"], axis=1)
y = train_df["end_cluster"]

x_train, x_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

In [9]:
model = LGBMClassifier(verbosity=-1, random_state=42, n_jobs=-1)
model.fit(x_train, y_train)

In [10]:
cluster_weights = pd.read_excel("/content/drive/MyDrive/CLTV/cluster_weights.xlsx").set_index("cluster")
weights_dict = cluster_weights["unnorm_weight"].to_dict()

In [11]:
def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    unnorm_weights = np.array([weights_dict[label] for label in labels])
    weights = unnorm_weights / unnorm_weights.sum()
    classes_roc_auc = roc_auc_score(y_true, y_pred, labels=labels,
                                    multi_class="ovr", average=None)
    return sum(weights * classes_roc_auc)

In [12]:
y_pred_proba = model.predict_proba(x_val)
y_pred_proba.shape

(120000, 17)

In [13]:
weighted_roc_auc(y_val, y_pred_proba, model.classes_, weights_dict)

0.7435520713146397

# **Тест**

In [14]:
test_df.pivot(index="id", columns="date", values="start_cluster").head(8)

date,month_4,month_5,month_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200000,{α},{α},
200001,{α},{α},
200002,{other},{other},
200003,{α},{α},
200004,,{},
200005,{α},{α},
200006,{α},{α},
200007,{α},{α},


In [15]:
for index, row in test_df.iterrows():
    if row.date == 'month_6':
        test_df.at[index, 'start_cluster'] = test_df.at[index-1, 'start_cluster']

In [16]:
test_df.pivot(index="id", columns="date", values="start_cluster").head(6)

date,month_4,month_5,month_6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
200000,{α},{α},{α}
200001,{α},{α},{α}
200002,{other},{other},{other}
200003,{α},{α},{α}
200004,,{},{}
200005,{α},{α},{α}


Заполняем упущенные значения средним

In [17]:
(test_df.std()/test_df.mean()*100).head(45).sort_values()

  (test_df.std()/test_df.mean()*100).head(45).sort_values()
  (test_df.std()/test_df.mean()*100).head(45).sort_values()


sum_of_paym_6m             -24558.933644
sum_deb_f_oper_1m          -24375.943085
ogrn_days_end_month        -22795.242382
ogrn_days_end_quarter      -10698.951618
sum_cred_f_oper_1m          -9009.947911
sum_c_oper_1m               -6666.737394
balance_amt_max             -4448.450220
balance_amt_min             -4237.590651
balance_amt_avg             -3830.367437
balance_amt_day_avg         -3755.952494
sum_a_oper_1m               -3534.323344
cnt_cred_f_oper_1m              0.021212
cnt_deb_f_oper_1m               0.035337
cnt_cred_e_oper_1m              0.056233
cnt_deb_e_oper_1m               0.188138
cnt_deb_d_oper_1m               0.648396
cnt_c_oper_1m                   0.691125
cnt_deb_h_oper_1m               1.568864
cnt_deb_g_oper_1m               2.171343
cnt_a_oper_1m                   3.228715
cnt_days_cred_f_oper_1m         4.316983
cnt_cred_d_oper_1m              4.437694
cnt_b_oper_1m                   4.910742
cnt_cred_g_oper_1m              5.703461
id              

In [18]:
test_filled_df = test_df
test_filled_df["cnt_cred_f_oper_1m"].fillna(value=test_df["cnt_cred_f_oper_1m"].mean(skipna=True))

test_filled_df["cnt_deb_f_oper_1m"].fillna(value=test_df["cnt_deb_f_oper_1m"].mean(skipna=True))

test_filled_df["cnt_cred_e_oper_1m"].fillna(value=test_df["cnt_cred_e_oper_1m"].mean(skipna=True))

test_filled_df["cnt_deb_e_oper_1m"].fillna(value=test_df["cnt_deb_e_oper_1m"].mean(skipna=True))

test_filled_df["cnt_deb_d_oper_1m"].fillna(value=test_df["cnt_deb_d_oper_1m"].mean(skipna=True))

test_filled_df["cnt_c_oper_1m"].fillna(value=test_df["cnt_c_oper_1m"].mean(skipna=True))

0         0.552726
1         0.556064
2         0.557473
3         0.551206
4         0.551206
            ...   
290115    0.551206
290116    0.550798
290117    0.550798
290118    0.550798
290119    0.550798
Name: cnt_c_oper_1m, Length: 290120, dtype: float64

In [19]:
test_filled_df[cat_cols] = test_filled_df[cat_cols].astype("category")

In [20]:
test_filled_df.head(3)


Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,{α}


Test with means

In [21]:
last_m_test_df = test_filled_df[test_filled_df["date"] == "month_6"]
last_m_test_df = last_m_test_df.drop(["id", "date"], axis=1)

In [22]:
test_pred_proba = model.predict_proba(last_m_test_df)
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=model.classes_)
sorted_classes = sorted(test_pred_proba_df.columns.to_list())
test_pred_proba_df = test_pred_proba_df[sorted_classes]

In [23]:
test_pred_proba_df.shape

(100000, 17)

In [24]:
sample_submission_df = pd.read_csv("/content/drive/MyDrive/CLTV/sample_submission.csv")

In [25]:
sample_submission_df.head(3)

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,200001,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,200002,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [26]:
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv("/content/drive/MyDrive/CLTV/cltv-3.csv", index=False)