In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import precision_recall_curve, f1_score, auc, average_precision_score, roc_auc_score, roc_curve, classification_report
from sklearn.pipeline import Pipeline
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone joblib package and fall back to the vendored
# copy only on old installations where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from lightgbm import LGBMClassifier

%load_ext autoreload
%autoreload 2
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

SEED = 42

In [6]:
# Read the raw table from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='okved_sum_with_private.csv')
df.head(n=5)

Unnamed: 0,hash_inn,okved2,region,0_sum_kt,1_sum_kt,10_sum_kt,11_sum_kt,12_sum_kt,13_sum_kt,14_sum_kt,15_sum_kt,16_sum_kt,17_sum_kt,18_sum_kt,19_sum_kt,2_sum_kt,20_sum_kt,21_sum_kt,22_sum_kt,23_sum_kt,24_sum_kt,25_sum_kt,26_sum_kt,27_sum_kt,28_sum_kt,29_sum_kt,3_sum_kt,30_sum_kt,31_sum_kt,32_sum_kt,33_sum_kt,34_sum_kt,35_sum_kt,36_sum_kt,37_sum_kt,38_sum_kt,39_sum_kt,4_sum_kt,40_sum_kt,41_sum_kt,42_sum_kt,43_sum_kt,44_sum_kt,45_sum_kt,46_sum_kt,47_sum_kt,48_sum_kt,49_sum_kt,5_sum_kt,50_sum_kt,51_sum_kt,52_sum_kt,53_sum_kt,54_sum_kt,55_sum_kt,56_sum_kt,57_sum_kt,58_sum_kt,59_sum_kt,6_sum_kt,60_sum_kt,61_sum_kt,62_sum_kt,63_sum_kt,64_sum_kt,65_sum_kt,66_sum_kt,67_sum_kt,68_sum_kt,69_sum_kt,7_sum_kt,70_sum_kt,71_sum_kt,72_sum_kt,73_sum_kt,74_sum_kt,75_sum_kt,76_sum_kt,77_sum_kt,78_sum_kt,79_sum_kt,8_sum_kt,9_sum_kt,cnt_not_null_kt,total_kt,share_0_kt,share_1_kt,share_10_kt,share_11_kt,share_12_kt,share_13_kt,share_14_kt,share_15_kt,share_16_kt,share_17_kt,share_18_kt,share_19_kt,share_2_kt,share_20_kt,share_21_kt,share_22_kt,share_23_kt,share_24_kt,share_25_kt,share_26_kt,share_27_kt,share_28_kt,share_29_kt,share_3_kt,share_30_kt,share_31_kt,share_32_kt,share_33_kt,share_34_kt,share_35_kt,share_36_kt,share_37_kt,share_38_kt,share_39_kt,share_4_kt,share_40_kt,share_41_kt,share_42_kt,share_43_kt,share_44_kt,share_45_kt,share_46_kt,share_47_kt,share_48_kt,share_49_kt,share_5_kt,share_50_kt,share_51_kt,share_52_kt,share_53_kt,share_54_kt,share_55_kt,share_56_kt,share_57_kt,share_58_kt,share_59_kt,share_6_kt,share_60_kt,share_61_kt,share_62_kt,share_63_kt,share_64_kt,share_65_kt,share_66_kt,share_67_kt,share_68_kt,share_69_kt,share_7_kt,share_70_kt,share_71_kt,share_72_kt,share_73_kt,share_74_kt,share_75_kt,share_76_kt,share_77_kt,share_78_kt,share_79_kt,share_8_kt,share_9_kt,0_sum_dt_dt,1_sum_dt_dt,10_sum_dt_dt,11_sum_dt_dt,12_sum_dt_dt,13_sum_dt_dt,14_sum_dt_dt,15_sum_dt_dt,16_sum_dt_dt,17_sum_dt_dt,18_sum_dt_dt,19_sum_dt_dt,2_sum_dt_dt,20_sum_dt_dt,21_sum_dt_dt,22_sum_dt
_dt,23_sum_dt_dt,24_sum_dt_dt,25_sum_dt_dt,26_sum_dt_dt,27_sum_dt_dt,28_sum_dt_dt,29_sum_dt_dt,3_sum_dt_dt,30_sum_dt_dt,31_sum_dt_dt,32_sum_dt_dt,33_sum_dt_dt,34_sum_dt_dt,35_sum_dt_dt,36_sum_dt_dt,37_sum_dt_dt,38_sum_dt_dt,39_sum_dt_dt,4_sum_dt_dt,40_sum_dt_dt,41_sum_dt_dt,42_sum_dt_dt,43_sum_dt_dt,44_sum_dt_dt,45_sum_dt_dt,46_sum_dt_dt,47_sum_dt_dt,48_sum_dt_dt,49_sum_dt_dt,5_sum_dt_dt,50_sum_dt_dt,51_sum_dt_dt,52_sum_dt_dt,53_sum_dt_dt,54_sum_dt_dt,55_sum_dt_dt,56_sum_dt_dt,57_sum_dt_dt,58_sum_dt_dt,59_sum_dt_dt,6_sum_dt_dt,60_sum_dt_dt,61_sum_dt_dt,62_sum_dt_dt,63_sum_dt_dt,64_sum_dt_dt,65_sum_dt_dt,66_sum_dt_dt,67_sum_dt_dt,68_sum_dt_dt,69_sum_dt_dt,7_sum_dt_dt,70_sum_dt_dt,71_sum_dt_dt,72_sum_dt_dt,73_sum_dt_dt,74_sum_dt_dt,75_sum_dt_dt,76_sum_dt_dt,77_sum_dt_dt,78_sum_dt_dt,79_sum_dt_dt,8_sum_dt_dt,9_sum_dt_dt,cnt_not_null_dt,total_dt,share_0_dt_dt,share_1_dt_dt,share_10_dt_dt,share_11_dt_dt,share_12_dt_dt,share_13_dt_dt,share_14_dt_dt,share_15_dt_dt,share_16_dt_dt,share_17_dt_dt,share_18_dt_dt,share_19_dt_dt,share_2_dt_dt,share_20_dt_dt,share_21_dt_dt,share_22_dt_dt,share_23_dt_dt,share_24_dt_dt,share_25_dt_dt,share_26_dt_dt,share_27_dt_dt,share_28_dt_dt,share_29_dt_dt,share_3_dt_dt,share_30_dt_dt,share_31_dt_dt,share_32_dt_dt,share_33_dt_dt,share_34_dt_dt,share_35_dt_dt,share_36_dt_dt,share_37_dt_dt,share_38_dt_dt,share_39_dt_dt,share_4_dt_dt,share_40_dt_dt,share_41_dt_dt,share_42_dt_dt,share_43_dt_dt,share_44_dt_dt,share_45_dt_dt,share_46_dt_dt,share_47_dt_dt,share_48_dt_dt,share_49_dt_dt,share_5_dt_dt,share_50_dt_dt,share_51_dt_dt,share_52_dt_dt,share_53_dt_dt,share_54_dt_dt,share_55_dt_dt,share_56_dt_dt,share_57_dt_dt,share_58_dt_dt,share_59_dt_dt,share_6_dt_dt,share_60_dt_dt,share_61_dt_dt,share_62_dt_dt,share_63_dt_dt,share_64_dt_dt,share_65_dt_dt,share_66_dt_dt,share_67_dt_dt,share_68_dt_dt,share_69_dt_dt,share_7_dt_dt,share_70_dt_dt,share_71_dt_dt,share_72_dt_dt,share_73_dt_dt,share_74_dt_dt,share_75_dt_dt,share_76_dt_dt,share_77_dt_dt,share_78_dt_dt
,share_79_dt_dt,share_8_dt_dt,share_9_dt_dt
0,61058,34,86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,89.456,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,89.456,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,8311,18,86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,39.96,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,39.96,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,64081,43,86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,67.472,,,,,,,,,,,,,,,,,,10.32,,,,,,,,,,,,,,,134.076,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,211.868,,,,,0.318462,,,,,,,,,,,,,,,,,,0.04871,,,,,,,,,,,,,,,0.632828,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,218005,12,86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,779.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,779.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,159729,34,86,,,,,0.196,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.196,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,193.096,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,193.096,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
# Dimensions of the loaded table as (n_rows, n_columns).
df.shape

(240069, 327)

In [24]:
# Keep only the first 1000 rows (not a random sample). This rebinds `df`, so
# the shape displayed earlier no longer describes the frame used from here on.
# NOTE(review): presumably a speed-up for experimentation - confirm before
# drawing conclusions from the metrics below.
df = df.head(1000)

In [25]:
# Columns that must not be fed to the model as features:
#   'Unnamed: 0' - first column of the CSV; it counts 0, 1, 2, ... in the
#                  preview, so it looks like a saved row index and would only
#                  inject row order into the model.
#   'hash_inn'   - excluded by the original analysis (looks like an entity id).
#   'okved2'     - the prediction target.
#   'region'     - excluded by the original analysis.
non_feature_cols = ['Unnamed: 0', 'hash_inn', 'okved2', 'region']

# errors='ignore' keeps this cell working if the CSV is later re-read with
# index_col=0 and 'Unnamed: 0' is no longer a column.
# Missing sums/shares are replaced with 0.
X = df.drop(columns=non_feature_cols, errors='ignore').fillna(0)

# 1-D target (Series) rather than a one-column DataFrame: scikit-learn expects
# y of shape (n_samples,) and warns about column vectors.
y = df['okved2']

In [26]:
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.33,     # hold out 33% of the rows for evaluation
     random_state=SEED,  # fixed seed so the split is repeatable
 )

In [27]:
# LightGBM classifier with library-default hyperparameters; it is fitted on
# the training split in the next cell.
model = LGBMClassifier()

In [None]:
%%time
model.fit(X_train, y_train)
probs = model.predict_proba(X_test)
y_pred = model.predict(X_test)
probs = probs[:, 1]
auc = roc_auc_score(y_test, probs)
f1 = f1_score(y_test, y_pred)

In [None]:
# y_pred was produced from X_test, so it must be compared with y_test; the
# full y has a different number of rows and makes classification_report raise.
# print() renders the report as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred))

In [29]:
# Out-of-fold predictions for every row of X: a fresh default LGBMClassifier
# is trained per fold (3 folds), using all available CPU cores.
pred = cross_val_predict(
    estimator=LGBMClassifier(),
    X=X,
    y=y,
    cv=3,
    n_jobs=-1,
    verbose=3,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    7.6s finished


In [30]:
# Per-class precision / recall / F1 of the cross-validated predictions,
# measured against the full target vector.
print(classification_report(y_true=y, y_pred=pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         5
           4       0.29      0.27      0.28        49
           5       0.00      0.00      0.00         3
           6       0.19      0.19      0.19        21
           7       0.00      0.00      0.00         5
           8       0.15      0.10      0.12        30
           9       0.10      0.06      0.08        16
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00        20
          12       0.31      0.61      0.41       254
          14       0.29      0.36      0.32        56
          15       0.00      0.00      0.00         4
          16       0.00      0.00      0.00        12
          18       0.00      0.00      0.00        11
          19       0.00      0.00      0.00         3
          20       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
