In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import lightgbm as lgb

In [2]:
train = pd.read_csv('data/reduce_train.csv')
new_col_names = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train.columns]
train.columns = new_col_names

In [3]:
_target = 'accuracy_group'
_id = 'installation_id'

In [4]:
X_cols = [x for x in train.columns if x not in [_target, _id]]

In [5]:
X = train[X_cols]
y = train[_target]

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [74]:
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [9]:
params = {"objective" : "multiclass",
          "n_estimators": 50000,
          "num_class" : 4,
          "num_leaves" : 128,
          "max_depth": -1,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,  # subsample
          "feature_fraction" : 0.9,  # colsample_bytree
          "bagging_freq" : 2,        # subsample_freq
          "bagging_seed" : 42,
          "verbosity" : -1}

In [10]:
lgb_model = lgb.LGBMClassifier(**params)

In [33]:
lgb_model.fit(X_train,
              y_train,
              eval_set=[(X_test, y_test)],
              verbose=100,
              early_stopping_rounds=20
             )

Training until validation scores don't improve for 20 rounds
[100]	valid_0's multi_logloss: 1.06401
[200]	valid_0's multi_logloss: 1.01501
[300]	valid_0's multi_logloss: 0.99938
[400]	valid_0's multi_logloss: 0.996432
Early stopping, best iteration is:
[389]	valid_0's multi_logloss: 0.996196


LGBMClassifier(bagging_fraction=0.9, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               feature_fraction=0.9, importance_type='split',
               learning_rate=0.01, max_depth=-1, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=50000,
               n_jobs=-1, num_class=4, num_leaves=128, objective='multiclass',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
               verbosity=-1)

In [29]:
y_preds = lgb_model.predict(X_test)

In [31]:
confusion_matrix(y_test, y_preds)

array([[ 819,   61,    6,  372],
       [ 163,  109,   13,  452],
       [  77,   43,   16,  547],
       [ 164,   55,   15, 2395]], dtype=int64)

In [32]:
lgb_model.predict_proba(X_test)

array([[0.08467721, 0.09246488, 0.09029381, 0.7325641 ],
       [0.53891018, 0.09908131, 0.08052134, 0.28148717],
       [0.69066304, 0.09046516, 0.07877207, 0.14009973],
       ...,
       [0.05778397, 0.07632554, 0.07659405, 0.78929645],
       [0.95094789, 0.03337777, 0.007962  , 0.00771234],
       [0.05465113, 0.3003672 , 0.13582663, 0.50915504]])

In [34]:
print('Mejor score con todas las variables:', lgb_model.best_score_['valid_0']['multi_logloss'])

Mejor score con todas las variables: 0.9854644567757107


In [20]:
df_imp = pd.read_csv('importances/20200113_PermutationImportance_RF.csv')
df_imp.feature = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df_imp.feature]

In [21]:
df_imp.head()

Unnamed: 0,feature,Importance
0,session_title,0.021406
1,3393b68b,0.002261
2,accumulated_accuracy_group,0.00147
3,7525289a,0.001394
4,Chest_Sorter__Assessment__3121,0.001319


In [22]:
ft_imp = df_imp[df_imp.Importance > 0]['feature'].tolist()

In [34]:
lgb_model.fit(X_train[ft_imp],
              y_train,
              eval_set=[(X_test[ft_imp], y_test)],
              verbose=100,
              early_stopping_rounds=20
             )

Training until validation scores don't improve for 20 rounds
[100]	valid_0's multi_logloss: 1.06285
[200]	valid_0's multi_logloss: 1.01465
[300]	valid_0's multi_logloss: 0.998728
[400]	valid_0's multi_logloss: 0.995743
Early stopping, best iteration is:
[385]	valid_0's multi_logloss: 0.995391


LGBMClassifier(bagging_fraction=0.9, bagging_freq=5, bagging_seed=42,
               boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               feature_fraction=0.9, importance_type='split',
               learning_rate=0.01, max_depth=-1, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=50000,
               n_jobs=-1, num_class=4, num_leaves=128, objective='multiclass',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
               verbosity=-1)

In [35]:
y_preds = lgb_model.predict(X_test[ft_imp])

In [36]:
confusion_matrix(y_test, y_preds)

array([[ 806,   67,   10,  375],
       [ 167,  110,   16,  444],
       [  72,   42,   17,  552],
       [ 156,   62,   20, 2391]], dtype=int64)

In [37]:
print('Mejor score con las variables mas importantes:', lgb_model.best_score_['valid_0']['multi_logloss'])

Mejor score con las variables mas importantes: 0.9953909891982207


1 contra el resto (transformación a clasificación binaria)

In [28]:
params = {"objective" : "binary",
          "n_estimators": 50000,
          "num_leaves" : 128,
          "max_depth": -1,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,  # subsample
          "feature_fraction" : 0.9,  # colsample_bytree
          "bagging_freq" : 2,        # subsample_freq
          "bagging_seed" : 42,
          "verbosity" : -1}

In [29]:
lgb_model = lgb.LGBMClassifier(**params)

In [83]:
df_sub = pd.DataFrame({_id: train.iloc[X_test.index][_id],
                       _target: train.iloc[X_test.index][_target]})

In [88]:
for i in range(4):
    print('Entrenando con variable clave:', i)
    
    y_train_bin = np.where(y_fit == i, 1, 0)
    y_val_bin = np.where(y_val == i, 1, 0)
    y_test_bin = np.where(y_test == i, 1, 0)

    lgb_model.fit(X_fit[ft_imp],
                  y_train_bin,
                  eval_set=[(X_val[ft_imp], y_val_bin)],
                  verbose=100,
                  early_stopping_rounds=40
                 )

    y_preds = lgb_model.predict(X_test[ft_imp])
    y_preds_proba = lgb_model.predict_proba(X_test[ft_imp])[:,1]
    df_sub['clave_{}'.format(i)] = y_preds_proba

    print('Matriz de confusion:')
    print(confusion_matrix(y_test_bin, y_preds))
    print('')

Entrenando con variable clave: 0
Training until validation scores don't improve for 40 rounds
[100]	valid_0's binary_logloss: 0.421763
[200]	valid_0's binary_logloss: 0.38973
[300]	valid_0's binary_logloss: 0.379803
[400]	valid_0's binary_logloss: 0.37769
Early stopping, best iteration is:
[412]	valid_0's binary_logloss: 0.377633
Matriz de confusion:
[[1286   61]
 [ 208  214]]

Entrenando con variable clave: 1
Training until validation scores don't improve for 40 rounds
[100]	valid_0's binary_logloss: 0.365215
Early stopping, best iteration is:
[139]	valid_0's binary_logloss: 0.363556
Matriz de confusion:
[[1504    2]
 [ 262    1]]

Entrenando con variable clave: 2
Training until validation scores don't improve for 40 rounds
[100]	valid_0's binary_logloss: 0.375313
Early stopping, best iteration is:
[69]	valid_0's binary_logloss: 0.374752
Matriz de confusion:
[[1550    0]
 [ 219    0]]

Entrenando con variable clave: 3
Training until validation scores don't improve for 40 rounds
[100]	

In [89]:
df_sub.head()

Unnamed: 0,installation_id,accuracy_group,clave_0,clave_1,clave_2,clave_3
17553,fc4ad96c,3,0.062137,0.117844,0.11518,0.72615
5659,4c53711a,0,0.369194,0.128821,0.112929,0.333262
5954,4f7942c8,0,0.684356,0.194679,0.09786,0.204033
733,08987c08,0,0.657136,0.09996,0.079618,0.16974
10290,8fabc729,3,0.050192,0.108812,0.132427,0.643717


In [97]:
df_sub['max_pred'] = df_sub[['clave_0', 'clave_1', 'clave_2', 'clave_3']].max(axis=1)

In [98]:
df_sub.head()

Unnamed: 0,installation_id,accuracy_group,clave_0,clave_1,clave_2,clave_3,max_pred
17553,fc4ad96c,3,0.062137,0.117844,0.11518,0.72615,0.72615
5659,4c53711a,0,0.369194,0.128821,0.112929,0.333262,0.369194
5954,4f7942c8,0,0.684356,0.194679,0.09786,0.204033,0.684356
733,08987c08,0,0.657136,0.09996,0.079618,0.16974,0.657136
10290,8fabc729,3,0.050192,0.108812,0.132427,0.643717,0.643717


In [99]:
def return_solution(x, y, z, t, max_pred):
    if x == max_pred:
        return 0
    elif y == max_pred:
        return 1
    elif z == max_pred:
        return 2
    elif t == max_pred:
        return 3
    else:
        return None

In [105]:
df_sub['final_pred'] = df_sub.apply(lambda x: return_solution(x['clave_0'], x['clave_1'], x['clave_2'], x['clave_3'], x['max_pred']), axis=1)

In [106]:
confusion_matrix(y_test, df_sub['final_pred'])

array([[269,  14,   1, 138],
       [ 60,  38,   2, 163],
       [ 15,  13,   1, 190],
       [ 49,  15,   1, 800]], dtype=int64)