In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Data Preprocessing

## Data loading and merging

In [None]:
# Load the six competition tables from the Kaggle input directory.
# Each file is a zipped CSV that is handed to pandas as-is.
train = pd.read_csv('/kaggle/input/telstra-recruiting-network/train.csv.zip')
test = pd.read_csv('/kaggle/input/telstra-recruiting-network/test.csv.zip')
feature = pd.read_csv('/kaggle/input/telstra-recruiting-network/log_feature.csv.zip')
event = pd.read_csv('/kaggle/input/telstra-recruiting-network/event_type.csv.zip')
resource = pd.read_csv('/kaggle/input/telstra-recruiting-network/resource_type.csv.zip')
severity = pd.read_csv('/kaggle/input/telstra-recruiting-network/severity_type.csv.zip')

In [None]:
# Report, for every loaded table, the name of its last column and its row count.
df_list = [train, test, feature, event, resource, severity]
for table in df_list:
    last_column = table.columns[-1]
    print(last_column, ':', len(table))

In [None]:
# Training data: chain default (inner) merges of `train` with each lookup table.
# No key is given, so pandas joins on the columns the two frames have in common.
merge_1 = train.merge(feature)
merge_2 = merge_1.merge(event)
merge_3 = merge_2.merge(resource)
merge_4 = merge_3.merge(severity)
print(merge_4.shape)

In [None]:
# Number of missing values per column of the merged training frame.
merge_4.isnull().sum()

In [None]:
# Preview the first five merged training rows.
merge_4.head()

In [None]:
# Keep only the first merged row of every id, then show the resulting shape.
merge_4 = merge_4.drop_duplicates(subset='id')
merge_4.shape

In [None]:
# Use the id column as the index (set_index removes it from the columns).
train = merge_4.set_index('id')
train.head()

In [None]:
# Testing data: same chain of default (inner) merges as for the training data.
merge_5 = test.merge(feature)
merge_6 = merge_5.merge(event)
merge_7 = merge_6.merge(resource)
merge_8 = merge_7.merge(severity)
print(merge_8.shape)

In [None]:
# Number of missing values per column of the merged testing frame.
merge_8.isnull().sum()

In [None]:
# Preview the first five merged testing rows.
merge_8.head()

In [None]:
# Keep only the first merged row of every id, then print the resulting shape.
merge_8 = merge_8.drop_duplicates(subset='id')
print(merge_8.shape)

In [None]:
# Preview the de-duplicated testing rows.
merge_8.head()

In [None]:
# Use the id column as the index (set_index removes it from the columns).
test = merge_8.set_index('id')
test.head()

In [None]:
# Column order check: the next cells select columns by position.
train.head()

In [None]:
# Remove the repeated text prefix from the training set's string columns:
# every value is replaced by its last space-separated token.
# (LightGBM needs these columns as categorical features later on.)
prefixed_train = train.iloc[:, [0, 2, 4, 5, 6]]
removal = prefixed_train.apply(
    lambda column: column.map(lambda value: value.split(' ')[-1])
)
removal.head()

In [None]:
# Show the dtypes of the prefix-stripped training columns.
removal.dtypes

In [None]:
# Put the stripped columns first, followed by original columns 3 and 1
# (column 1 ends up last; later cells read the last column as the target).
untouched_train = train.iloc[:, [3, 1]]
train = pd.concat([removal, untouched_train], axis=1)
train.head(5)

In [None]:
# Remove the repeated text prefix from the testing set's string columns:
# every value is replaced by its last space-separated token.
prefixed_test = test.iloc[:, [0, 1, 3, 4, 5]]
removal = prefixed_test.apply(
    lambda column: column.map(lambda value: value.split(' ')[-1])
)
removal.head()

In [None]:
# Show the dtypes of the prefix-stripped testing columns.
removal.dtypes

In [None]:
# Put the stripped columns first, followed by original column 2.
untouched_test = test.iloc[:, 2]
test = pd.concat([removal, untouched_test], axis=1)
test.head()

## Exploratory data analysis

In [None]:
# Summary of columns, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Column dtypes of the training frame.
train.dtypes

Although location, log_feature and event_type are categorical variables,
they have too many levels to visualize easily.

In [None]:
# Pie chart of the class distribution of the target variable.
import matplotlib.pyplot as plt
severity_counts = train.fault_severity.value_counts()
plt.figure(figsize=(3, 3))
plt.pie(
    x=severity_counts.values,
    labels=severity_counts.index,
    colors=('aliceblue', 'lightsteelblue', 'pink'),
    autopct="%.2f%%",
)
plt.title('fault_severity')
plt.legend()
plt.show()

In [None]:
# Grouped bar charts for the categorical variables with few levels:
# one subplot per feature, bars show record counts split by fault severity.
import seaborn as sns
# Columns 3, 4 and 6 of train; the last one must be fault_severity because the
# crosstab below reads it by name, the other two are the plotted features.
less_level = train.iloc[:,[3,4,6]]
plt.figure(figsize=(15,5))
for count, col in enumerate(less_level.columns[:-1], start=1):
    plt.subplot(1,2,count)
    # Contingency table (severity x level) flattened to long format:
    # columns are [col, 'fault_severity', 0], where 0 holds the counts.
    temp = pd.crosstab(less_level.fault_severity,less_level[col])
    temp1 = temp.T.stack().reset_index()
    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally, and keywords work on older releases as well.
    sns.barplot(x = temp1[col], y = temp1[0], hue = temp1.fault_severity, palette='PuBu')

In [None]:
# Overlaid 3-bin histograms of `volume`, one per fault-severity class.
plt.figure(figsize = (5,5))
severity_labels = ((0, 'NoFault'), (1, 'Several Faults'), (2, 'Serious Faults'))
for severity_level, legend_label in severity_labels:
    class_volume = train.loc[train.fault_severity == severity_level, 'volume']
    sns.distplot(class_volume, kde=False, label=legend_label, bins=3)
plt.legend()

## Missing value detection

In [None]:
# Missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Missing values per column in the testing frame.
test.isnull().sum()

## Outliers detection and removing

In [None]:
# Box plot of the training `volume` values to eyeball outliers.
sns.boxplot( x = train.volume,orient = 'v',palette = 'PuBu')

In [None]:
from scipy import stats
# Flag volumes within three standard deviations of the mean; everything else is
# blanked to NaN so the next cell can impute it.
volume_zscores = stats.zscore(train.volume)
x = np.abs(volume_zscores) < 3
train.volume = np.where(x, train.volume, np.nan)
train.isnull().sum()

In [None]:
# Fill the blanked outliers with the mean of the remaining volumes
# (sum of the non-missing values divided by their count).
volume_fill_value = train.volume.sum() / train.volume.count()
train.volume = train.volume.fillna(volume_fill_value)
train.isnull().sum()

## Skewed variables detection 

In [None]:
# A large value range does not affect tree models, but we still check the
# summary statistics of the training frame.
train.describe()

In [None]:
# Skewness of the numerical columns only. The frame still holds string columns
# here, and how DataFrame.skew treats them depends on the pandas version, so the
# numeric subset is selected explicitly.
train.select_dtypes(include='number').skew()
# the distribution of the input variables does not affect the performance of the tree model.
# no numerical variable is highly skewed (skewness > 10).

## Correlation analysis

In [None]:
# Correlation heatmap of the numerical columns only. The string columns are
# excluded explicitly because DataFrame.corr handles non-numeric columns
# differently across pandas versions.
plt.figure(figsize = (3, 3))
numeric_corr = train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot = True, vmax=1, vmin=-1, cmap='YlGnBu_r')
plt.show()
# multicollinearity does not affect the performance of the boosting tree.

## Transform the data types for lgb

In [None]:
# Column dtypes before the categorical conversion.
train.dtypes

In [None]:
# Convert the first five columns to the pandas `category` dtype.
# Assigning through `train.iloc[:, 0:5] = ...` writes into the existing columns
# in place on recent pandas versions, which leaves their dtype unchanged;
# astype with a column->dtype mapping replaces the columns on every version.
categorical_columns = train.columns[0:5]
train = train.astype({column: 'category' for column in categorical_columns})
train.dtypes

# Data partition

In [None]:
from sklearn.model_selection import train_test_split
# Target is fault_severity; the features are every column except the last one.
y = train.fault_severity
x = train.iloc[:, :-1]
# 80/20 hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=1
)

In [None]:
from collections import Counter
# Class frequencies of the training split, drawn as a pie chart.
count_y = Counter(y_train)
plt.figure(figsize=(3, 3))
plt.pie(
    x=count_y.values(),
    labels=count_y.keys(),
    colors=('lightsteelblue', 'aliceblue', 'pink'),
    autopct="%.2f%%",
)
plt.title('Class for training data')
plt.legend()
plt.show()

# Model building

The parameters below are set explicitly so that the two algorithms start from consistent default values.

In [None]:
%%time
from lightgbm import LGBMClassifier as lgbc
from catboost import CatBoostClassifier as cbc
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import OneHotEncoder

gbm_b = lgbc(objective = 'multiclass', max_depth = 6, reg_lambda = 3.0, random_state = 1)
gbm_b = gbm_b.fit(X = x_train, y = y_train, eval_set = (x_val,y_val), early_stopping_rounds = 50, verbose = 10)
print('\t')
print('bestTest = ',gbm_b.best_score_['valid_0']['multi_logloss'])
print('bestIteration = ',gbm_b.best_iteration_)
print('\t')

cat_b = cbc(objective = 'MultiClass', learning_rate = 0.1, n_estimators = 100, random_state = 1)
cat_b = cat_b.fit(X = x_train, y = y_train, eval_set = (x_val,y_val), 
                 cat_features = np.where(x_train.dtypes != np.float)[0], 
                 early_stopping_rounds = 50, verbose = 10)

ensembles = [gbm_b,cat_b]
TRAIN_ACC = []
VAL_ACC = []
Multi_Logloss = []
for model in ensembles:
    train_acc = model.score(x_train,y_train)
    TRAIN_ACC.append(train_acc)
    
    y_pred = model.predict(x_val)
    
    val_acc = accuracy_score(y_val,y_pred)
    VAL_ACC.append(val_acc)
    
    y_pred = y_pred.reshape(-1,1)
    y_true = np.array(y_val).reshape(-1,1)
    one_hot = OneHotEncoder(sparse = False)
    y_true = one_hot.fit_transform(y_true)
    y_pred = one_hot.fit_transform(y_pred)
    multi_logloss = log_loss(y_true, y_pred)
    Multi_Logloss.append(multi_logloss)
    
ind=['train_acc','val_acc','multi_logloss']
col=['gbm_b','cat_b']
summary=pd.DataFrame(np.vstack((TRAIN_ACC,VAL_ACC,Multi_Logloss)),columns = col,index = ind)
print(summary)

The performance of the two algorithms is similar, hence we tune both models.

## lgb

### Trial 1 - BO

Search all the parameters through Bayesian Optimization.

In [None]:
import lightgbm as lgb
from bayes_opt import BayesianOptimization
train_set = lgb.Dataset(data = x_train, label = y_train)
def lgb_eval(learning_rate, n_estimators, max_depth, num_leaves, min_data_in_leaf, bagging_fraction, bagging_freq, feature_fraction,
             lambda_l1, lambda_l2):
    """Objective for the Bayesian search over all LightGBM parameters.

    The optimizer proposes real-valued parameters; integer-valued ones are
    rounded and the two fractions are capped at 1.0. The parameters are scored
    with 5-fold lgb.cv (early stopping after 50 rounds) on `train_set`.

    Returns the negated minimum mean validation multi-logloss, because the
    optimizer maximizes its target.
    """
    params = {'objective': 'multiclass', 'num_class': 3,
              'seed': 1, 'force_col_wise': True, 'feature_pre_filter': False, 'verbose' : -1,
              'learning_rate': learning_rate,
              # handed to lgb.cv through params; presumably used as the number of
              # boosting rounds - confirm for the installed LightGBM version
              'n_estimators': round(n_estimators),
              'max_depth': round(max_depth),
              'num_leaves': round(num_leaves),
              'min_data_in_leaf': round(min_data_in_leaf),
              'bagging_freq': round(bagging_freq),
              'bagging_fraction': min(bagging_fraction,1.0),
              'feature_fraction': min(feature_fraction,1.0),
              'lambda_l1': lambda_l1,
              'lambda_l2': lambda_l2}
    cv_result = lgb.cv(params, train_set, nfold = 5, early_stopping_rounds = 50, 
                       verbose_eval = 50, eval_train_metric = True)
    return -(min(cv_result['valid multi_logloss-mean']))

# Search bounds per parameter; the duplicated 'max_depth' entry was dropped.
lgb_BO_1 = BayesianOptimization(lgb_eval,     
    {'learning_rate': (0.05,0.2),
     'n_estimators': (10,500),
     'max_depth': (3,8),
     'num_leaves': (7, 255),
     'min_data_in_leaf': (18,22),
     'bagging_fraction':(0.8,1),
     'bagging_freq':(1,5),
     'feature_fraction': (0.8,1),
     'lambda_l1': (0.1,3), 
     'lambda_l2': (0.1,3)
}, random_state = 1)
lgb_BO_1.maximize()
lgb_BO_1.max

In [None]:
# Baseline validation loss minus the best loss found by the Bayesian search.
baseline_loss = gbm_b.best_score_['valid_0']['multi_logloss']
bo_best_loss = abs(lgb_BO_1.max['target'])
print('the multi-logloss improvement:', baseline_loss - bo_best_loss)

### Trial 2 - BO+

Use Bayesian optimization to search for all parameters except learning_rate and n_estimators,
with the learning rate fixed at 0.1 and the number of boosting rounds capped at 5000.

In [None]:
# lr = 0.1, n = 5000
def lgb_eval(max_depth, num_leaves, min_data_in_leaf, bagging_fraction, bagging_freq, feature_fraction,
             lambda_l1, lambda_l2):
    """Objective for the Bayesian search with a fixed learning rate.

    Integer-valued parameters are rounded and the two fractions are capped at
    1.0. Scoring is 5-fold lgb.cv with up to 5000 boosting rounds and early
    stopping after 50 rounds. Returns the negated minimum mean validation
    multi-logloss.
    """
    params = {
        'objective': 'multiclass',
        'num_class': 3,
        'seed': 1,
        'learning_rate': 0.1,
        'force_col_wise': True,
        'feature_pre_filter': False,
        'verbose': -1,
        'max_depth': round(max_depth),
        'num_leaves': round(num_leaves),
        'min_data_in_leaf': round(min_data_in_leaf),
        'bagging_freq': round(bagging_freq),
        'bagging_fraction': min(bagging_fraction, 1.0),
        'feature_fraction': min(feature_fraction, 1.0),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
    }
    cv_result = lgb.cv(
        params, train_set, nfold=5, num_boost_round=5000,
        early_stopping_rounds=50, verbose_eval=50, eval_train_metric=True,
    )
    best_valid_loss = min(cv_result['valid multi_logloss-mean'])
    return -best_valid_loss

# Search bounds per parameter.
lgb_bounds_2 = {
    'max_depth': (3, 8),
    'num_leaves': (7, 255),
    'min_data_in_leaf': (18, 22),
    'bagging_fraction': (0.8, 1),
    'bagging_freq': (1, 5),
    'feature_fraction': (0.8, 1),
    'lambda_l1': (0.1, 3),
    'lambda_l2': (0.1, 3),
}
lgb_BO_2 = BayesianOptimization(lgb_eval, lgb_bounds_2, random_state=1)

lgb_BO_2.maximize()
lgb_BO_2.max

#### learning rate - GSCV

Use GridSearchCV to search the learning rate.

In [None]:
%%time
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
folds = KFold(n_splits= 5, shuffle= True, random_state= 1)
params = {'learning_rate':[0.005,0.01,0.05,0.08,0.1,0.2]}
gbm_lr = lgbc(objective = 'multiclass', random_state = 1, 
              num_leaves = 220, max_depth = 8, min_data_in_leaf = 21, bagging_freq = 5, 
              bagging_fraction = 1.0, feature_fraction = 0.8,
              lambda_l1 = 0.3547216179898903, lambda_l2 = 3.0)
gs_lr = GridSearchCV(gbm_lr, params, scoring = 'neg_log_loss', cv = folds, n_jobs = -1, verbose = 2, return_train_score = True )
gs_lr.fit(x_train,y_train) 
print(gs_lr.best_params_,gs_lr.best_score_)
gs_lr_results = pd.DataFrame(gs_lr.cv_results_)
gs_lr_scores = gs_lr_results[['param_learning_rate','mean_test_score','mean_train_score']]
gs_lr_scores 

#### n_estimators - GSCV

gridCV - for n_estimators with learning_rate = 0.05

In [None]:
%%time
folds = KFold(n_splits = 5, shuffle = True, random_state = 1)
params = {'n_estimators':range(10,201,10)}  
gbm_n = lgbc(objective = 'multiclass', learning_rate = gs_lr.best_params_['learning_rate'],random_state = 1, 
             num_leaves = 220, max_depth = 8, min_data_in_leaf = 21, bagging_freq = 5, 
             bagging_fraction = 1.0, feature_fraction = 0.8,
             lambda_l1 = 0.3547216179898903, lambda_l2 = 3.0)
gs_n = GridSearchCV(gbm_n, params, scoring = 'neg_log_loss', cv = folds, n_jobs = -1, verbose = 2, return_train_score = True )
gs_n.fit(x_train,y_train) 
print(gs_n.best_params_,gs_n.best_score_)
gs_n_results = pd.DataFrame(gs_n.cv_results_)
gs_n_scores = gs_n_results[['param_n_estimators','mean_test_score','mean_train_score']]
gs_n_scores 

Narrow the search range to find a more accurate parameter value.

In [None]:
%%time
folds = KFold(n_splits= 5, shuffle= True, random_state= 1)
params = {'n_estimators':range(70,90,1)}
gbm_n = lgbc(objective = 'multiclass', learning_rate = gs_lr.best_params_['learning_rate'],random_state = 1, 
             num_leaves = 220, max_depth = 8, min_data_in_leaf = 21, bagging_freq = 5, 
             bagging_fraction = 1.0, feature_fraction = 0.8,
             lambda_l1 = 0.3547216179898903, lambda_l2 = 3.0)
gs_n = GridSearchCV(gbm_n, params, scoring = 'neg_log_loss', cv = folds, n_jobs = -1, verbose = 2, return_train_score = True )
gs_n.fit(x_train,y_train) 
print(gs_n.best_params_,gs_n.best_score_)
gs_n_results = pd.DataFrame(gs_n.cv_results_)
gs_n_scores = gs_n_results[['param_n_estimators','mean_test_score','mean_train_score']]
gs_n_scores 

#### n_estimators - CV

Use lgb.cv to search n_estimators with learning_rate = 0.05.

In [None]:
# 5-fold lgb.cv with the grid-searched learning rate; early stopping after 50
# rounds decides how many of the 5000 allowed boosting rounds are kept.
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'seed': 1,
    'force_col_wise': True,
    'feature_pre_filter': False,
    'verbose': -1,
    'learning_rate': gs_lr.best_params_['learning_rate'],
    'num_leaves': 220,
    'max_depth': 8,
    'min_data_in_leaf': 21,
    'bagging_freq': 5,
    'bagging_fraction': 1.0,
    'feature_fraction': 0.8,
    'lambda_l1': 0.3547216179898903,
    'lambda_l2': 3.0,
}
cv_results = lgb.cv(
    params, train_set, nfold=5, num_boost_round=5000,
    early_stopping_rounds=50, verbose_eval=50, eval_train_metric=True,
)
cv_summary = pd.DataFrame(cv_results)
# One row per retained boosting round; the third column is read as the
# validation log-loss of the last retained round.
n_rounds_kept = cv_summary.shape[0]
print('best n_estimators:', n_rounds_kept)
print('best val_logloss score:', cv_summary.iloc[-1,2])

In [None]:
# Train (column 0) and validation (column 2) log-loss per boosting round.
rounds = range(1, cv_summary.shape[0] + 1)
plt.figure(figsize=(20,5))
plt.plot(rounds, cv_summary.iloc[:,0], color='lightsteelblue', label='train-logloss')
plt.plot(rounds, cv_summary.iloc[:,2], color='pink', label='val-logloss')
plt.legend()
plt.show()

In [None]:
# Summarise the LightGBM losses: baseline, full Bayesian search, grid-searched
# n_estimators, and the lgb.cv run.
ind = ['lgb_baseline','lgb_BO','lgb_BO_GSCV','lgb_BO_GSCV_CV']
col = ['multi-logloss']
multi_logloss_lgb = [
    gbm_b.best_score_['valid_0']['multi_logloss'],
    abs(lgb_BO_1.max['target']),
    abs(gs_n.best_score_),
    cv_summary.iloc[-1,2],
]
summary_lgb = pd.DataFrame(multi_logloss_lgb, columns=col, index=ind)
print(summary_lgb)

It seems that searching all the parameters through BO is better.

## cb

### Trial 1

In [None]:
# boosting_type = Ordered, auto_class_weights = Balanced
import catboost as cb
# Every column whose dtype is not float is declared categorical. `float`
# replaces the alias `np.float`, which NumPy 1.24 removed; the comparison result
# is the same because the alias was the builtin float.
train_pool = cb.Pool(data = x_train, label = y_train, cat_features = np.where(x_train.dtypes != float)[0])
def cb_eval(learning_rate, n_estimators, max_depth, reg_lambda):
    """Objective for the CatBoost Bayesian search (ordered boosting, balanced class weights).

    n_estimators and max_depth are rounded to integers. The parameters are scored
    with 5-fold cb.cv on `train_pool`, early stopping after 50 rounds.

    Returns the negated minimum mean test MultiClass loss, because the optimizer
    maximizes its target.
    """
    params = {'objective': 'MultiClass', 
              'boosting_type': 'Ordered', 'auto_class_weights': 'Balanced', 
              'random_state': 1 }
    params['learning_rate'] = learning_rate
    params['n_estimators'] = round(n_estimators)
    params['max_depth'] = round(max_depth)
    params['reg_lambda'] = reg_lambda
    cv_result = cb.cv(pool = train_pool, params = params, nfold = 5, 
                      early_stopping_rounds = 50, verbose = 50)
    return -(min(cv_result['test-MultiClass-mean']))
        
# 5 random initial points followed by 5 guided iterations.
cb_BO_1 = BayesianOptimization(cb_eval,     
                             {'learning_rate': (0.05,0.2),
                              'n_estimators': (10,500),
                              'max_depth': (4,10),
                              'reg_lambda': (0.1,3)}, random_state = 1)
cb_BO_1.maximize(init_points = 5, n_iter = 5)
cb_BO_1.max
# 0.706790767154976

### Trial 2

In [None]:
# boosting_type = 'Plain', auto_class_weights = Balanced
def cb_eval(learning_rate, n_estimators, max_depth, reg_lambda):
    """Objective for the CatBoost Bayesian search (balanced class weights, no boosting_type set).

    n_estimators and max_depth are rounded to integers; scoring is 5-fold cb.cv
    on `train_pool` with early stopping after 50 rounds. Returns the negated
    minimum mean test MultiClass loss.
    """
    params = {
        'objective': 'MultiClass',
        'auto_class_weights': 'Balanced',
        'random_state': 1,
        'learning_rate': learning_rate,
        'n_estimators': round(n_estimators),
        'max_depth': round(max_depth),
        'reg_lambda': reg_lambda,
    }
    cv_result = cb.cv(pool=train_pool, params=params, nfold=5,
                      early_stopping_rounds=50, verbose=50)
    best_test_loss = min(cv_result['test-MultiClass-mean'])
    return -best_test_loss

# Search bounds; 5 random initial points followed by 5 guided iterations.
cb_bounds_2 = {
    'learning_rate': (0.05, 0.2),
    'n_estimators': (10, 500),
    'max_depth': (4, 10),
    'reg_lambda': (0.1, 3),
}
cb_BO_2 = BayesianOptimization(cb_eval, cb_bounds_2, random_state=1)
cb_BO_2.maximize(init_points=5, n_iter=5)
cb_BO_2.max
# 0.7092479497327548

### Trial 3

In [None]:
# boosting_type = Ordered, auto_class_weights = None = 1
def cb_eval(learning_rate, n_estimators, max_depth, reg_lambda):
    """Objective for the CatBoost Bayesian search (ordered boosting, no class weights set).

    n_estimators and max_depth are rounded to integers; scoring is 5-fold cb.cv
    on `train_pool` with early stopping after 50 rounds. Returns the negated
    minimum mean test MultiClass loss.
    """
    params = {
        'objective': 'MultiClass',
        'boosting_type': 'Ordered',
        'random_state': 1,
        'learning_rate': learning_rate,
        'n_estimators': round(n_estimators),
        'max_depth': round(max_depth),
        'reg_lambda': reg_lambda,
    }
    cv_result = cb.cv(pool=train_pool, params=params, nfold=5,
                      early_stopping_rounds=50, verbose=50)
    best_test_loss = min(cv_result['test-MultiClass-mean'])
    return -best_test_loss

# Search bounds; 5 random initial points followed by 5 guided iterations.
cb_bounds_3 = {
    'learning_rate': (0.05, 0.2),
    'n_estimators': (10, 500),
    'max_depth': (4, 10),
    'reg_lambda': (0.1, 3),
}
cb_BO_3 = BayesianOptimization(cb_eval, cb_bounds_3, random_state=1)
cb_BO_3.maximize(init_points=5, n_iter=5)
cb_BO_3.max
# 0.6009923381161436

In [None]:
# Same configuration as Trial 3, with a larger search budget
# (15 initial points + 15 iterations) to be consistent with the LGBM search.
def cb_eval(learning_rate, n_estimators, max_depth, reg_lambda):
    """Objective for the CatBoost Bayesian search (ordered boosting, no class weights set).

    n_estimators and max_depth are rounded to integers; scoring is 5-fold cb.cv
    on `train_pool` with early stopping after 50 rounds. Returns the negated
    minimum mean test MultiClass loss.
    """
    params = {
        'objective': 'MultiClass',
        'boosting_type': 'Ordered',
        'random_state': 1,
        'learning_rate': learning_rate,
        'n_estimators': round(n_estimators),
        'max_depth': round(max_depth),
        'reg_lambda': reg_lambda,
    }
    cv_result = cb.cv(pool=train_pool, params=params, nfold=5,
                      early_stopping_rounds=50, verbose=50)
    best_test_loss = min(cv_result['test-MultiClass-mean'])
    return -best_test_loss

cb_bounds_4 = {
    'learning_rate': (0.05, 0.2),
    'n_estimators': (10, 500),
    'max_depth': (4, 10),
    'reg_lambda': (0.1, 3),
}
cb_BO_4 = BayesianOptimization(cb_eval, cb_bounds_4, random_state=1)
cb_BO_4.maximize(init_points=15, n_iter=15)
cb_BO_4.max

In [None]:
# One table with the loss of every baseline and tuning trial.
ind = ['lgb_baseline','cb_baseline','lgb_BO','lgb_BO_GSCV','lgb_BO_GSCV_CV','cb_BO_1', 'cb_BO_2', 'cb_BO_3', 'cb_BO_4']
col = ['tuned-multi-logloss']
# Baselines: validation losses recorded during fitting.
baseline_losses = [
    gbm_b.best_score_['valid_0']['multi_logloss'],
    cat_b.best_score_['validation']['MultiClass'],
]
# LightGBM tuning: Bayesian search, grid search, lgb.cv.
lgb_losses = [
    abs(lgb_BO_1.max['target']),
    abs(gs_n.best_score_),
    cv_summary.iloc[-1,2],
]
# CatBoost tuning: the four Bayesian searches (targets are negated losses).
cb_losses = [abs(search.max['target']) for search in (cb_BO_1, cb_BO_2, cb_BO_3, cb_BO_4)]
tuned_multi_logloss = baseline_losses + lgb_losses + cb_losses
tuning_records = pd.DataFrame(tuned_multi_logloss, columns=col, index=ind)
print(tuning_records)

From the above table, we can see that the last tuning run has the smallest loss.
Therefore, we substitute its parameters into the final model and then train it.

In [None]:
# Final CatBoost model with hard-coded hyper-parameters
# (presumably the best point of the last Bayesian search - TODO confirm).
cat_f = cbc(objective = 'MultiClass', learning_rate = 0.05716226472904498, n_estimators = 292,
            max_depth = 9, reg_lambda = 1.4979851182199662, random_state = 1)
cat_f = cat_f.fit(X = train_pool, verbose = 10)
train_acc_f = cat_f.score(x_train,y_train)
y_pred = cat_f.predict(x_val)
val_acc_f = accuracy_score(y_val,y_pred)
# One-hot encode truth and predictions against the model's own class list so
# both matrices always have the same columns in the same order. Fitting a
# separate encoder on the predictions breaks when a class is never predicted.
# NOTE(review): the predictions are hard labels, not probabilities, so this is
# not the probability-based multi-class log-loss; predict_proba would give that.
classes = np.asarray(cat_f.classes_)
y_true = (np.asarray(y_val).reshape(-1,1) == classes).astype(float)
y_pred = (np.asarray(y_pred).reshape(-1,1) == classes).astype(float)
multi_logloss_f = log_loss(y_true, y_pred)
print('train_acc_final: ',train_acc_f)
print('val_train_acc_final: ',val_acc_f)
print('multi_logloss_final: ',multi_logloss_f)

In [None]:
# Add the final model's metrics as a new column of the summary table.
cat_f_metrics = [train_acc_f, val_acc_f, multi_logloss_f]
summary['cat_f'] = cat_f_metrics
summary

In [None]:
# Row 2 is the log-loss row; column 1 is the CatBoost baseline, column 2 is cat_f.
loss_cat_b = summary.iloc[2, 1]
loss_cat_f = summary.iloc[2, 2]
print('The loss of the whole model decreases only by', loss_cat_b - loss_cat_f)

In [None]:
# Refit with n_estimators = 300 and let early stopping on the validation pool
# pick the best iteration (use_best_model keeps only the trees up to it).
# Every column whose dtype is not float is declared categorical. `float`
# replaces the alias `np.float`, which NumPy 1.24 removed; the comparison result
# is the same because the alias was the builtin float.
val_pool = cb.Pool(data = x_val, label = y_val, cat_features = np.where(x_val.dtypes != float)[0])
cat_f2 = cbc(objective = 'MultiClass', learning_rate = 0.05716226472904498, n_estimators = 300,
             max_depth = 9, reg_lambda = 1.4979851182199662, random_state = 1)
cat_f2 = cat_f2.fit(X = train_pool, eval_set = val_pool, use_best_model = True,
                    early_stopping_rounds = 50, verbose = 10, plot = True)

# Model evaluation

In [None]:
# Train accuracy, validation accuracy and a log-loss value for the refitted model.
train_acc_f2 = cat_f2.score(x_train,y_train)
y_pred = cat_f2.predict(x_val)
val_acc_f2 = accuracy_score(y_val,y_pred)
# One-hot encode truth and predictions against the model's own class list so
# both matrices always have the same columns in the same order. Fitting a
# separate encoder on the predictions breaks when a class is never predicted.
# NOTE(review): the predictions are hard labels, not probabilities, so this is
# not the probability-based multi-class log-loss; predict_proba would give that.
classes = np.asarray(cat_f2.classes_)
y_true = (np.asarray(y_val).reshape(-1,1) == classes).astype(float)
y_pred = (np.asarray(y_pred).reshape(-1,1) == classes).astype(float)
multi_logloss_f2 = log_loss(y_true, y_pred)
print('train_acc_final2: ',train_acc_f2)
print('val_train_acc_final2: ',val_acc_f2)
print('multi_logloss_final2: ',multi_logloss_f2)

In [None]:
# Add the refitted model's metrics as a new column of the summary table.
cat_f2_metrics = [train_acc_f2, val_acc_f2, multi_logloss_f2]
summary['cat_f2'] = cat_f2_metrics
summary

In [None]:
# Row 2 is the log-loss row; column 1 is the CatBoost baseline, column 3 is cat_f2.
loss_cat_b = summary.iloc[2, 1]
loss_cat_f2 = summary.iloc[2, 3]
print('The loss of the whole model decreases by', loss_cat_b - loss_cat_f2)

In [None]:
# Number of trees in the fitted model.
cat_f2.tree_count_

In [None]:
# Best iteration recorded during fitting on the validation pool.
cat_f2.best_iteration_

In [None]:
# Best metric values recorded during fitting.
cat_f2.best_score_

In [None]:
# List the features from most to least important.
cat_f2_im = cat_f2.feature_importances_
im_ind = np.argsort(cat_f2_im)[::-1]  # feature positions, highest importance first
# Names come from x_train, the frame the model was fitted on, rather than from
# the global `x`, which is also used for the outlier mask earlier in the notebook.
feature_names = x_train.columns
for rank, position in enumerate(im_ind, start=1):
    print(rank, feature_names[position], cat_f2_im[position])

In [None]:
# Bar chart of the feature importances, most important first.
# Names come from x_train, the frame the model was fitted on.
x_columns_ = [x_train.columns[i] for i in im_ind]
bar_positions = np.arange(len(x_columns_))
# All bars are drawn in one call and the tick labels are set once, instead of
# resetting the ticks on every loop iteration.
plt.bar(bar_positions, cat_f2_im[im_ind], color='lightsteelblue', align='center')
plt.xticks(bar_positions, x_columns_, rotation=90, fontsize=11)
plt.ylabel('importance')
plt.show()

# Model prediction

In [None]:
# Class probabilities for every row of the testing set.
y_pred = cat_f2.predict_proba(test)
y_pred

In [None]:
# Build the submission file: the id column followed by one probability column per class.
probability_columns = ['predict_0', 'predict_1', 'predict_2']
submission = pd.DataFrame(y_pred, columns=probability_columns)
# The ids live in the index of `test`; reset_index turns them back into a column.
re_test = test.reset_index()
submission = pd.concat([re_test['id'], submission], axis=1)
submission.to_csv('submission.csv', index=False)
submission.head()