In [0]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [0]:
from shutil import unpack_archive
unpack_archive('NLP_Datac2476d7.zip')

In [0]:
def write_submission(pred, filename):
  sub = pd.DataFrame()
  test_df = pd.read_csv('NLP_Data/test.csv')
  sub['ID'] = test_df.ID
  sub['overall'] = pred
  sub.to_csv(filename, index=False)

In [0]:
train = pd.read_csv('NLP_Data/train.csv')
test = pd.read_csv('NLP_Data/test.csv')

In [0]:
train.score_1.fillna(4.0, inplace=True)
train.score_2.fillna(5.0, inplace=True)
train.score_3.fillna(5.0, inplace=True)
train.score_4.fillna(5.0, inplace=True)
train.score_5.fillna(4.0, inplace=True)

for col in train.columns[-7:-2]:
  train[col] = np.floor(train[col])
  train[col] = train[col].astype(int)
  test[col] = test[col].astype(int)

In [0]:
train['advice_mgt_avail'] = train.advice_to_mgmt.isna().astype(int)
test['advice_mgt_avail'] = test.advice_to_mgmt.isna().astype(int)

In [0]:
train['loca_avail'] = train.location.isna().astype(int)
test['loca_avail'] = test.location.isna().astype(int)

In [0]:
train_min = train[['Place','status', 'advice_mgt_avail', 'loca_avail', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'score_6']].copy()
test_min = test[['Place','status', 'advice_mgt_avail', 'loca_avail', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'score_6']].copy()

In [0]:
y = train.overall.astype(int)

In [0]:
train_min.dtypes, len(train_min.columns)

(Place               object
 status              object
 advice_mgt_avail     int64
 loca_avail           int64
 score_1              int64
 score_2              int64
 score_3              int64
 score_4              int64
 score_5              int64
 score_6              int64
 dtype: object, 10)

In [0]:
cat_cols = [0, 1, 2, 3, 4, 5, 6, 7, 8]

In [0]:
X_train, X_val, y_train, y_val = train_test_split(train_min, y, stratify=y,
                                                  test_size=0.2,
                                                  random_state=42)

In [0]:
clf = CatBoostClassifier(loss_function='MultiClass', eval_metric='TotalF1', task_type='GPU',
                         use_best_model=True)

In [0]:
clf.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=cat_cols)

In [0]:
clf = CatBoostClassifier(iterations=250, loss_function='MultiClass', eval_metric='TotalF1', task_type='GPU')
clf.fit(train_min, y, cat_features=cat_cols)

In [0]:
pd.DataFrame(clf.feature_importances_, index=clf.feature_names_, columns=['Importance']).sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
score_1,17.541444
score_5,15.500956
score_3,14.185633
score_2,11.946469
score_4,11.238628
Place,9.572715
score_6,7.753991
advice_mgt_avail,6.228728
loca_avail,3.281891
status,2.749547


In [0]:
from sklearn.ensemble import BaggingClassifier

In [0]:
clf_cat = CatBoostClassifier(loss_function='MultiClass', task_type='GPU', verbose=False, **best_params_cat, cat_features=cat_cols)

In [0]:
bag_cat = BaggingClassifier(clf_cat, n_estimators=15)
bag_cat.fit(train_min, y)

BaggingClassifier(base_estimator=<catboost.core.CatBoostClassifier object at 0x7f200c1a1d30>,
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=15, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [0]:
preds = bag_cat.predict(test_min)

In [0]:
write_submission(preds, 'CB_bag15.csv')
#LB 0.39088