In [1]:
import os

from catboost import CatBoostClassifier, Pool,sum_models
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

In [2]:
from settings import data_dir, result_dir, log_dir

In [3]:
# Experiment tag; reused below as the CatBoost log sub-directory name.
experiment = "Catboost"

# Read the training and test tables from the configured data directory.
train, test = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ("Train.csv", "Test.csv")
)

In [4]:
# COUNTRY
# Countries seen more than 50 times in the training data keep their own level;
# every other value (including missing ones) is collapsed into "Others".
# The same training-derived list is applied to the test set.
tab_countries = train['country'].value_counts()
dominant_countries = tab_countries[tab_countries > 50].index.values
train['country'] = train['country'].where(train['country'].isin(dominant_countries), "Others")
test['country'] = test['country'].where(test['country'].isin(dominant_countries), "Others")


In [5]:
# All input columns used by the model, in the order they are fed to it.
predictors = [
    'country',
    'age_group',
    'travel_with',
    'total_female',
    'total_male',
    'purpose',
    'main_activity',
    'info_source',
    'tour_arrangement',
    'package_transport_int',
    'package_accomodation',
    'package_food',
    'package_transport_tz',
    'package_sightseeing',
    'package_guided_tour',
    'package_insurance',
    'night_mainland',
    'night_zanzibar',
    'first_trip_tz',
]

# First group of columns handled as categorical.
cat_predictors = [
    'age_group',
    'info_source',
    'tour_arrangement',
    'package_transport_int',
    'package_accomodation',
    'package_food',
    'package_transport_tz',
    'package_sightseeing',
    'package_guided_tour',
    'package_insurance',
    'first_trip_tz',
]

# Second group of columns handled as categorical.
# Note: 'info_source' is listed here as well as in cat_predictors.
embedding_predictors = [
    'country',
    'travel_with',
    'purpose',
    'main_activity',
    'info_source',
]


In [6]:
# Feature matrices: independent copies so later imputation does not touch
# the raw `train` / `test` frames.
X_train = train.loc[:, predictors].copy()
X_test = test.loc[:, predictors].copy()

# Encode the target classes as integers; the fitted encoder is kept so the
# class names can be recovered when predictions are written out.
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(train['cost_category'])

# Identifiers of the test rows.
ids_test = test['Tour_ID']

In [7]:
# Impute missing values column by column. Fill values are always derived from
# the training set and then applied to both the training and the test set:
#   * 'country'                    -> "Others"
#   * other categorical columns    -> most frequent training value
#   * remaining (numeric) columns  -> training median
#
# The filled column is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that form operates on a temporary
# Series, emits a chained-assignment warning on recent pandas, and leaves the
# frame unchanged when Copy-on-Write is enabled.
categorical_cols = set(cat_predictors) | set(embedding_predictors)
for col in X_train.columns:
    if col in categorical_cols:
        if col == 'country':
            fill_value = "Others"
        else:
            # value_counts() sorts by descending frequency and ignores NaN.
            fill_value = X_train[col].value_counts().index[0]
    else:
        fill_value = X_train[col].median()
    X_train[col] = X_train[col].fillna(fill_value)
    X_test[col] = X_test[col].fillna(fill_value)

In [8]:
# CatBoost pool for the test features; built once and scored by every fold model.
pool_test = Pool(
    data=X_test,
    cat_features=cat_predictors + embedding_predictors,
)


In [9]:
# 5-fold cross-validation. For every fold a CatBoost multiclass model is
# trained on four fifths of the data and validated on the remaining fifth.
# Collected per fold:
#   models        - the fitted classifier
#   test_preds    - class probabilities for the test pool
#   pseudo_labels - class probabilities for the fold's validation rows
#   pseudo_ids    - Tour_ID values of those validation rows
models = []
test_preds = []
pseudo_labels = []
pseudo_ids = []

# Names of the columns CatBoost treats as categorical (identical for all pools).
fold_cat_features = cat_predictors + embedding_predictors

# Hyper-parameters shared by every fold model.
fold_params = dict(
    loss_function='MultiClass',
    learning_rate=0.03,
    depth=6,
    n_estimators=1000,
    l2_leaf_reg=12,
    bagging_temperature=1,
    train_dir=os.path.join(log_dir, f'{experiment}'),
    verbose=100,
    task_type="GPU",
    early_stopping_rounds=10,
)

k_folds = KFold(n_splits=5, shuffle=True, random_state=123)
for train_ids, valid_ids in k_folds.split(X_train, y_train):
    # Positional row selection for this fold.
    X_train_fold, y_train_fold = X_train.iloc[train_ids, :], y_train[train_ids]
    X_valid_fold, y_valid_fold = X_train.iloc[valid_ids, :], y_train[valid_ids]

    pool_fold_train = Pool(X_train_fold, y_train_fold, cat_features=fold_cat_features)
    pool_fold_valid = Pool(X_valid_fold, y_valid_fold, cat_features=fold_cat_features)

    # The validation pool is the eval set; use_best_model keeps the iteration
    # that scored best on it.
    model_ = CatBoostClassifier(**fold_params)
    model_.fit(pool_fold_train, eval_set=pool_fold_valid, use_best_model=True)

    models.append(model_)
    pseudo_ids.append(train['Tour_ID'].iloc[valid_ids].values)
    test_preds.append(model_.predict_proba(pool_test))
    pseudo_labels.append(model_.predict(pool_fold_valid, prediction_type="Probability"))


0:	learn: 1.7592956	test: 1.7589224	best: 1.7589224 (0)	total: 18ms	remaining: 17.9s
100:	learn: 1.1251586	test: 1.1253032	best: 1.1253032 (100)	total: 1.06s	remaining: 9.47s
200:	learn: 1.0708640	test: 1.0808922	best: 1.0808922 (200)	total: 2.05s	remaining: 8.14s
300:	learn: 1.0465227	test: 1.0683054	best: 1.0683054 (300)	total: 3.01s	remaining: 7s
400:	learn: 1.0307506	test: 1.0629432	best: 1.0629432 (400)	total: 4.06s	remaining: 6.06s
500:	learn: 1.0176500	test: 1.0600194	best: 1.0600012 (499)	total: 4.95s	remaining: 4.93s
600:	learn: 1.0060907	test: 1.0583061	best: 1.0582780 (595)	total: 6.14s	remaining: 4.07s
bestTest = 1.057990389
bestIteration = 626
Shrink model to first 627 iterations.
0:	learn: 1.7591398	test: 1.7587794	best: 1.7587794 (0)	total: 7.11ms	remaining: 7.1s
100:	learn: 1.1193380	test: 1.1343350	best: 1.1343350 (100)	total: 820ms	remaining: 7.3s
200:	learn: 1.0665217	test: 1.0972780	best: 1.0972780 (200)	total: 1.86s	remaining: 7.4s
300:	learn: 1.0430396	test: 1.084

In [10]:
# One line per fold model: its best learn/validation scores and tree count.
for m in models:
    fold_best_score, fold_tree_count = m.best_score_, m.tree_count_
    print(fold_best_score, " Trees:", fold_tree_count)

{'learn': {'MultiClass': 1.0025658183599027}, 'validation': {'MultiClass': 1.0579903889578268}}  Trees: 627
{'learn': {'MultiClass': 0.9992651870145222}, 'validation': {'MultiClass': 1.0727553575047284}}  Trees: 707
{'learn': {'MultiClass': 1.0071067761102668}, 'validation': {'MultiClass': 1.0956029884237368}}  Trees: 549
{'learn': {'MultiClass': 0.9940780933383991}, 'validation': {'MultiClass': 1.066993678334403}}  Trees: 754
{'learn': {'MultiClass': 1.0060094377955082}, 'validation': {'MultiClass': 1.064327789364361}}  Trees: 667


In [12]:
# --- Out-of-fold probabilities for the training rows -------------------------
# One column per class (named after the encoder's classes) plus the Tour_ID
# of each validation row, stacked in fold order.
pseudo_labels_df = pd.DataFrame(
    np.concatenate(pseudo_labels, axis=0),
    columns=label_enc.classes_,
).assign(Tour_ID=np.concatenate(pseudo_ids, axis=0))

# --- Test predictions: average the class probabilities over the fold models --
test_predictions = np.mean(np.stack(test_preds), axis=0)

# Columns of test_df are the integer class codes; go long, translate the codes
# back to class names, then pivot back to one row per Test_ID.
test_df = pd.DataFrame(test_predictions)
test_df['Test_ID'] = ids_test
test_df_long = test_df.melt(id_vars='Test_ID', var_name='category', value_name='prob')
test_df_long['category'] = label_enc.inverse_transform(test_df_long['category'].astype('int'))
test_df_wide = test_df_long.pivot_table(values='prob', index='Test_ID', columns='category')

# --- Persist both tables -----------------------------------------------------
test_df_wide.to_csv(os.path.join(result_dir, "Catboost baseline.csv"))
pseudo_labels_df.to_csv(os.path.join(result_dir, "Pseudo labels -boosting.csv"), index=False)