# Training

In [1]:
# Load the ipyslack extension; presumably it provides the %%slack_notify cell magic used below.
# NOTE(review): the setup file path is absolute and user-specific — confirm what it contains
# and consider making the location configurable.
%load_ext ipyslack
%slack_setup /home/sandpiturtle/slack_notif_setup.txt

In [4]:
import numpy as np
import pandas as pd
import hyperopt

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats
from tqdm import tqdm_notebook, tnrange

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from catboost import Pool, CatBoostClassifier, cv, CatboostIpythonWidget

from itertools import combinations

In [5]:
# Cleaned feature tables for train / test
X = pd.read_csv('cleaned/train.csv')
X_test = pd.read_csv('cleaned/test.csv')

# Target labels (the 'status_group' column is encoded further down)
# NOTE(review): rows are presumably aligned with cleaned/train.csv — confirm.
y = pd.read_csv('data/labels.csv')

In [6]:
# Copy the test ids out before the 'id' column is dropped; the submission file needs them
test_ids = np.array(X_test['id'])

In [7]:
# Columns excluded from the model inputs (same list is applied to train and test)
drop = [
    'id',
    'amount_tsh',
    'gps_height',
    'permit',
    'region',
    'lga',
    'latitude',
    'longitude',
    'reduced_funder',
    'reduced_installer',
    'month',
    'rain_season',
    'construction_period',
]

# Rebind the trimmed frames rather than mutating the loaded ones in place
X = X.drop(drop, axis=1)
X_test = X_test.drop(drop, axis=1)

In [8]:
# Encode the string class labels as integers; the fitted encoder is kept so that
# predictions can be mapped back to label names when writing submissions
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y['status_group'])

In [9]:
# Column names that remain after the drop; ctrs_indexes is built positionally against this order
features = X.columns
features.tolist()

['funder',
 'installer',
 'basin',
 'region_code',
 'district_code',
 'ward',
 'population',
 'public_meeting',
 'scheme_management',
 'construction_year',
 'extraction_type',
 'extraction_type_class',
 'management',
 'management_group',
 'payment_type',
 'water_quality',
 'quantity',
 'source',
 'source_class',
 'waterpoint_type',
 'estimated_age']

In [10]:
# Names of columns to be treated as categorical by CatBoost.
# Some of these are in the `drop` list and therefore never match a remaining feature.
ctrs = [
    'region', 'lga', 'reduced_funder', 'reduced_installer',
    'month', 'rain_season', 'construction_period',
    'construction_year', 'funder', 'installer', 'basin',
    'region_code', 'district_code', 'ward', 'public_meeting',
    'scheme_management', 'permit',
    'extraction_type', 'extraction_type_class',
    'management', 'management_group',
    'payment_type', 'water_quality', 'quantity',
    'source', 'source_class', 'waterpoint_type',
]

# Positional indexes (within `features`) of the categorical columns
ctrs_indexes = [
    position
    for position, column in enumerate(features)
    if column in ctrs
]

In [11]:
# Random seed shared by the stratified split and the seeded CatBoost models
seed = 1234

In [None]:
# From here on X and y are plain numpy arrays (row indexing below relies on that)
X, y = np.array(X), np.array(y)

# Single stratified 80/20 split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
train_ind, valid_ind = next(sss.split(X, y))

X_train, X_validation = X[train_ind], X[valid_ind]
y_train, y_validation = y[train_ind], y[valid_ind]

## Naive CatBoost

In [None]:
# Baseline model fitted on the training part and evaluated on the hold-out part
cbc_valid = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    learning_rate=0.1,
    use_best_model=True,
    auto_stop_pval=0.01,
    calc_feature_importance=True,
    random_seed=seed,
    train_dir='train/cbc_valid',
)

cbc_valid.fit(
    X_train,
    y_train,
    cat_features=ctrs_indexes,
    eval_set=(X_validation, y_validation),
)

In [None]:
%%slack_notify {out}
train_score = cbc_valid.score(X_train, y_train)
valid_score = cbc_valid.score(X_validation, y_validation)

print('Train: {:.4f}'.format(train_score))
print('Valid: {:.4f}'.format(valid_score))

## Calculate feature importances

In [None]:
# Read importances from the validation model: it is the one fitted with
# calc_feature_importance=True and it already exists at this point of the notebook
# (`cbc` is only created in the "Make submission" section further down).
importances = cbc_valid.feature_importance_

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%.4f)" % (rank, features[idx], importances[idx]))

In [None]:
# Element-wise lookup so the reordering works whether `importances` is a list or an array
imp_map = np.vectorize(lambda x: importances[x])

plt.rcParams.update({'font.size': 14})
fig, ax = plt.subplots(figsize=(15, 8))

# Horizontal bars, most important feature first (order given by `indices`)
sns.barplot(x=imp_map(indices), y=features[indices], color='r', ax=ax)

# Label the figure so it can be read on its own
ax.set(title='CatBoost feature importances', xlabel='Importance', ylabel='Feature');

## Make submission

In [None]:
# Final model for the submission, fitted on the whole training set.
# Seeded like the validation model so that the submitted predictions are reproducible.
cbc = CatBoostClassifier(
    learning_rate=0.1,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    train_dir='train/cbc_X',
    random_seed=seed
).fit(X, y, cat_features=ctrs_indexes)

In [None]:
%%slack_notify {out}
# Accuracy on the same data the model was fitted on
full_score = cbc.score(X, y)
print('Full: {:.4f}'.format(full_score))

In [None]:
def submit(pred, name='ans'):
    """Decode predicted class indexes and write them as a submission CSV.

    The file is written to ``submissions/<name>.csv`` with the columns
    ``id`` (taken from the module-level ``test_ids``) and ``status_group``
    (label names recovered with the module-level ``y_encoder``).

    Parameters
    ----------
    pred : array-like
        Encoded class predictions, one per test row. May be float-typed
        and/or shaped ``(n, 1)``; it is cast to int and flattened.
    name : str, optional
        Base name of the output file (without extension). Defaults to ``'ans'``.
    """
    import os

    # Flatten before decoding so the encoder always receives a 1-D integer vector
    encoded = np.asarray(pred).astype(int).ravel()
    y_pred = np.asarray(y_encoder.inverse_transform(encoded)).ravel()

    ans = pd.DataFrame({'id': test_ids, 'status_group': y_pred})

    # A fresh checkout may not have the output directory yet
    os.makedirs('submissions', exist_ok=True)
    ans.to_csv('submissions/' + name + '.csv', index=False)

In [None]:
# Write the full-data model's test predictions under the default name (submissions/ans.csv)
submit(cbc.predict(X_test))

Max score: 0.8210

## Model tuning

### GridSearch

In [None]:
def gscv(name, values):
    """Grid-search a single CatBoost hyperparameter on the full training data.

    Uses the module-level ``X``, ``y`` and ``ctrs_indexes``.

    Parameters
    ----------
    name : str
        Name of the ``CatBoostClassifier`` parameter to vary; also used as the
        suffix of the training directory (``train/cbc_<name>``).
    values : list
        Candidate values for that parameter.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    params = {
        name: values
    }

    clf = CatBoostClassifier(
        loss_function='MultiClass',
        eval_metric='Accuracy',
        train_dir='train/cbc_' + name
    )

    gs = GridSearchCV(
        clf,
        param_grid=params,
        n_jobs=-1,
        verbose=1
    )

    # Fit-time arguments go to fit(): the constructor-level `fit_params`
    # argument was deprecated in scikit-learn 0.19 and later removed.
    gs.fit(X, y, cat_features=ctrs_indexes)

    return gs

In [None]:
# Compare three learning rates with the single-parameter grid search helper.
# NOTE(review): the name `best` is rebound to the hyperopt result in the Hyperopt section.
best = gscv('learning_rate', [0.05, 0.1, 0.15])

In [None]:
%%slack_notify {out}
# Report the selected grid point and the search object's best_score_
print('Best param: ', best.best_params_)
print('Best score: ', best.best_score_)

### Hyperopt

In [22]:
def hyperopt_objective(params):
    """Objective for hyperopt: one minus the best cross-validated accuracy.

    Only ``params['l2_leaf_reg']`` is read (cast to int); the learning rate
    is fixed at 0.1. Relies on the module-level ``X``, ``y``, ``ctrs_indexes``
    and ``seed``.

    Parameters
    ----------
    params : dict
        Sampled point of the search space; must contain ``'l2_leaf_reg'``.

    Returns
    -------
    float
        ``1 - max(cv_results["b'Accuracy'_test_avg"])`` — hyperopt minimises,
        so a higher accuracy gives a lower loss.
    """
    estimator = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=0.1,
        loss_function='MultiClass',
        eval_metric='Accuracy',
        use_best_model=True,
        random_seed=seed,
        train_dir='train/cbc_hyper',
    )

    # The estimator is only used as a container for its parameter dict
    cv_params = estimator.get_params()
    train_pool = Pool(X, y, cat_features=ctrs_indexes)
    cv_results = cv(cv_params, train_pool)

    test_accuracy_avg = cv_results["b'Accuracy'_test_avg"]
    return 1 - np.max(test_accuracy_avg)

In [None]:
# Candidate CTR types; not referenced by the active search space below
ctr_descr = ['Borders', 'CounterMax', 'Buckets']

# Search space: only l2_leaf_reg is tuned
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
}

# Collects the history of evaluated points
trials = hyperopt.Trials()

best = hyperopt.fmin(
    fn=hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=10,
    trials=trials,
    verbose=10,
)

In [None]:
%%slack_notify {out}
# Value returned by hyperopt.fmin in the previous cell
print(best)

In [None]:
# Test-set predictions of five models that differ only in random seed / train_dir
predictions = []

for i in tnrange(5):
    clf = CatBoostClassifier(
        loss_function='MultiClass',
        eval_metric='Accuracy',
        learning_rate=0.1,
        random_seed=i,
        train_dir='train/cbc_' + str(i),
    )
    clf.fit(X, y, cat_features=ctrs_indexes)
    predictions.append(clf.predict(X_test))

In [None]:
# Majority vote across the models (axis 0 indexes the models)
stacked_predictions = np.array(predictions)
majority = stats.mode(stacked_predictions, axis=0)[0]
p = np.ravel(majority).astype(int)

# Back to label names, then write the submission next to the notebook
y_pred = y_encoder.inverse_transform(p)
ans = pd.DataFrame({'id': test_ids, 'status_group': y_pred.ravel()})
ans.to_csv('ans.csv', index=False)