# Training

In [None]:
# Load the ipyslack IPython extension (presumably the source of the
# %slack_setup line magic below and the %%slack_notify cell magic used later).
%load_ext ipyslack
# NOTE(review): absolute, user-specific path -- this cell only runs on the
# original author's machine. The file presumably holds the Slack notification
# settings; confirm, and keep it out of version control if it contains a token.
%slack_setup /Users/nik/slack_notif_setup.txt

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats
from tqdm import tqdm_notebook, tnrange

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier

from catboost import Pool, CatBoostClassifier, cv, CatboostIpythonWidget

In [None]:
# Read the cleaned training features, the training labels and the cleaned test
# features (in that order) from the data/ directory next to the notebook.
X, y, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/cleaned_train.csv',
        'data/labels.csv',
        'data/cleaned_test.csv',
    )
)

In [None]:
# Keep a standalone copy of the test-set ids: the 'id' column is removed from
# X_test by the column-drop cell, but the submission files still need it.
test_ids = X_test['id'].values.copy()

In [None]:
# Columns removed from both the training and the test frame before modelling.
drop = [
    'id',
    'lga',
    'latitude',
    'longitude',
    # 'district_code',  (entry disabled, so this column is currently kept)
    'rain_season',
    'days_since_the_beginning',
    'construction_year',
]

# Drop in place so the same columns disappear from train and test alike.
# Note: the cell is not re-runnable, a second run raises because the columns
# are already gone.
X.drop(labels=drop, axis='columns', inplace=True)
X_test.drop(labels=drop, axis='columns', inplace=True)

In [None]:
# Turn the status_group class labels into integer codes. The fitted encoder is
# kept around so predictions can be mapped back to label names when the
# submission files are written.
# NOTE(review): assumes the rows of labels.csv are in the same order as the
# rows of cleaned_train.csv (no join on id is performed) -- confirm.
status_labels = y['status_group']
y_encoder = LabelEncoder().fit(status_labels)
y = y_encoder.transform(status_labels)

In [None]:
# Remember the column labels of the training frame. The train/validation split
# cell later rebinds X to a plain NumPy array, so this Index is what the
# feature-importance reports and plots use to turn positions back into names.
features = X.columns
#features.tolist()

In [None]:
# Positional indices of the columns handed to CatBoost as categorical: every
# column with fewer than 200 distinct values, regardless of its dtype.
unique_counts = X.apply(pd.Series.nunique)
cat_feat = [pos for pos, n_unique in enumerate(unique_counts) if n_unique < 200]
# features[cat_feat].tolist() shows the corresponding column names.

In [None]:
#w = CatboostIpythonWidget('train/')
#w.update_widget()

In [None]:
# Switch to plain NumPy arrays so rows can be selected with the integer index
# arrays produced by the splitter. From here on X has no column labels; use
# `features` to map positions back to names.
X, y = np.array(X), np.array(y)

# One stratified 70/30 train/validation split. The fixed random_state keeps the
# hold-out set, and therefore the reported scores, identical across
# Restart & Run All instead of changing on every execution.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so the generator yields exactly one (train, validation) pair.
train_ind, valid_ind = next(sss.split(X, y))
X_train, y_train = X[train_ind], y[train_ind]
X_validation, y_validation = X[valid_ind], y[valid_ind]

In [None]:
# Single multiclass CatBoost model, trained on the training split and monitored
# on the validation split passed as eval_set. Training artefacts are written
# to train/cbc_single.
cbc = CatBoostClassifier(
    iterations=2000,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    use_best_model=True,
    auto_stop_pval=0.01,
    calc_feature_importance=True,
    verbose=True,
    train_dir='train/cbc_single',
)
cbc.fit(
    X_train,
    y_train,
    cat_features=cat_feat,
    eval_set=(X_validation, y_validation),
);

In [None]:
%%slack_notify {out}
print('Train: {:.4f}'.format(
    cbc.score(X_train, y_train)
))
print('Valid: {:.4f}'.format(
    cbc.score(X_validation, y_validation)
))

In [None]:
# Importances reported by the fitted CatBoost model, plus the feature positions
# ordered from most to least important.
importances = cbc.feature_importance_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for rank, feat_idx in enumerate(indices, start=1):
    # `features` still holds the column names that X lost when it became an array.
    print("%d. %s (%.4f)" % (rank, features[feat_idx], importances[feat_idx]))

In [None]:
# Bar chart of the CatBoost importances, most important feature first.
# Note: the rcParams change is global and also affects later figures.
plt.rcParams.update({'font.size': 14})
fig, ax = plt.subplots(figsize=(15, 8))

# Reorder with plain fancy indexing instead of a vectorized per-element lambda;
# np.asarray also covers the case where the importances arrive as a list.
sorted_importances = np.asarray(importances)[indices]

sns.barplot(x=sorted_importances, y=features[indices], color='r', ax=ax)
# A figure should be readable on its own when the notebook is skimmed.
ax.set_title('CatBoost feature importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature');

In [None]:
# Predict the test set with the single CatBoost model and write a submission.
# The prediction may come back as an (n_samples, 1) column, so it is flattened
# to a 1-D integer array *before* decoding, the same way the ensemble cell does.
# LabelEncoder then never sees a column vector (no conversion warning, no
# version-dependent output shape).
class_ids = np.asarray(cbc.predict(X_test)).ravel().astype(int)
y_pred = y_encoder.inverse_transform(class_ids)

ans = pd.DataFrame({'id': test_ids, 'status_group': y_pred})
# NOTE(review): the random-forest cell writes to this same path and will
# overwrite this file when the notebook is run top to bottom.
ans.to_csv('submissions/ans.csv', index=False)

In [None]:
# Integer-encode the categorical columns so the random forest can consume them.
#
# X was rebound to a NumPy array by the train/validation split cell, so its
# columns have to be addressed by position (X[col] with a string label raises
# IndexError). X_test is still a DataFrame and is addressed by label.
for col_pos in cat_feat:
    col = features[col_pos]
    # Work on object arrays on both sides so train and test values are encoded
    # through the same code path whatever dtype each container ended up with.
    train_values = X[:, col_pos].astype(object)
    test_values = X_test[col].values.astype(object)

    e = LabelEncoder()
    # Fit on train + test together: a category that only occurs in the test set
    # would otherwise make transform() raise on unseen labels.
    e.fit(np.concatenate([train_values, test_values]))

    X[:, col_pos] = e.transform(train_values)
    X_test[col] = e.transform(test_values)

In [None]:
# Random forest baseline fitted on the full training set (no hold-out here),
# using all available cores and logging progress verbosely.
forest = RandomForestClassifier(n_estimators=1200, n_jobs=-1, verbose=10)
forest.fit(X, y);

In [None]:
# Forest-level importances, their spread across the individual trees (used as
# error bars), and the feature positions ordered from most to least important.
importances = forest.feature_importances_
per_tree_importances = np.array(
    [tree.feature_importances_ for tree in forest.estimators_]
)
std = per_tree_importances.std(axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for rank, feat_idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, features[feat_idx], importances[feat_idx]))

# Plot the feature importances of the forest
n_features = X.shape[1]
bar_positions = range(n_features)
# Maps feature positions to their column names for the tick labels.
mapper = np.vectorize(lambda x: features[x])

plt.figure()
plt.title("Feature importances")
plt.bar(
    bar_positions,
    importances[indices],
    color="r",
    yerr=std[indices],
    align="center",
)
plt.xticks(bar_positions, mapper(indices), rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Predict the test set with the forest, decode the integer classes back to
# label names and write the submission file.
p = forest.predict(X_test)
y_pred = y_encoder.inverse_transform(p)

ans = pd.DataFrame({'id': test_ids})
ans['status_group'] = y_pred.ravel()
# NOTE(review): same path as the single-CatBoost submission, which this
# overwrites when the notebook is run top to bottom.
ans.to_csv('submissions/ans.csv', index=False)

In [None]:
# Train five CatBoost models that differ only in their random seed, each on the
# full training set, and collect their test-set predictions for voting.
predictions = []

for seed in tnrange(5):
    clf = CatBoostClassifier(
        loss_function='MultiClass',
        random_seed=seed,
        # One training directory per seed so the runs do not overwrite each other.
        train_dir='train/cbc' + str(seed),
    )
    fitted = clf.fit(X, y, cat_features=cat_feat)
    predictions.append(fitted.predict(X_test))

In [None]:
# Majority vote across the seed models: stack the per-model predictions and take
# the most frequent class for every test row (axis 0 runs over the models).
stacked_predictions = np.array(predictions)
majority_vote = stats.mode(stacked_predictions, axis=0)[0]
p = majority_vote.ravel().astype(int)

y_pred = y_encoder.inverse_transform(p)
ans = pd.DataFrame({'id': test_ids})
ans['status_group'] = y_pred.ravel()
# NOTE(review): written to the working directory, unlike the other two
# submissions, which go to submissions/ -- confirm this is intended.
ans.to_csv('ans.csv', index=False)