References:<br>
* https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense/notebook


# Setup

In [None]:
import time
from datetime import datetime

# Record the wall-clock start time; the last cell subtracts it from the end
# time to report the total notebook run time.
start_time = time.time()

%matplotlib inline

import os, warnings
import numpy as np 
from numpy.random import seed
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
from math import factorial

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
# NOTE(review): `plot_confusion_matrix` is not used anywhere in this notebook and
# no longer exists in scikit-learn >= 1.2, where this import raises ImportError.
# Confirm the target scikit-learn version, or drop that name from the import.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix, precision_score,recall_score, f1_score, classification_report, accuracy_score
from sklearn.cluster import KMeans

# Global seaborn/matplotlib theme for every figure below: white style,
# "notebook" context, "deep" palette and a 10x8 default figure size.
sns.set(style='white', context='notebook', palette='deep', rc={'figure.figsize':(10,8)})
print("loaded ...")

In [None]:
# Reproducibility
def set_seed(sd=13):
    """Seed the random number generators this notebook can reach.

    Parameters
    ----------
    sd : int, default 13
        Seed applied to Python's built-in ``random`` module and to NumPy's
        legacy global generator, and exported through the environment
        variables set below.
    """
    import random  # local import keeps this cell self-contained

    random.seed(sd)
    # `seed` (imported from numpy.random) is the same function as
    # `np.random.seed`, so one call is enough.
    np.random.seed(sd)
    # Only interpreters started after this point (e.g. worker processes) pick
    # this up; the hash seed of the running kernel is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(sd)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Single seed shared by the train/validation split and by every model below.
RandomSeed = 13
set_seed(RandomSeed)

In [None]:
%%time
# Read the three competition files in one pass: training rows, test rows and
# the sample submission that is filled in at the end of the notebook.
train_data, test_data, submit_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-feb-2022/train.csv',
        '/kaggle/input/tabular-playground-series-feb-2022/test.csv',
        '/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv',
    )
)

The target classes are well balanced:

In [None]:
# Bar chart of the number of training rows per `target` value.
ax = sns.countplot(data=train_data, x='target');
# Rotate the x tick labels by 90 degrees and right-align them.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right");

## Preparing sets

In [None]:
def bias_of(s):
    """Return the bias term encoded by a column label.

    The label must look like ``A<w>T<x>G<y>C<z>`` with integer counts.
    The result is ``10! / (w! * x! * y! * z! * 4**10)``.

    Raises
    ------
    ValueError
        If one of the marker letters is missing or a count is not an integer.
    """
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    counts = (
        int(s[1:t_pos]),
        int(s[t_pos + 1:g_pos]),
        int(s[g_pos + 1:c_pos]),
        int(s[c_pos + 1:]),
    )
    # Build the integer denominator first so only one true division happens.
    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)
    return factorial(10) / denominator

In [None]:
# Encode the target labels as integer codes; LE is kept so predictions can be
# mapped back to the original labels later.
LE = LabelEncoder()
TARGET = LE.fit_transform(train_data['target'])
# Feature frames: drop the row identifier (and, for the training set, the label).
TRAIN = train_data.drop(columns=['row_id', 'target'])
TEST = test_data.drop(columns=['row_id'])

Formula taken from: https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense/notebook

In [None]:
%%time
def _to_scaled_ints(frame):
    """Add bias_of(label) to each column, multiply by 1,000,000 and round to int."""
    return pd.DataFrame({
        label: ((frame[label] + bias_of(label)) * 1000000).round().astype(int)
        for label in frame
    })

# NOTE: TRAIN and TEST are overwritten with their transformed versions, so this
# cell must be run exactly once per kernel session.
TRAIN = _to_scaled_ints(TRAIN)
TEST = _to_scaled_ints(TEST)

In [None]:
# Mean of every feature per target label (one row per label).
GROUPED = (
    TRAIN
    .assign(target=train_data.target)
    .groupby("target")
    .mean()
)
GROUPED

In [None]:
# Long format for plotting: one (target, variable, value) row per feature mean.
F = (
    GROUPED
    .melt(value_vars=GROUPED.columns, ignore_index=False)
    .reset_index()
)
F

In [None]:
%%time
def plot(x,y, **kwargs):
    """Draw a seaborn line plot of y against x; extra keyword arguments are accepted but ignored."""
    sns.lineplot(x=x,y=y);
    
# One panel per `target` value (5 panels per row, shared axes); each panel
# plots "value" against "variable" from the long-format frame F.
g = sns.FacetGrid(F, col = 'target', col_wrap=5, sharex=True, sharey=True, height = 6);
g = g.map(plot, "variable", "value");
# Blank out the x tick labels.
g.set(xticklabels=[]);

One example: the feature profile of the first training row.

In [None]:
# Profile of the first training row across every feature column.
# TRAIN holds only feature columns at this point (row_id and target were
# dropped earlier), so the former `:-1` slice silently hid the last feature.
ax = TRAIN.iloc[0].plot(title='First training row')
ax.set(xlabel='feature', ylabel='value');

Greatest common divisor (GCD) of each row's feature values, added as an extra feature

In [None]:
# Row-wise greatest common divisor of the integer feature values.
# A single vectorised ufunc reduction over axis 1 yields the same values as
# calling np.gcd.reduce once per row through DataFrame.apply, without the
# Python-level per-row overhead.
TRAIN['GCD'] = np.gcd.reduce(TRAIN.to_numpy(), axis=1)
TEST['GCD'] = np.gcd.reduce(TEST.to_numpy(), axis=1)
# Duplicate-row removal, left disabled:
# duplicates = TRAIN.duplicated()
# TRAIN = TRAIN[~duplicates]
# TARGET = TARGET[~duplicates]

Clustering (experimental, currently disabled: the cell below is commented out)

In [None]:
# %%time
# kmeans = KMeans(n_clusters = 10, random_state=RandomSeed)
# TRAIN['cluster'] = kmeans.fit_predict(TRAIN)
# TEST['cluster'] = kmeans.fit_predict(TEST)
# scaler = StandardScaler()
# TRAIN_CD = kmeans.fit_transform(TRAIN)
# TRAIN_CD = scaler.fit_transform(TRAIN_CD)
# TRAIN_CD = pd.DataFrame(TRAIN_CD, columns = [f"Centroid_{i}" for i in range(TRAIN_CD.shape[1])])
# TEST_CD = kmeans.fit_transform(TEST)
# TEST_CD = scaler.fit_transform(TEST_CD)
# TEST_CD = pd.DataFrame(TEST_CD, columns = [f"Centroid_{i}" for i in range(TRAIN_CD.shape[1])])
# TRAIN = TRAIN.join([TRAIN_CD])
# TEST = TEST.join([TEST_CD])

In [None]:
# Stratified 75/25 hold-out split. X_test / y_test serve as the validation
# set for the models below (the competition test set is TEST).
X_train, X_test, y_train, y_test = train_test_split(
    TRAIN,
    TARGET,
    test_size=0.25,
    stratify=TARGET,
    random_state=RandomSeed,
)

# Models

## Random Forest

In [None]:
# Hyper-parameters for the final Random Forest fit.
rf_params = {
    'n_jobs':-1,
    'random_state': RandomSeed,
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    # 'sqrt' is what the legacy alias 'auto' meant for forest classifiers;
    # 'auto' itself is rejected by scikit-learn >= 1.3.
    'max_features': 'sqrt',
    'max_samples': None
}

In [None]:
# Search space for the (currently disabled) Random Forest grid search.
rf_grid = {
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf':[1],
    # 'sqrt' replaces the legacy alias 'auto' (same behaviour for classifiers);
    # 'auto' is rejected by scikit-learn >= 1.3.
    'max_features': ['sqrt'],
    'max_samples': [None, 0.9],
    'n_estimators': [300],
}

In [None]:
# %%time
# rf_clf = RandomForestClassifier(**rf_params)
# rf_grid_clf = GridSearchCV(rf_clf, rf_grid, cv=3, scoring= "f1_micro")
# rf_grid_clf.fit(X_train, y_train)
# print(rf_grid_clf.best_estimator_)
# print(rf_grid_clf.best_params_)
# print(rf_grid_clf.best_score_)

In [None]:
# print("Reference score:",0.9639976772695013, 
#       " {'max_depth': None, 'max_features': 'auto', 'max_samples': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300} ","\n")
# rf_scores = pd.DataFrame(rf_grid_clf.cv_results_['params'])
# rf_scores['results'] = rf_grid_clf.cv_results_['mean_test_score']
# rf_scores['std'] = rf_grid_clf.cv_results_['std_test_score']
# rf_scores = rf_scores.sort_values('results', ascending=False)
# rf_scores

In [None]:
%%time
# Fit the forest on the training split, then compare accuracy on the training
# and validation splits; their difference is reported as the overfit gap.
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)
rf_train_score, rf_accuracy = (
    rf_model.score(X_train, y_train),
    rf_model.score(X_test, y_test),
)
print(
    "Train: {:.2f} %".format(rf_train_score * 100),
    "Test: {:.2f} %".format(rf_accuracy*100),
    'Overfit: {:.2f} %'.format((rf_train_score-rf_accuracy)*100),
    sep="\n",
)

In [None]:
%%time
# Predict both splits (the training predictions feed the confusion matrices
# in the next cell) and print per-class metrics for the validation split.
val_pred = rf_model.predict(X_test)
train_pred = rf_model.predict(X_train)
print(classification_report(y_true=y_test, y_pred=val_pred))

In [None]:
# Confusion matrices normalised over the predicted labels, for the training
# and the validation split.
cm_train = confusion_matrix(y_true=y_train, y_pred=train_pred, normalize='pred')
cm = confusion_matrix(y_true=y_test, y_pred=val_pred, normalize='pred')

In [None]:
# Side-by-side confusion matrices: training split on the left, validation
# split on the right, with cells formatted as percentages.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(26,13))
disp_train = ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels= LE.classes_);
disp_train.plot(ax=ax1, values_format='.1%', xticks_rotation='vertical');
disp_train.ax_.set_title('Train set', {'fontsize':20});

disp_test = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels= LE.classes_);
disp_test.plot(ax=ax2, values_format='.1%', xticks_rotation='vertical');
disp_test.ax_.set_title('Validation set',{'fontsize':20});
fig.suptitle('Random Forest Classifier', fontsize=32);

---

---

## Extra Trees

In [None]:
# Hyper-parameters for the final Extra Trees fit.
ET_parameters = {
    # Use all cores, as rf_params does. With a fixed random_state the fitted
    # forest does not depend on the number of jobs.
    'n_jobs': -1,
    'random_state': RandomSeed,
    'bootstrap': True,
    'n_estimators': 300,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 0.8,
}

In [None]:
# Search space for the (currently disabled) Extra Trees grid search;
# every hyper-parameter has a single candidate value.
ex_param_grid = dict(
    max_depth=[None],
    max_features=[0.8],
    n_estimators=[300],
    min_samples_split=[2],
    min_samples_leaf=[1],
    bootstrap=[True],
)

In [None]:
# %%time
# ETC_clf = ExtraTreesClassifier(**ET_parameters)
# etc_gs = GridSearchCV(estimator = ETC_clf, param_grid = ex_param_grid, scoring='f1_micro', n_jobs=-1,verbose = 10, cv=3)
# etc_gs.fit(X_train, y_train)

# print("Best score:", etc_gs.best_score_)
# print("Best params:", etc_gs.best_params_)
# print("Best estimator:", etc_gs.best_estimator_)

In [None]:
# print("best score:", 0.9867933333333333,
#      "{'max_depth': None, 'max_features': 0.8, 'n_estimators': 300} \n")
# etc_scores = pd.DataFrame(etc_gs.cv_results_['params'])
# etc_scores['results'] = etc_gs.cv_results_['mean_test_score']
# etc_scores['std'] = etc_gs.cv_results_['std_test_score']
# etc_scores = etc_scores.sort_values('results', ascending=False)
# etc_scores

In [None]:
%%time
# Fit Extra Trees on the training split, then compare accuracy on the training
# and validation splits; their difference is reported as the overfit gap.
ETC_model = ExtraTreesClassifier(**ET_parameters).fit(X_train, y_train)
ETC_train_score, ETC_accuracy = (
    ETC_model.score(X_train, y_train),
    ETC_model.score(X_test, y_test),
)
print(
    "Train: {:.2f} %".format(ETC_train_score*100),
    "Test: {:.2f} %".format(ETC_accuracy*100),
    'Overfit: {:.2f} %'.format((ETC_train_score-ETC_accuracy)*100),
    sep="\n",
)

In [None]:
%%time
# Predict both splits with Extra Trees (this overwrites the Random Forest
# predictions) and print per-class metrics for the validation split.
val_pred = ETC_model.predict(X_test)
train_pred = ETC_model.predict(X_train)
print(classification_report(y_true=y_test, y_pred=val_pred))

In [None]:
# Confusion matrices normalised over the predicted labels, for the training
# and the validation split.
cm_train = confusion_matrix(y_true=y_train, y_pred=train_pred, normalize='pred')
cm = confusion_matrix(y_true=y_test, y_pred=val_pred, normalize='pred')

In [None]:
# Side-by-side confusion matrices: training split on the left, validation
# split on the right, with cells formatted as percentages.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(26,13))
disp_train = ConfusionMatrixDisplay(confusion_matrix=cm_train, display_labels= LE.classes_);
disp_train.plot(ax=ax1, values_format='.1%', xticks_rotation='vertical');
disp_train.ax_.set_title('Train set', {'fontsize':20});

disp_test = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels= LE.classes_);
disp_test.plot(ax=ax2, values_format='.1%', xticks_rotation='vertical');
disp_test.ax_.set_title('Validation set',{'fontsize':20});
fig.suptitle('Extra Trees Classifier', fontsize=32);

# Predictions

In [None]:
%%time
# Hard predictions of each model on the competition test set, mapped from
# integer codes back to the original target labels.
etc_codes = ETC_model.predict(TEST).astype('int')
etc_predictions = LE.inverse_transform(etc_codes)
rf_codes = rf_model.predict(TEST).astype('int')
rf_predictions = LE.inverse_transform(rf_codes)

## Soft Voting Classifier
Averages the class probabilities predicted by:

* Extra Trees
* Random Forest

In [None]:
# Per-class probabilities from both fitted models on the competition test set.
etc_proba, rf_proba = (
    ETC_model.predict_proba(TEST),
    rf_model.predict_proba(TEST),
)

In [None]:
# Soft vote: element-wise average of the two models' probability matrices.
all_proba_mean = np.mean([rf_proba, etc_proba], axis=0)

In [None]:
# Show floats with three decimals from here on.
pd.set_option('display.float_format', '{:,.3f}'.format)
# Averaged probabilities, one row per submission row_id and one column per label.
# NOTE(review): assumes TEST rows are in the same order as submit_data — confirm.
PROBA = pd.DataFrame(
    data=all_proba_mean,
    columns=LE.classes_,
    index=submit_data.row_id,
)
PROBA.head(10)

In [None]:
# Soft-voting prediction: the label with the highest averaged probability per row.
vc_predictions = LE.inverse_transform(all_proba_mean.argmax(axis=1))

# Compare

In [None]:
# Side-by-side view of the three sets of predictions for each submission row.
compare = submit_data.drop(columns="target").assign(
    ETC=etc_predictions,
    RF=rf_predictions,
    VC=vc_predictions,
)
compare.head(10)

# Submission

In [None]:
# Submit the soft-voting predictions (marked "best" by the author).
# The single-model alternatives are etc_predictions and rf_predictions.
submit_data['target'] = vc_predictions
submit_data.head(10)

In [None]:
# Write the submission file to the working directory, without the index column.
submit_data.to_csv(path_or_buf='submission.csv', index=False)
print("Submission was successfully saved!")

In [None]:
# Report the total wall-clock time since start_time was recorded in the setup cell.
end_time = time.time()
elapsed_minutes = (end_time - start_time)/60
print("Notebook run time: {:.1f} minutes. Finished at {}".format(elapsed_minutes, datetime.now()) )