# 1. Load libraries and data

In [None]:
#algebra
import pandas as pd
pd.options.display.float_format = '{:,.12f}'.format
#I want to see all features from the dataset given. But be careful, sometimes the output can be too large!
pd.options.display.max_rows = None 
pd.set_option('max_colwidth', 260)
import numpy as np
from math import factorial

#data preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder

from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import VarianceThreshold

#models
import optuna
from xgboost.sklearn import XGBClassifier

#metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

#visual
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import ticker as tkr
import plotly.express as px
from textwrap import wrap

In [None]:
# Load the competition files. For train and test the first CSV column is used
# as the index (it comes back as the 'row_id' column after reset_index later on).
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv', index_col = 0)
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv', index_col = 0)
# The sample submission keeps its default integer index.
submission = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

### 1.1. Constants

In [None]:
#epochs = 60 #last layer's config required 60 epochs
#batch_size = 2048
# Seed passed to the PCA and to the KFold splitter further down.
random_state = 42
#ntrain = train.shape[0]
#ntest = test.shape[0]
# Number of KFold splits (and therefore of ensemble members) in section 4.2.
folds = 11

# 2. EDA

In [None]:
# DataFrame.shape is (rows, columns), so it can be unpacked straight into the message
print('The train data has {} rows and {} columns'.format(*train.shape))
print('The test data has {} rows and {} columns'.format(*test.shape))

In [None]:
train.info()

In [None]:
test.info()

In [None]:
train.head()

### 2.1. Missing values

In [None]:
# NaN count per column (largest first) side by side with the column dtype;
# only columns that actually contain missing values are displayed.
na_counts_train = train.isna().sum().sort_values(ascending = False)
missing_train = pd.concat([na_counts_train, train.dtypes], axis = 1, keys = ['Total', 'Type'])
missing_train.loc[missing_train['Total'] > 0]

In [None]:
# Same missing-value summary as for train, this time for the test frame.
na_counts_test = test.isna().sum().sort_values(ascending = False)
missing_test = pd.concat([na_counts_test, test.dtypes], axis = 1, keys = ['Total', 'Type'])
missing_test.loc[missing_test['Total'] > 0]

### 2.2. Check if target variable is balanced or not

In [None]:
# Class frequencies of the target, drawn on an explicitly created axes
fig, axes = plt.subplots(figsize = (15, 10))
sns.histplot(data = train, x = 'target', fill = True, ax = axes)
plt.setp(axes.get_xticklabels(), rotation = 70)
plt.show()

In [None]:
train['target'].value_counts()

### 2.3. Check duplicates in train
From https://www.kaggle.com/sfktrkl/tps-feb-2022

In [None]:
# Number of rows that repeat an earlier row, for each frame
duplicates_train = train.duplicated().sum()
duplicates_test = test.duplicated().sum()

print('Duplicates in train data: {0}'.format(duplicates_train))
print('Duplicates in test data: {0}'.format(duplicates_test))

In [None]:
# Keep only the first occurrence of every duplicated row. The frame is re-bound
# instead of being mutated with inplace=True, so the cell has no hidden in-place effect.
train = train.drop_duplicates(keep = 'first')

# Recount after the drop; leaving the value as the last expression displays it,
# so the reader can see that no duplicates remain.
duplicates_train = train.duplicated().sum()
duplicates_train

In [None]:
# Move the index back into a regular column and renumber the rows 0..n-1.
# Later cells rely on this: they reference a 'row_id' column and combine
# train / test with train_i / test_i by position.
# NOTE(review): not idempotent - running this cell twice adds an extra 'index' column.
train = train.reset_index()
test = test.reset_index()

### 2.4. Feature engineering
From https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense

In [None]:
# Feature columns: everything in train except the identifier and the label
elements = [col for col in train.columns if col not in ('row_id', 'target')]

def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10).

    When ``w + x + y + z == 10`` this equals the multinomial probability of
    drawing that composition of four equally likely symbols in ten draws.

    Args:
        w, x, y, z: Non-negative integer counts.

    Returns:
        The bias value as a float.
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)

def bias_of(s):
    """Return the bias for a column name shaped like 'A3T2G4C1'.

    The name is parsed as <one leading character><w>T<x>G<y>C<z>; the four
    integer counts are then passed to ``bias`` so the formula lives in exactly
    one place.

    Args:
        s: Column name containing the markers 'T', 'G' and 'C' in that order.

    Returns:
        ``bias(w, x, y, z)`` for the counts encoded in the name.

    Raises:
        ValueError: If a marker is missing or a count is not an integer.
    """
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    w = int(s[1:t_pos])           # count following the first character
    x = int(s[t_pos + 1:g_pos])   # count between 'T' and 'G'
    y = int(s[g_pos + 1:c_pos])   # count between 'G' and 'C'
    z = int(s[c_pos + 1:])        # count after 'C'
    return bias(w, x, y, z)

def _to_integer_counts(frame):
    """Add the per-column bias back, scale by 1,000,000 and round to integers."""
    scaled = {}
    for col in elements:
        scaled[col] = ((frame[col] + bias_of(col)) * 1000000).round().astype(int)
    return pd.DataFrame(scaled)

train_i = _to_integer_counts(train)
test_i = _to_integer_counts(test)

In [None]:
#Color scheme picked from https://www.kaggle.com/usharengaraju/tensorflow-decision-forests-w-b
# One row per column with its summary statistics, highlighted for quick scanning
summary_stats = train_i.describe().T
(
    summary_stats.style
    .bar(subset = ['mean'], color = "#e9c46a")
    .background_gradient(subset = ['std'], cmap = 'Reds')
    .background_gradient(subset = ['50%'], cmap = 'Pastel1')
)

There is definitely a pattern in the feature means; I should investigate it later.

#### Add greatest common divisor columns

In [None]:
# Row-wise greatest common divisor over the integer-scaled feature columns.
# It is computed from train_i / test_i but stored on train / test, which works
# because both pairs of frames share the same 0..n-1 row order at this point.
train['gcd'] = np.gcd.reduce(train_i[elements], axis = 1)
test['gcd'] = np.gcd.reduce(test_i[elements], axis = 1)

#### Encode bacteria:

In [None]:
# Map every bacteria name in `target` to an integer class id (0 .. n_classes - 1)
le = LabelEncoder()
le.fit(train['target'])
train['target_num'] = le.transform(train['target'])

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train_i.head()

In [None]:
test_i.head()

In [None]:
# Carry the identifier (and, for train, the label) over to the integer-scaled frames.
# The assignments align on the shared 0..n-1 index of train/train_i and test/test_i.
train_i['row_id'] = train['row_id']
test_i['row_id'] = test['row_id']
train_i['target'] = train['target']

In [None]:
# Copy the LabelEncoder output next to the string label in train_i.
train_i['target_num'] = train['target_num']

#### And the picture:

In [None]:
# One PCA scatter plot per distinct gcd value, coloured by the encoded target.
for scale in np.sort(train['gcd'].unique()):
    # Build the row mask once per gcd value instead of recomputing it for
    # every indexing operation below.
    in_scale = train['gcd'] == scale
    features_at_scale = train_i.loc[in_scale, elements]

    # Fit the PCA on the training rows of this gcd group only
    pca = PCA(whiten = True, random_state = random_state)
    pca.fit(features_at_scale)

    # Project the same rows onto the principal components.
    # The projection of the matching test rows was dropped: its result was never
    # used and it raised whenever a gcd value had no rows in the test set.
    Xt_tr = pca.transform(features_at_scale)

    # Scatter plot of the first two components, coloured by classification target
    plt.figure(figsize = (15,15))
    plt.scatter(Xt_tr[:,0], Xt_tr[:,1], c = train.loc[in_scale, 'target_num'], s = 1)
    plt.title(f"{1000000 // scale} decamers ({in_scale.sum()} samples with gcd = {scale})")
    plt.show()

In [None]:
# Row-wise max and standard deviation over the original feature columns only.
# Selecting `elements` explicitly (instead of dropping a list of non-feature
# columns) fixes two problems of the drop-based version:
#   * 'std' was computed after 'max' had been appended, so the freshly created
#     'max' column leaked into the standard deviation;
#   * re-running the cell folded the previous 'max'/'std' values into the new ones.
train_i['max'] = train_i[elements].max(axis = 1)
train_i['std'] = train_i[elements].std(axis = 1)
test_i['max'] = test_i[elements].max(axis = 1)
test_i['std'] = test_i[elements].std(axis = 1)

### 2.4.1. Additional - difference between Escherichia fergusonii and Escherichia coli

In [None]:
# Rows of the integer-scaled training frame for each of the two Escherichia species
is_fergusonii = train_i['target'] == 'Escherichia_fergusonii'
is_coli = train_i['target'] == 'Escherichia_coli'
fergusonii = train_i.loc[is_fergusonii]
coli = train_i.loc[is_coli]

### 2.5. Data distribution

In [None]:
# Train-vs-test density of every feature, one small panel per feature.
# The grid is sized from `elements` rather than hard-coded to 69 x 4: the fixed
# grid only covered the first 276 columns by position, so any further feature
# columns were silently left out.
ncols = 4
nrows = int(np.ceil(len(elements) / ncols))
fig, axes = plt.subplots(nrows, ncols, figsize = (25, 2.5 * nrows))
axes = axes.flatten()
labels = ['Train', 'Test']
for ax, col in zip(axes, elements):
    sns.kdeplot(data = train_i, x = col, fill = True, ax = ax, color = '#5047ff', label = labels[0])
    sns.kdeplot(data = test_i, x = col, fill = True, ax = ax, color = '#ffa647', label = labels[1])
    # Strip ticks, axis labels and side spines: only the shape of the curves matters
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.set_title('\n'.join(wrap(col)),
                 loc = 'center', weight = 'bold', fontsize = 10, wrap = True)

# Hide the panels left over when the feature count is not a multiple of ncols
for ax in axes[len(elements):]:
    ax.set_visible(False)

fig.legend(loc = 'upper center', ncol = 2, borderaxespad = 0., labels = labels)
fig.tight_layout()
plt.show()

### 2.6. Code for pairplots

In [None]:
# 10% random sample for the pairplots. The draw is seeded with the notebook-wide
# random_state so the same rows (and therefore the same plots) come back after
# a kernel restart.
train_i_sample = train_i.sample(frac = 0.1, random_state = random_state)
train_i_sample.shape

In [None]:
train_i_sample['target'].value_counts()

#### It's too expensive to make a pairplot for all 287 columns at once, so I split them into small groups and look at each pair to find some insights.

In [None]:
# First window of `step` columns, plotted against each other and coloured by target
start = 0
step = 2
end = start + step
pair_columns = train_i_sample.columns[start:end].tolist() + ['target']
sns.pairplot(train_i_sample[pair_columns],
             hue = 'target',
             height = 5,
             aspect = 1)

In [None]:
# Slide the column window forward by `step` and plot the next group of columns.
# NOTE(review): this cell is stateful - every execution advances start/end, so
# re-running it shows the following window rather than repeating the same plot.
start = end
end += step
sns.pairplot(train_i_sample[train_i_sample.iloc[:, start:end].columns.tolist() + ['target']], hue = 'target',
             height = 5,
             aspect = 1)

# 3. Tuning hyperparameters

In [None]:
def objective(trial):
    """Optuna objective: hold-out accuracy of an XGBoost classifier.

    Splits ``train_i`` 85/15, fits an ``XGBClassifier`` with hyperparameters
    sampled from ``trial`` and scores it on the validation part.

    Args:
        trial: Optuna trial used to sample subsample, colsample_bytree, eta,
            alpha and max_depth.

    Returns:
        ``accuracy_score`` of the model's predictions on the validation split.
    """
    # NOTE(review): 'row_id' is not dropped, so it is used as a feature here
    # (the same holds for the final models) - confirm this is intended.
    X = train_i.drop(columns = ['target', 'target_num'])
    Y = train_i['target']
    # Seeded with the notebook-wide constant so every trial sees the same split
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X, Y, train_size = 0.85, random_state = random_state)

    param = {
        'subsample': trial.suggest_float('subsample', 0.95, 0.999),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.6),
        'eta': trial.suggest_float('eta', 0.35, 0.45),
        'alpha': trial.suggest_float('alpha', 0.01, 0.99),
        'max_depth': trial.suggest_int('max_depth', 4, 12)
    }

    model = XGBClassifier(**param,
                          random_state = random_state,
                          objective = "multi:softprob",
                          nthread = -1,
                          eval_metric = 'auc', #try merror, mlogloss or auc
                          tree_method = 'gpu_hist')

    # Early stopping watches the eval_metric ('auc') on the validation split,
    # while the value reported to Optuna below is plain accuracy.
    model.fit(X_train, Y_train, eval_set = [(X_validation, Y_validation)], early_stopping_rounds = 200, verbose = 0)

    preds = model.predict(X_validation)
    return accuracy_score(Y_validation, preds)
        
if __name__ == '__main__':
    # Maximise hold-out accuracy; stop after 600 trials or 7200 seconds, whichever comes first
    study = optuna.create_study(direction = 'maximize')
    study.optimize(objective, n_trials = 600, timeout = 7200, gc_after_trial = True)

    print('Number of finished trials: {}'.format(len(study.trials)))
    print('Best trial:')

    trial = study.best_trial
    summary_lines = ['  Value: {}'.format(trial.value), '  Params: ']
    for key, value in trial.params.items():
        summary_lines.append('    {}: {}'.format(key, value))
    for line in summary_lines:
        print(line)

Last try:
* early_stopping_rounds = 200
* objective = "multi:softprob"
* eval_metric = 'auc'

Optuna output
* Number of finished trials: 35
* Best trial:
*   Value: **0.9729555352438303**
*   Params: 
    * subsample: 0.9948234285094822
    * colsample_bytree: 0.4903348888738705
    * eta: 0.39418000146045795
    * alpha: 0.20745559213060122
    * max_depth: 7
    
accuracy_score(Y_validation, Y_predicted)
* -

Public Score
* -

In [None]:
optuna.visualization.plot_slice(study)

## 3.1. Save trials

In [None]:
# Export every trial of the study as a table and write it to trials.csv
# in the working directory (without the DataFrame index).
trials = study.trials_dataframe()
trials.to_csv('trials.csv', index = False)

# 4. Models

In [None]:
# Feature matrix / label vector and the same 85/15 hold-out split as in objective().
# NOTE(review): only 'target' and 'target_num' are dropped, so 'row_id' stays in X
# as a feature; X_test keeps 'row_id' as well, which keeps the column sets aligned.
X = train_i.drop(columns = ['target', 'target_num'])
Y = train_i['target']
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, train_size = 0.85, random_state = 42)
X_test = test_i#.drop(columns = ['row_id'])

### 4.1. Single

# Single model built from the best Optuna trial's hyperparameters.
# NOTE(review): objective / eval_metric here ("multi:softmax" / 'mlogloss') differ
# from the ones used while tuning ("multi:softprob" / 'auc') - confirm this is intended.
# The fit / predict cells below are commented out, so this model is only constructed.
model = XGBClassifier(random_state = 42,
                      objective = "multi:softmax", #multi:softprob if eval_metric = 'auc'
                      nthread = -1,
                      tree_method = 'gpu_hist',
                      eval_metric = 'mlogloss', #try merror, mlogloss or auc
                      **study.best_trial.params
)

In [None]:
#model.fit(X_train, Y_train, eval_set = [(X_validation, Y_validation)], early_stopping_rounds = 400, verbose = 0)

In [None]:
#Y_predicted = model.predict(X_validation)

In [None]:
#accuracy_score(Y_validation, Y_predicted)

In [None]:
#balanced_accuracy_score(Y_validation, Y_predicted) #just for comparison

In [None]:
# confusion_matrix orders its rows/columns by sorted label value. The tick labels
# therefore have to use the same sorted order; train['target'].unique() returns
# the classes in order of first appearance and would put wrong names on the axes.
class_labels = sorted(train['target'].unique())

# The single-model fit / predict cells above are commented out. Only draw the
# matrix when their output exists, so a fresh "Run All" does not stop here on
# an undefined name.
if 'Y_predicted' in globals():
    conf = confusion_matrix(Y_validation, Y_predicted, labels = class_labels)

    fig, axes = plt.subplots(figsize = (25, 10))
    # Each cell is shown as a share of all validation samples
    ax = sns.heatmap(conf / np.sum(conf),
                     annot = True,
                     fmt = '.2%',
                     cmap = 'Blues',
                     xticklabels = class_labels,
                     yticklabels = class_labels,
                     ax = axes)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    plt.xticks(rotation = 42, ha = 'right')
    plt.yticks(rotation = 0)
    plt.show()
else:
    print('Y_predicted is not defined (single-model cells are disabled); skipping the confusion matrix.')

### 4.2. CV

In [None]:
# K-fold ensemble: one XGBoost model per fold, each predicting the whole test set.
cv = KFold(n_splits = folds,
           shuffle = True,
           random_state = random_state)

# NOTE(review): 'row_id' stays in X as a feature, matching the columns of X_test.
X = train_i.drop(columns = ['target', 'target_num'])
Y = train_i['target']

# Convert once: the array conversion does not change between folds,
# so there is no reason to redo it inside the loop.
X_values = X.values
Y_values = Y.values

# Test-set predictions per fold, keyed by fold number; assembled into a frame
# after the loop instead of growing an empty DataFrame column by column.
fold_test_preds = {}

for i, (train_index, val_index) in enumerate(cv.split(X, Y)):

    print("-" * 50)
    print(f"Fold {i + 1} of {folds}, ensemble")

    x_train, x_val = X_values[train_index], X_values[val_index]
    y_train, y_val = Y_values[train_index], Y_values[val_index]

    ens_model = XGBClassifier(random_state = random_state,
                      objective = "multi:softprob", #multi:softprob if eval_metric = 'auc'
                      nthread = -1,
                      tree_method = 'gpu_hist',
                      eval_metric = 'auc', #try merror, mlogloss or auc
                      **study.best_trial.params
    )

    # Early stopping is driven by the eval_metric on this fold's validation part
    ens_model.fit(x_train, y_train,
                  eval_set = [(x_val, y_val)],
                  early_stopping_rounds = 200,
                  verbose = 0)

    # Predictions on the training part were never used, so they are no longer computed.
    val_preds = ens_model.predict(x_val)

    print('Accuracy => {}'.format(accuracy_score(y_val, val_preds)))

    fold_test_preds[i] = ens_model.predict(X_test).tolist()

print("-" * 50)

# One column per fold (0 .. folds - 1), one row per test sample
test_preds = pd.DataFrame(fold_test_preds, columns = range(0, folds))

In [None]:
# Majority vote per test row: the first column of the row-wise mode is the winning label
fold_votes = test_preds.mode(axis = 1)
test_preds['target'] = fold_votes[0]
test_preds.head()

### 4.3. If no CV:

In [None]:
#predictions = model.predict(X_test).tolist()

# 5. Submission

In [None]:
# Use the cross-validated majority vote as the submitted label
# (assigned by index, so submission and test_preds must share the same row order).
submission['target'] = test_preds['target']
#submission['target'] = predictions#['target']

### 5.1. Some plots

In [None]:
# Class frequencies of the predicted labels
fig, axes = plt.subplots(figsize = (15, 10))
sns.histplot(data = submission, x = 'target', fill = True, ax = axes)
plt.setp(axes.get_xticklabels(), rotation = 42, ha = 'right')
plt.show()

In [None]:
# Same plot as in 2.2, repeated here for a side-by-side look at the submission histogram
fig, axes = plt.subplots(figsize = (15, 10))
sns.histplot(data = train_i, x = 'target', fill = True, ax = axes)
plt.setp(axes.get_xticklabels(), rotation = 42, ha = 'right')
plt.show()

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv', index = False)

In [None]:
coli.shape, fergusonii.shape

In [None]:
# Per-column mean of each species, taken from the 'mean' row of describe()
means = pd.DataFrame({
    'ferg_mean': fergusonii.describe().loc['mean'],
    'coli_mean': coli.describe().loc['mean'],
})
means.head()

In [None]:
# Both mean profiles on one axes, one line per species
fig, axes = plt.subplots(figsize = (25, 15))
sns.lineplot(data = means, ax = axes)
plt.setp(axes.get_xticklabels(), rotation = 42, ha = 'right')
plt.show()