## Extra Trees Classifier Optimization Using Optuna Hyperparameter Tuning

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import optuna
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

### Loading the Train and Test Datasets into DataFrames

In [None]:
# All three competition files live in the same Kaggle input folder. Build every
# path from one constant so the locations cannot drift apart (the sample
# submission used to be read through a relative '../input' path while the other
# two used the absolute one).
DATA_DIR = '/kaggle/input/tabular-playground-series-feb-2022'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

### EDA

In [None]:
# Show the first five training rows.
train.head(5)

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Number of training rows per target class.
train['target'].value_counts()

#### Insights

* There are 10 different classes of bacteria in total.
* The classes are balanced, so there is no need to resample the data.

#### Removing duplicate rows, if any

In [None]:
# Feature columns shared by train and test: every test column except the id.
# Compare with `!=` rather than `not in ('row_id')`: parentheses without a
# trailing comma are just a string, so that form is a substring test and would
# silently drop any column whose name happens to be a substring of 'row_id'.
cols = [col for col in test.columns if col != 'row_id']

# Keep only the first occurrence of each distinct feature vector in train.
train.drop_duplicates(subset=cols, keep='first', inplace=True)

In [None]:
# (rows, columns) of the de-duplicated train frame and of the test frame.
(train.shape, test.shape)

In [None]:
# Every train column other than the label; show how many there are.
features = [name for name in train.columns if name != 'target']
len(features)

In [None]:
# Show the first two training rows.
train.head(2)

#### Checking the intersection between Train and Test

In [None]:

# Inner-join train with test on the shared feature columns, leaving only the
# rows whose feature values occur in both frames. Both frames carry a `row_id`
# column, which pandas suffixes as `row_id_x` (train) and `row_id_y` (test).
comb = train.merge(test, on=cols, how='inner')

In [None]:
# Show the first three overlapping rows.
comb.head(3)

In [None]:
# (rows, columns) of the train/test overlap frame.
comb.shape
# for i in range(len(comb)):
#     print(comb.loc[i]['row_id_y'])

In [None]:
# Map each overlapping test row id (`row_id_y`) to the id of the train row with
# identical feature values (`row_id_x`). Zipping the two columns builds the
# dict in one pass; the previous index loop materialised a full row Series
# twice per iteration via `comb.loc[i]`. As before, a repeated test id keeps
# the last train id seen.
train_test_map = dict(zip(comb['row_id_y'], comb['row_id_x']))

In [None]:
# Number of distinct test row ids that matched a train row.
len(train_test_map)

In [None]:
# print(train_test_map)

In [None]:
# Group on the feature columns only. `row_id` must be left out of the keys:
# it is the per-row identifier (it is what the submission is joined on), so
# grouping by it puts every row in its own group and COUNT is always 1.
cols = [col for col in train.columns if 'target' not in col and col != 'row_id']
# print(cols)

# COUNT = number of rows in the same frame sharing this exact feature vector.
# NOTE(review): train was de-duplicated on these columns earlier, so its COUNT
# stays 1; compute it before dropping duplicates if the multiplicity is wanted.
train['COUNT'] = train.groupby(cols)['A0T0G0C10'].transform('size')
test['COUNT'] = test.groupby(cols)['A0T0G0C10'].transform('size')

In [None]:
# Columns fed to the model. Besides the raw label, leave out:
#   * 'row_id'     - a row identifier (the key the submission is joined on),
#                    not a measurement; train and test ids do not overlap in a
#                    meaningful way, so it can only add noise.
#   * 'target_enc' - the encoded label, which would leak the answer if this
#                    cell is re-run after the encoding cell.
NON_FEATURES = ('target', 'target_enc', 'row_id')
features = [col for col in train.columns if col not in NON_FEATURES]

In [None]:
# def create_features(df):
#     """
#     Created multiple features...
#     """    
#     df['F_sum'] = df[features].sum(axis = 1)
#     df['F_min'] = df[features].min(axis = 1)
#     df['F_max'] = df[features].max(axis = 1)    
#     df['F_std'] = df[features].std(axis = 1)
#     df['F_mad'] = df[features].mad(axis = 1)
#     df['F_var'] = df[features].var(axis = 1)
#     df['F_mean'] = df[features].mean(axis = 1)
#     df['F_positive'] = df.select_dtypes(include='float64').gt(0).sum(axis=1)
    
#     return df

#### Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class-name -> integer mapping from the training labels, then store
# the encoded labels in a new column next to the original strings.
target_encoder = LabelEncoder()
target_encoder.fit(train['target'])
train['target_enc'] = target_encoder.transform(train['target'])

In [None]:
# Show the first two training rows, now including the encoded label.
train.head(2)

In [None]:
# Model inputs (feature columns) and the integer-encoded label.
X, y = train[features], train['target_enc']

In [None]:
# Shapes of the feature matrix and of the label vector.
(X.shape, y.shape)

In [None]:
N_SPLITS = 10
# Seed the shuffle so the fold assignment is the same on every run; without a
# random_state the splits (and every score derived from them) change each time.
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=69)

In [None]:
from sklearn.model_selection import StratifiedKFold

N_SPLITS = 10
# Seed the shuffle so the fold assignment, and therefore the printed scores,
# are reproducible from run to run.
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=69)

# Baseline (hand-picked) hyperparameters.
n_estimators = 128
max_depth = 64
min_samples_split = 3
min_samples_leaf = 1
criterion = 'gini'

scores = []   # validation accuracy, one entry per fold
y_probs = []  # test-set class probabilities, one array per fold

for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
    # The split yields positional indices, hence .iloc.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    model = ExtraTreesClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 criterion=criterion,
                                 random_state=69,
                                 n_jobs=-1)
    model.fit(X_train, y_train)

    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)

    print("Fold:", fold, "Accuracy:", valid_score)
    scores.append(valid_score)
    y_probs.append(model.predict_proba(test[features]))

In [None]:
# Average validation accuracy over all folds.
print("Mean accuracy score:", np.mean(scores))

#### Optuna Model Configuration For Optimizing Parameters 

In [None]:
N_SPLITS = 10
# Seed the shuffle so the fold assignment is the same on every run; without a
# random_state the splits (and every score derived from them) change each time.
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=69)

In [None]:
# Single hold-out split shared by every Optuna trial. Stratify on the label so
# both parts keep the same class mix, and seed it so that trial scores are
# comparable between runs instead of depending on a fresh random split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, random_state=69
)


def objective(trial):
    """Optuna objective: hold-out accuracy of an ExtraTreesClassifier.

    Samples one hyperparameter configuration from `trial`, fits the classifier
    on the module-level `X_train` / `y_train`, and scores it on the module-level
    `X_valid` / `y_valid`.

    Args:
        trial: the Optuna trial used to draw the hyperparameters.

    Returns:
        The value of `clf.score(X_valid, y_valid)` (to be maximised).

    NOTE: the cross-validation cells rebind `X_train`, `X_valid`, `y_train`
    and `y_valid`; re-run the split above before starting a study.
    """
    n_estimators = trial.suggest_int("n_estimators", 8, 2048)
    max_depth = trial.suggest_int("max_depth", 4, 2048)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 16)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 8)
    criterion = trial.suggest_categorical("criterion", ['gini', 'entropy'])

    clf = ExtraTreesClassifier(n_estimators=n_estimators,
                               max_depth=max_depth,
                               min_samples_split=min_samples_split,
                               min_samples_leaf=min_samples_leaf,
                               criterion=criterion,
                               random_state=69,
                               # Use every core, as the CV cells do; with a
                               # fixed random_state this does not change the
                               # fitted model, only the wall-clock time.
                               n_jobs=-1,
                               )

    clf.fit(X_train, y_train)
    return clf.score(X_valid, y_valid)
# study = optuna.create_study(direction = "maximize")
# study.optimize(objective, n_trials = 30)


In [None]:
# parameters = study.best_params
# parameters

#### Retraining the Model with the Parameters Optimized by Optuna

In [None]:
from sklearn.model_selection import StratifiedKFold

N_SPLITS = 10
# Seed the shuffle so the fold assignment, and therefore the printed scores
# and the averaged test probabilities, are reproducible from run to run.
folds = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=69)

# Tuned hyperparameters.
# NOTE(review): n_estimators and max_depth lie outside the ranges searched by
# `objective` (8-2048 and 4-2048); presumably they come from a study run with
# different bounds - confirm.
n_estimators = 2373
max_depth = 3691
min_samples_split = 3
min_samples_leaf = 1
criterion = 'gini'

scores = []   # validation accuracy, one entry per fold
y_probs = []  # test-set class probabilities, one array per fold

for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
    # The split yields positional indices, hence .iloc.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    model = ExtraTreesClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf,
                                 criterion=criterion,
                                 random_state=69,
                                 n_jobs=-1)
    model.fit(X_train, y_train)

    valid_pred = model.predict(X_valid)
    valid_score = accuracy_score(y_valid, valid_pred)

    print("Fold:", fold, "Accuracy:", valid_score)
    scores.append(valid_score)
    y_probs.append(model.predict_proba(test[features]))

In [None]:
# Number of rows in the test frame.
len(test)

In [None]:
# Average validation accuracy over all folds.
print("Mean accuracy score:", np.mean(scores))

In [None]:
# Average the per-fold test probabilities into a single matrix.
y_prob = sum(y_probs) / len(y_probs)
print(y_prob)

# Fixed per-class offsets added to every row before picking the winning class.
# NOTE(review): ten hand-entered values, one per class column; their origin is
# not shown here - confirm before reusing.
class_offsets = np.array([0, 0, 0.03, 0.036, 0, 0, 0, 0.027, 0, 0])
y_prob = y_prob + class_offsets
print(y_prob)

# Highest-scoring class per row, translated back to the original label names.
best_class = y_prob.argmax(axis=1)
y_pred_tuned = target_encoder.inverse_transform(best_class)
print(y_pred_tuned)

# Share of each predicted label, as a percentage of the test rows.
predicted = pd.Series(y_pred_tuned, index=test.index)
predicted.value_counts().sort_index() / len(test) * 100

In [None]:
# For every test row that is identical to a train row, copy the train label
# into the submission. The train labels are looked up once and applied with a
# single map; scanning `sub` and `train` with a boolean mask for every key was
# quadratic in the number of matches.
# NOTE(review): any later wholesale assignment to sub['target'] discards these
# overrides - confirm the cell order.
train_target_by_id = train.drop_duplicates('row_id').set_index('row_id')['target']
known_targets = {
    test_id: train_target_by_id[train_id]
    for test_id, train_id in train_test_map.items()
}
matched = sub['row_id'].map(known_targets)  # NaN where the row has no match
sub['target'] = sub['target'].where(matched.isna(), matched)

In [None]:
# Start from the model predictions...
sub["target"] = y_pred_tuned

# ...then put back the known labels for test rows that are identical to a
# train row. This has to happen after the assignment above: assigning the
# predictions wholesale replaces the whole column, so overrides written to
# `sub` beforehand are lost.
train_target_by_id = train.drop_duplicates('row_id').set_index('row_id')['target']
known_targets = {
    test_id: train_target_by_id[train_id]
    for test_id, train_id in train_test_map.items()
}
matched = sub['row_id'].map(known_targets)  # NaN where the row has no match
sub['target'] = sub['target'].where(matched.isna(), matched)

sub.to_csv("submission.csv", index=False)
sub

In [None]:
# Number of rows in the submission frame.
sub.shape[0]