## Introduction

The task of this competition is to classify 10 different bacteria species using data from a genomic 
analysis technique that has some data compression and data loss. 
The dataset used for this competition is derived from this [paper](https://www.frontiersin.org/articles/10.3389/fmicb.2020.00257/full).

Submissions are evaluated based on their categorization accuracy.

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings 

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings('ignore')

## Reading the dataset

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train_df = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')

In [None]:
# Display the training frame using the notebook's rich repr.
train_df

## Basic EDA

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

In [None]:
# Summary statistics for the numeric columns of the training frame.
train_df.describe()

### Observation:
* Classes are balanced 

## Data Preparation 

In [None]:
# Drop the identifier column from both frames. Rebinding instead of mutating
# in place, together with `errors='ignore'`, keeps this cell re-runnable:
# a second execution no longer raises KeyError for the missing column.
train_df = train_df.drop(columns=['row_id'], errors='ignore')
test_df = test_df.drop(columns=['row_id'], errors='ignore')

# Name of the label column.
TARGET = 'target'
# Every remaining training column except the label is a model input.
FEATURES = [col for col in train_df.columns if col != TARGET]

In [None]:
# Per-column summary statistics, most variable columns (largest std) first.
# NOTE(review): `iloc[:, :-1]` skips the last column — presumably the target; confirm column order.
train_df.iloc[:, :-1].describe().T.sort_values(by='std' , ascending = False)

## Null Values

In [None]:
# Per-column count of missing values for the train and test frames (shown as a tuple).
train_df.isna().sum(), test_df.isna().sum()

### Observations:
* No NULL VALUES 

### Categorical Features 
Features having less than 25 unique values are considered to be categorical features here.

In [None]:
# Count distinct values per feature over train and test combined (computed once),
# then split the features at the 25-unique-values threshold.
combined_nunique = pd.concat([train_df[FEATURES], test_df[FEATURES]], axis=0).nunique()

cat_features = [name for name in FEATURES if combined_nunique[name] < 25]
cont_features = [name for name in FEATURES if combined_nunique[name] >= 25]

# The counts are only needed for the split above.
del combined_nunique

### Handling Duplicates 

In [None]:
# (rows, columns) before duplicate removal, for comparison with the next cell.
train_df.shape

In [None]:
# Remove exact duplicate rows (keeping the first occurrence), then rebuild a
# contiguous 0..n-1 index. Without the reset the index keeps gaps, so positional
# indices (such as those returned by `StratifiedKFold.split`) would no longer
# coincide with the frame's index labels.
train_df = train_df.drop_duplicates(keep='first').reset_index(drop=True)
train_df.shape

In [None]:
# Feature matrix and label vector used by the feature-ranking / selection cells.
X, y = train_df[FEATURES], train_df[TARGET]

### Understanding the Feature Importance 

In [None]:
# Fit an extremely-randomised-trees ensemble and rank features by its importances.
from sklearn.ensemble import ExtraTreesClassifier

extra_tree_forest = ExtraTreesClassifier().fit(X, y)  # fit() returns the estimator itself

# One-column frame (column label 0) of importances, indexed by feature name.
ranked_features = pd.DataFrame(extra_tree_forest.feature_importances_, index=X.columns)
# Highest importance first, truncated to the top 285 rows.
ranked = ranked_features.sort_values(by=0, ascending=False).iloc[0:285]
new_cols = ranked.index

In [None]:
# Horizontal bar chart of the 285 largest ExtraTrees feature importances.
ranked_series = pd.Series(extra_tree_forest.feature_importances_, index=X.columns)
top_importances = ranked_series.nlargest(285)
ax = top_importances.plot.barh(figsize=(30, 80))
ax.set(xlabel="Score", ylabel="Features")
plt.show()

### Feature Selection 
* Selecting the best 250 features (`k=250` in the cell below)

In [None]:
# Score every feature by mutual information with the target and keep the 250 best.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

sel_features = SelectKBest(mutual_info_classif, k=250).fit(X, y)  # fit() returns the selector

# Boolean support mask -> names of the retained columns.
selected_mask = sel_features.get_support()
keep_columns = X.columns[selected_mask]
print(keep_columns)

In [None]:
# Display the column names retained by the feature selector.
keep_columns

In [None]:
# Restrict both frames to the selected features. `.copy()` makes each result an
# independent frame, so the column assignments here and in later cells write to
# real data instead of to a slice of the original (avoids SettingWithCopyWarning
# and the risk of the assignment being silently lost).
selected_train = train_df[keep_columns].copy()
# Re-attach the label; the row index is unchanged, so values align one-to-one.
selected_train[TARGET] = train_df[TARGET]

test_df = test_df[keep_columns].copy()
train_df = selected_train
del selected_train

In [None]:
# Display the test frame after it has been restricted to the selected columns.
test_df

In [None]:
# Feature list after applying SelectKBest: every remaining column except the label.
# Kept as a plain list because a later cell extends it in place.
FEATURES = [name for name in train_df.columns if name != TARGET]

### Basic Feature Engineering 

In [None]:
# Add per-row aggregate features (mean / std / max / min over the selected
# columns) to both frames. The aggregates are all computed from the same base
# columns, because FEATURES is only extended after both frames are done.
for frame in (train_df, test_df):
    base_values = frame[FEATURES]
    frame['mean'] = base_values.mean(axis=1)
    frame['std'] = base_values.std(axis=1)
    frame['max'] = base_values.max(axis=1)
    frame['min'] = base_values.min(axis=1)

# Register the new columns as model inputs.
FEATURES.extend(['mean', 'std', 'max', 'min'])

In [None]:
from sklearn.model_selection import StratifiedKFold

# Tag every training row with the fold in which it serves as validation data.
train_df['kfold'] = -1  # sentinel: "not yet assigned"

# Fixed seed so the fold assignment (and every score derived from it) is reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# `split` yields *positional* row indices, so write through `.iloc`.
# Using `.loc` would treat them as index labels, which is wrong whenever the
# index is not a contiguous 0..n-1 range (e.g. after dropping duplicate rows).
kfold_position = train_df.columns.get_loc('kfold')
for fold, (_, valid_positions) in enumerate(skf.split(X=train_df[FEATURES], y=train_df[TARGET])):
    train_df.iloc[valid_positions, kfold_position] = fold

# Every row must belong to exactly one validation fold.
assert train_df['kfold'].ge(0).all(), "some rows were not assigned to a fold"

In [None]:
# Number of rows per value of the `kfold` column.
train_df.kfold.value_counts()

In [None]:
# One histogram of the target per fold, to eyeball the class distribution in each fold.
for fold_id in range(5):
    in_fold = train_df['kfold'] == fold_id
    train_df.loc[in_fold, 'target'].hist(figsize=(25, 5))
    plt.show()

### Modelling - XGBoost Classifier

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping, then replace the target column with the codes.
# `encoder` is kept so predictions can be mapped back to the original labels later.
encoder = LabelEncoder().fit(train_df[TARGET])
train_df[TARGET] = encoder.transform(train_df[TARGET])

In [None]:
# Baseline: XGBClassifier with default hyperparameters, cross-validated on the
# 5 pre-assigned folds. `XGBClassifier` and `accuracy_score` come from the
# imports cell at the top of the notebook.
prediction = []  # one array of test-set class predictions per fold
score = []       # validation accuracy per fold

# Loop-invariant: test rows restricted to exactly the training columns, in the same order.
X_test = test_df[FEATURES]

for fold in range(5):
    train_part = train_df[train_df.kfold != fold].reset_index(drop=True)
    valid_part = train_df[train_df.kfold == fold].reset_index(drop=True)

    # dependent variables
    y_train = train_part[TARGET]
    y_val = valid_part[TARGET]

    # independent variables
    X_train = train_part[FEATURES]
    X_val = valid_part[FEATURES]

    # XGBClassifier modelling; the validation fold doubles as the early-stopping set.
    model = XGBClassifier(tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_val, y_val)], verbose=False)

    preds_valid = model.predict(X_val)

    # Predict the test set with this fold's model; the folds are combined later.
    prediction.append(model.predict(X_test))

    # Score
    accuracy = accuracy_score(y_val, preds_valid)
    score.append(accuracy)
    print(f"fold:{fold},accuracy:{accuracy}")

# Mean and spread of the validation accuracy across folds.
print(np.mean(score),np.std(score))





### Hyperparameter Tuning using Optuna 

In [None]:
import optuna 

def hyp_optimizer(trial):
    """Optuna objective: validation accuracy of an XGBoost classifier on fold 0.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the fold-0 validation rows (the study maximises this).

    Notes
    -----
    Reads the notebook globals ``train_df``, ``FEATURES`` and ``TARGET``.
    The fold-0 validation rows are used both for early stopping and for the
    returned score, so the score may be optimistically biased.
    """
    fold = 0

    # Hyperparameters for XGBoost. `suggest_float(..., log=True)` replaces the
    # deprecated `suggest_loguniform` and samples from the same log-uniform range.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    train_part = train_df[train_df.kfold != fold].reset_index(drop=True)
    valid_part = train_df[train_df.kfold == fold].reset_index(drop=True)

    # dependent variables
    y_train = train_part[TARGET]
    y_val = valid_part[TARGET]

    # independent variables, scaled with statistics learned from the training part only
    scaler = RobustScaler()
    X_train = scaler.fit_transform(train_part[FEATURES])
    X_val = scaler.transform(valid_part[FEATURES])

    # XGBClassifier modelling
    model = XGBClassifier(
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        n_estimators=1000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_val, y_val)], verbose=False)

    preds_valid = model.predict(X_val)
    return accuracy_score(y_val, preds_valid)


In [None]:
# Maximise the objective's return value (validation accuracy) over 100 trials.
study = optuna.create_study(direction='maximize')
study.optimize(hyp_optimizer, n_trials=100)

In [None]:
# Hyperparameters of the best trial found by the study.
print(study.best_params)

Training the model again with the best parameters

In [None]:
# Cross-validated training with the tuned hyperparameters; each fold's model
# also predicts the test set.
prediction = []  # one array of test-set class predictions per fold
score = []       # validation accuracy per fold

# Best parameters from the hyperparameter search, hard-coded so this cell can
# be run without repeating the search.
best_params = dict(
    n_estimators=1000,
    learning_rate=0.08995307165736259,
    reg_lambda=0.0027207496457059585,
    reg_alpha=8.932602131887264e-06,
    subsample=0.8874060637993741,
    colsample_bytree=0.7727343069497347,
    max_depth=7,
)

# Iterate over the folds that were actually assigned (ids 0..max). The previous
# hard-coded `range(10)` ran past the 5 existing folds, producing empty
# validation sets for folds 5-9.
n_folds = int(train_df['kfold'].max()) + 1

for fold in range(n_folds):
    train_part = train_df[train_df.kfold != fold].reset_index(drop=True)
    valid_part = train_df[train_df.kfold == fold].reset_index(drop=True)

    # dependent variables
    y_train = train_part[TARGET]
    y_val = valid_part[TARGET]

    # independent variables: the scaler is fit on this fold's training part only
    # and then applied to the validation and test rows. The test frame is
    # restricted to FEATURES so its columns match the training columns exactly.
    scaler = RobustScaler()
    X_train = scaler.fit_transform(train_part[FEATURES])
    X_val = scaler.transform(valid_part[FEATURES])
    X_test = scaler.transform(test_df[FEATURES])

    # XGBClassifier modelling; the validation fold doubles as the early-stopping set.
    model = XGBClassifier(tree_method='gpu_hist',
                          gpu_id=0,
                          predictor='gpu_predictor',
                          **best_params)
    model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_val, y_val)], verbose=False)

    preds_valid = model.predict(X_val)

    # Predict the test set with this fold's model; the folds are combined later.
    prediction.append(model.predict(X_test))

    # Score
    accuracy = accuracy_score(y_val, preds_valid)
    score.append(accuracy)
    print(f"fold:{fold},accuracy:{accuracy}")

# Mean and spread of the validation accuracy across folds.
print(np.mean(score),np.std(score))

In [None]:
# Load the sample submission file; a later cell copies it and overwrites its "target" column.
submission_df = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')
submission_df

In [None]:
from scipy.stats import mode

# Stack the per-fold predictions as columns (one row per test sample) and take
# the most frequent class in each row, i.e. a majority vote across folds.
fold_votes = np.column_stack(prediction)
majority = mode(fold_votes, axis=1)[0]
# squeeze() drops any length-1 axis so the result is a flat vector of int class codes.
majority_codes = np.squeeze(majority).astype('int')

# Map the integer codes back to the original labels and write the submission file.
xgb_submission = submission_df.copy()
xgb_submission["target"] = encoder.inverse_transform(majority_codes)
xgb_submission.to_csv("xgb-subs_v.csv", index=False)
xgb_submission.head()

In [None]:
# Display the list of per-fold test prediction arrays.
prediction