# 0. Introduction

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Notebook-wide constants.

ID = 'row_id'        # name of the unique row-identifier column
TARGET = 'target'    # name of the label column
RANDOM_SEED = 42     # seed passed to the train/validation split

In [None]:
# Load the competition train and test CSVs with the default integer index.
# The commented-out lines are an alternative that would use the ID column as the index instead.
#train_df = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/train.csv", index_col=ID) #alternative approach
#test_df = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/test.csv", index_col=ID)
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-feb-2022/test.csv")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Summarise the training frame's columns, dtypes and non-null counts.
train_df.info()

# 1. EDA

In [None]:
# Number of training rows per target class (class-balance check).
train_df.groupby(TARGET)[TARGET].count()

Drop the unique ID column from both datasets.

In [None]:
# Remove the row-identifier column from both frames; it carries no predictive signal.
# errors='ignore' makes this cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
train_df.drop(columns=ID, inplace=True, errors='ignore')
test_df.drop(columns=ID, inplace=True, errors='ignore')

Thanks to [@teckmengwong](https://www.kaggle.com/teckmengwong) for the duplicate identification [here](https://www.kaggle.com/c/tabular-playground-series-feb-2022/discussion/305364#1701706)

In [None]:
# Count the fully duplicated rows in the training data.
train_df.duplicated().sum()

In [None]:
def drop_duplicates(df):
    """Remove duplicated rows from ``df`` in place and print a short report.

    The first occurrence of each duplicated row is kept. The frame is modified
    in place (its index is not reset) and nothing is returned.

    Args:
        df: pandas DataFrame to de-duplicate.
    """
    df.drop_duplicates(keep='first', inplace=True)

    print('Data shape:', df.shape)
    # Re-count after dropping as a sanity check on the result.
    print('Duplicates:{0}'.format(df.duplicated().sum()))

In [None]:
# De-duplicate the training frame in place (the test frame is left untouched).
drop_duplicates(train_df)

Label-encode the target column, which is stored as an object (string) dtype.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target column and replace it with the encoded labels.
# `label` is kept so the submission cell can map predictions back with inverse_transform.
label = LabelEncoder()
train_df[TARGET]=label.fit_transform(train_df[TARGET])
train_df.head()

In [None]:
# Detach the label column: `pop` returns it as a Series and removes it from train_df in place.
y = train_df.pop(TARGET)

In [None]:
# Every remaining column except the target is a model feature.
features = [col for col in train_df.columns if col != TARGET]

Thanks to [@ambrosm](https://www.kaggle.com/ambrosm) for [this](https://www.kaggle.com/ambrosm/tpsfeb22-01-eda-which-makes-sense) EDA analysis in which he identifies the GCD feature

In [None]:
from math import factorial

def bias_of(s):
    """Return the bias term encoded by a feature column name.

    The name is parsed as ``A<w>T<x>G<y>C<z>``: the four integers are read from
    between the letters (the first character is skipped) and the result is
    ``10! / (w! * x! * y! * z! * 4**10)``.

    Args:
        s: column name such as ``'A2T3G2C3'``.

    Returns:
        The bias as a float.

    Raises:
        ValueError: if a letter is missing or a count is not a valid
            non-negative integer.
    """
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    counts = (
        int(s[1:t_pos]),
        int(s[t_pos + 1:g_pos]),
        int(s[g_pos + 1:c_pos]),
        int(s[c_pos + 1:]),
    )
    # Build the integer denominator first so only one true division happens.
    denominator = 4**10
    for count in counts:
        denominator *= factorial(count)
    return factorial(10) / denominator

I wrapped the scaling in a function so that it can be reused for both the train and test sets.

In [None]:
def scale_model_with_bias(df):
    """Build a new integer frame from the columns listed in the global ``features``.

    For each feature column, the bias derived from its name (``bias_of``) is
    added, the result is multiplied by 1,000,000, rounded and cast to int.
    Columns of ``df`` that are not in ``features`` are not carried over.

    Args:
        df: DataFrame containing every column named in ``features``.

    Returns:
        A new DataFrame with one integer column per feature.
    """
    scaled = {}
    for col in features:
        shifted = (df[col] + bias_of(col)) * 1000000
        scaled[col] = shifted.round().astype(int)
    return pd.DataFrame(scaled)

In [None]:
# Replace both frames with their bias-corrected, integer-scaled versions.
# NOTE(review): this cell must run before `features` is extended with the aggregate
# column names further down; bias_of cannot parse names such as 'mean'.
train_df=scale_model_with_bias(train_df)
test_df= scale_model_with_bias(test_df)

train_df.head()

# 2. Feature Engineering

Basic feature engineering along with the above GCD feature

In [None]:
def feature_eng(df):
    """Add row-wise aggregate columns computed over the global ``features``.

    Adds ``mean``, ``std``, ``min``, ``max`` and ``gcd`` columns to ``df``
    in place and also returns the same frame.

    Args:
        df: DataFrame containing every column named in ``features``.

    Returns:
        ``df`` itself, with the five new columns added.
    """
    for stat in ('mean', 'std', 'min', 'max'):
        df[stat] = getattr(df[features], stat)(axis=1)
    # Row-wise greatest common divisor across all feature columns.
    df['gcd'] = np.gcd.reduce(df[features], axis=1)
    return df

In [None]:
# Add the aggregate columns to both frames, then register their names in `features`
# so later cells that select `features` include them.
# NOTE(review): this cell is not idempotent; a second run appends the names to
# `features` again and recomputes the aggregates over columns that already include them.
train_df = feature_eng(train_df)
test_df = feature_eng(test_df)

features.extend(['mean', 'std', 'min', 'max','gcd'])

In [None]:
# Preview the training frame with the engineered columns added.
train_df.head()

As mentioned by Ambrosm, we see that there are four gcd values (1, 10, 1000 and 10000) with equal frequencies

In [None]:
# Distinct gcd values and their counts, for train and test side by side.
np.unique(train_df['gcd'], return_counts=True), np.unique(test_df['gcd'], return_counts=True) 

In [None]:
# Same gcd frequency check for the training data, shown as a pandas Series.
train_df.groupby('gcd')['gcd'].count()

In [None]:
# Confirm the row/column counts of both frames after feature engineering.
train_df.shape, test_df.shape

In [None]:
# X is another name for train_df (no copy is made); y was split off earlier.
X=train_df
X.shape, y.shape

# 3. Model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation split, seeded for reproducibility; the split size is the
# library default because no test_size/train_size is passed.
X_train, X_valid, y_train, y_valid = train_test_split(X,y, random_state=RANDOM_SEED)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

In [None]:
import xgboost as xgb

# Hyper-parameters shared by every XGBClassifier built in this notebook.
xgbc_params = {
    "objective": "multi:softmax",  # multi-class classification
    "num_class": 10,               # number of classes in the labels
    "use_label_encoder": False,    # target is already encoded with LabelEncoder; avoids warnings
    "tree_method": "gpu_hist",     # train on the GPU
    'predictor': 'gpu_predictor',
    "max_depth": 10,               # chosen empirically; it improved the score
}

# Registry of models keyed by a short name.
models = {"XGB": xgb.XGBClassifier(**xgbc_params)}

In [None]:
%%time 

# Fetch the classifier from the registry and train it on the training split.
# Both the training and validation splits are passed as eval sets so that
# evals_result() later exposes an mlogloss curve for each of them.
xgb_classifier = models.get('XGB')

xgb_classifier.fit(X_train,
               y_train
               ,eval_set = [(X_train, y_train), (X_valid, y_valid)]
               ,eval_metric=['mlogloss'] # metric for multi-class classification
               ,verbose=False
               #,early_stopping_rounds=50 #useless - model is fooled by duplicate data in test data
               #,sample_weight=sample_weight[:X_train.shape[0]] #for me no improvement in score with sample weight
                  )

In [None]:
# Per-round evaluation history; 'validation_0' and 'validation_1' correspond to the
# first and second entries of the eval_set passed to fit (train and validation here).
eval_result = xgb_classifier.evals_result()
# One x-axis position per recorded boosting round.
training_rounds = range(len(eval_result['validation_0']['mlogloss']))

In [None]:
import matplotlib.pyplot as plt

def plot_logloss(rounds, eval_result):
    """Scatter-plot training and validation mlogloss against boosting round.

    Args:
        rounds: x-axis values, one per recorded boosting round. It must have the
            same length as the mlogloss lists in ``eval_result``.
        eval_result: dict as returned by ``evals_result()``, with
            ``'validation_0'`` (plotted as training) and ``'validation_1'``
            (plotted as validation) entries that each hold an ``'mlogloss'`` list.
    """
    # Use the caller-supplied `rounds`, not the notebook-global `training_rounds`:
    # different models can train for a different number of rounds.
    plt.scatter(x=rounds,y=eval_result['validation_0']['mlogloss'],label='Training Error')
    plt.scatter(x=rounds,y=eval_result['validation_1']['mlogloss'],label='Validation Error')
    plt.grid(True)
    plt.xlabel('Iteration')
    plt.ylabel('LogLoss')
    plt.title('Training Vs Validation Error')
    plt.legend()
    plt.show()

In [None]:
# Plot the learning curves of the first XGBoost model.
plot_logloss(training_rounds,eval_result)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
# Predict on the held-out validation split and evaluate.
y_pred = xgb_classifier.predict(X_valid)
predictions = [round(value) for value in y_pred]
# Despite its name, this holds class probabilities for the validation split.
dtrain_predprob = xgb_classifier.predict_proba(X_valid)
accuracy = accuracy_score(y_valid, predictions)

print("Accuracy={}".format(round(accuracy,6)))
# The AUC is computed against y_valid, so it is labelled as a validation score.
print("AUC Score(Valid)={}".format(round(roc_auc_score(y_valid, dtrain_predprob, multi_class='ovr'),6)))

**3.1 Hyperparameter Tuning using XGBoost**

In [None]:
def buildandTrainModelwithCV(alg, X, y,predictors,useTrainCV, cv_folds, early_stopping_rounds,
                             X_val=None, y_val=None):
    """Optionally tune the number of boosting rounds with CV, fit ``alg``, and report metrics.

    Args:
        alg: an ``xgb.XGBClassifier`` instance; it is modified in place
            (``n_estimators`` may be updated, and the model is fitted).
        X: training features.
        y: training labels.
        predictors: list of feature names. Currently unused; kept so existing
            callers do not break.
        useTrainCV: when true, run ``xgb.cv`` first and set ``n_estimators`` to
            the number of rounds in the CV result.
        cv_folds: number of CV folds passed to ``xgb.cv``.
        early_stopping_rounds: early-stopping patience passed to ``xgb.cv``.
        X_val: optional held-out features used for the second eval set and for
            the printed report. Defaults to ``None``.
        y_val: optional held-out labels matching ``X_val``.

    Returns:
        The dict from ``alg.evals_result()``. ``'validation_0'`` is the training
        data; ``'validation_1'`` is the held-out data when provided, otherwise
        the training data again.

    Notes:
        The report is computed on the data passed to this function, never on
        notebook globals. Without ``X_val``/``y_val`` the metrics are measured on
        the training data and are labelled "Training"; they are optimistic and
        must not be read as validation scores.
    """
    has_holdout = X_val is not None and y_val is not None
    if has_holdout:
        eval_X, eval_y, eval_name = X_val, y_val, "Validation"
    else:
        eval_X, eval_y, eval_name = X, y, "Training"

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(data=X, label=y)

        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='mlogloss',
                          early_stopping_rounds=early_stopping_rounds)
        # One row per boosting round kept by the CV run.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit on the training data; keep two eval sets so both
    # 'validation_0' and 'validation_1' exist in the returned history.
    alg.fit(X, y, eval_metric=['mlogloss'],
            eval_set=[(X, y), (eval_X, eval_y)],
            verbose=False)

    # Score accuracy and AUC on the same data set so the report is consistent.
    eval_predictions = [round(value) for value in alg.predict(eval_X)]
    eval_predprob = alg.predict_proba(eval_X)
    accuracy = accuracy_score(eval_y, eval_predictions)

    # Print model report:
    print("\nModel Report")
    print("{} Accuracy={}".format(eval_name, round(accuracy, 6)))
    print("AUC Score ({})={}".format(eval_name, roc_auc_score(eval_y, eval_predprob, multi_class='ovr')))

    return alg.evals_result()

          

In [None]:
# Fresh classifier with the same hyper-parameters, used for the CV-tuned run.
xgb1 = xgb.XGBClassifier(**xgbc_params)

In [None]:
%%time
# Tune the round count with 5-fold CV (early-stopping patience of 50), then fit on the full X, y.
# Note that X contains the rows of X_valid, so no data is held out from this fit.
eval_result=buildandTrainModelwithCV(xgb1, X, y, features, True, 5, 50)

When I got an AUC score of 1.0, I was sure there was a bug in my model or noise in my training and validation data (which I only understood after reading Ambrosm's EDA notebook).

In [None]:
# Build an x-axis sized to this model's own evaluation history and plot its curves.
plot_logloss_rounds = range(len(eval_result['validation_0']['mlogloss']))
plot_logloss(plot_logloss_rounds,eval_result)

In [None]:

# Predict encoded class labels for the test set with the CV-tuned model,
# then round each value to a plain Python int.
test_predictions = xgb1.predict(test_df[features])
test_predictions = [round(value) for value in test_predictions]   

# 4. Retrospective

Sharing my lesson learnt in this competition:

1. I tried GridSearchCV to find optimal parameters; however, the XGBoost parameters used above give better accuracy than the best parameters found by GridSearchCV.
2. Standard features such as min, max, std and mean improve the score by approximately 0.02, and the max_depth parameter alone improved the score by approximately 0.02.
3. Creating a base model (template model) without EDA and FE helped me save some time compared to previous competitions.
4. First, reading the paper is very important; second, understanding it is important (which I underestimated).
5. Reading good discussions helped in identifying key findings such as the duplicate data, and seeing good code such as Ambrosm's EDA reminded me of this point: *observe the data closely* before doing any FE and modeling.

**Thanks to every Kaggler who shared their code, replied to my questions and for good discussion. I'm grateful to Kaggle community.**

**PS:** Please feel free to comment/suggest/blame anything in this notebook. Thanks in advance.

# 5. Submission

In [None]:
# Load the sample submission file to use as the template for the output.
sub_df = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/sample_submission.csv')

In [None]:
# Map the encoded predictions back to the original class names with the fitted encoder.
sub_df[TARGET]=label.inverse_transform(test_predictions)

sub_df.head()

In [None]:
# Write the submission file to the working directory without the index column.
sub_df.to_csv('submission.csv', index=False)