In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Importing Important Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# Let pandas show up to 100 rows / 100 columns before truncating the display.
for option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option, 100)

import warnings
# Ignore every warning category from here on, so library warnings
# (deprecation notices included) are not shown in the cell outputs.
warnings.simplefilter('ignore')

In [None]:
# Central run configuration shared by the cells below.
CFG = dict(
    seed=2021,        # seed passed to seed_everything and to the CV splitter
    n_splits=5,       # number of stratified cross-validation folds
    verbose=0,        # not read by any of the visible cells
    target='target',  # name of the label column in the training frame
)

In [None]:
def seed_everything(seed=42):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # NOTE(review): PYTHONHASHSEED is exported after the interpreter has started;
    # presumably it only matters for processes launched later — confirm if
    # hash determinism in this process is required.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Apply the configured seed before any randomised step runs.
seed_everything(CFG['seed'])

In [None]:
def description(df):
    """Summarise every column of a numeric DataFrame, one output row per column.

    Args:
        df: DataFrame whose columns are all numeric; the NaN-aware NumPy
            reductions used here raise on non-numeric data.

    Returns:
        DataFrame with a default integer index and the columns ``Name``,
        ``dtypes``, ``Missing``, ``Uniques``, ``Mean``, ``Std``, ``Minimum``,
        ``Maximum``, ``First Value``, ``Second Value``, ``Third Value`` and
        ``dimension``. The four statistics ignore NaNs (``Std`` uses NumPy's
        default ``ddof=0``) and are stored as object-dtype columns. When
        ``df`` has fewer than three rows, the sample-value columns for the
        missing rows are filled with NaN.
    """
    values = np.asarray(df)
    n_rows, n_cols = df.shape

    def _row_values(position):
        # Short frames get NaN placeholders instead of an IndexError from iloc.
        if position < n_rows:
            return df.iloc[position].values
        return np.full(n_cols, np.nan)

    summary = pd.DataFrame({
        'Name': df.columns.to_numpy(),
        'dtypes': df.dtypes.values,
    })
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    reducers = (
        ('Mean', np.nanmean),
        ('Std', np.nanstd),
        ('Minimum', np.nanmin),
        ('Maximum', np.nanmax),
    )
    for label, reducer in reducers:
        # Explicit object cast: the previous `.astype(df.dtypes)` handed a pandas
        # Series to NumPy as a dtype, which resolved to object only indirectly.
        summary[label] = reducer(values, axis=0).astype(object)

    summary['First Value'] = _row_values(0)
    summary['Second Value'] = _row_values(1)
    summary['Third Value'] = _row_values(2)
    summary['dimension'] = str(df.shape)
    return summary

### Loading the dataset

In [None]:
# Read the three competition files in one pass: train, test, sample submission.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

# Model inputs are the columns whose name contains 'feature_'.
features = list(filter(lambda name: 'feature_' in name, train.columns))

#### Removing the duplicates in the train data

In [None]:
# Rows whose feature vector occurs more than once are removed entirely (every
# copy, not only the repeats); the target column takes no part in the comparison.
# An alternative that was tried and disabled: de-duplicating on features + target.
repeated_features = train.duplicated(subset=features, keep=False)
train = train.loc[~repeated_features].reset_index(drop=True)


#### Transforming the skewed data using a log(1 + x) transformation

In [None]:
# Apply log(1 + x) to the feature columns of both frames, in place.
# Re-running this cell transforms the data again, so run it once per kernel.
for frame in (train, test):
    frame[features] = np.log1p(frame[features])

### Preparing the dataset for modelling

In [None]:
def _label_to_index(label):
    """Return the integer after the last underscore in ``label``, minus one."""
    return int(label.rsplit("_", 1)[-1]) - 1


# Zero-based integer labels plus the feature-only views used for modelling.
target = train[CFG['target']].map(_label_to_index)
train_df = train.loc[:, features]
test_df = test.loc[:, features].reset_index(drop=True)

In [None]:
# Per-column summary of the training features, transposed so each feature is a column.
description(train_df).T

In [None]:
# Same summary for the test features, for side-by-side comparison with train.
description(test_df).T

### Creating One-VS-Rest Classifier

In [None]:
# Stratified K-fold training of a one-vs-rest HistGradientBoosting model.
# Produces out-of-fold probabilities for train (ovr_oof), fold-averaged
# probabilities for test (ovr_pred) and a per-fold score table (score.csv).
kf = StratifiedKFold(n_splits=CFG['n_splits'], shuffle=True, random_state=CFG['seed'])

# Class count comes from the labels instead of a hard-coded 9.
n_classes = target.nunique()

ovr_oof = np.zeros((train_df.shape[0], n_classes))
ovr_pred = 0

# The test matrix is the same for every fold, so bind it once.
X_test = test_df

score_list = []

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=train_df, y=target)):
    print(f"===== FOLD {fold} =====")

    X_train, y_train = train_df.iloc[trn_idx], target.iloc[trn_idx]
    X_valid, y_valid = train_df.iloc[val_idx], target.iloc[val_idx]

    # OVRClassifier: one binary gradient-boosting model per class.
    estimator = HistGradientBoostingClassifier(
        max_iter=250,
        validation_fraction=None,
        learning_rate=0.01,
        max_depth=10,
        min_samples_leaf=24,
        max_leaf_nodes=60,
        # Follow the shared config so one seed controls the whole run.
        random_state=CFG['seed']
    )
    clf = OneVsRestClassifier(estimator, n_jobs=-1)
    clf.fit(X_train, y_train)

    # Each training row is predicted exactly once, by the fold that held it out.
    ovr_oof[val_idx] = clf.predict_proba(X_valid)
    # Test predictions are the plain average over the fold models.
    ovr_pred += clf.predict_proba(X_test) / CFG['n_splits']
    fold_logloss = log_loss(y_valid, ovr_oof[val_idx])

    score_list.append([CFG['seed'], fold, fold_logloss])
    print("+-" * 40)
    print(f"fold {fold} ovr multi_logloss: {fold_logloss}\n")

# Overall score on the full set of out-of-fold predictions.
m_logloss = log_loss(target, ovr_oof)
score_list.append(['avg', 'oof', m_logloss])
score_df = pd.DataFrame(score_list, columns=['seed', 'fold', 'logloss_score'])
score_df.to_csv("score.csv", index=False)

print("+-" * 40)
print(f"multi_logloss: {m_logloss}")

### Confusion matrix

In [None]:
# Rows are the true classes, columns the classes with the highest
# out-of-fold probability.
cm = confusion_matrix(target, ovr_oof.argmax(axis=1))

fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(cm, annot=True, fmt='5d', cmap='Blues', ax=ax)
fig.savefig("confusion_matrix.png")

### Classification Report

In [None]:
# Hard labels from the out-of-fold probabilities, computed once and reused.
oof_labels = ovr_oof.argmax(axis=1)

# Text version for the notebook output.
print(classification_report(target, oof_labels, digits=4))

# Same report as a table, one row per class/aggregate, saved to disk.
report_dict = classification_report(target, oof_labels, digits=4, output_dict=True)
report = pd.DataFrame(report_dict).transpose()
report.to_csv("report.csv")

### Preparing for submission

In [None]:
# Overwrite every column after the first with the fold-averaged test probabilities.
# NOTE(review): assumes column 0 is the row id and the remaining columns are in the
# same order as the model's classes — confirm against sample_submission.csv.
submission.iloc[:, 1:] = ovr_pred  
submission.to_csv("submission_01.csv", index=False)

In [None]:
# Overlay the predicted-probability distribution of every class in one figure.
plt.figure(figsize=(16, 4), tight_layout=True)
for class_no in range(1, 10):
    column = f'Class_{class_no}'
    plt.hist(submission[column], label=column, bins=20, alpha=0.7)
plt.legend()