In [None]:
import numpy as np
import pandas as pd
import plotly as py
from statistics import mean
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from umap import UMAP

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold

import optuna

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import ExtraTreesClassifier

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate

# Seed NumPy's and TensorFlow's global random generators with one shared value.
SEED = 228
for set_global_seed in (np.random.seed, tf.random.set_seed):
    set_global_seed(SEED)

# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Train and test are indexed by `row_id`; the sample submission keeps its
# default integer index so `row_id` stays a regular column.
train, test = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (
        '../input/tabular-playground-series-feb-2022/train.csv',
        '../input/tabular-playground-series-feb-2022/test.csv',
    )
)
ss = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes and report the memory saved.

    The frame is modified in place and also returned.

    * ``object`` columns become ``category``.
    * NumPy integer columns (signed or unsigned) are cast to the smallest of
      int8/int16/int32/int64 whose range contains the column's min and max
      (bounds inclusive). Columns that fit none of them are left unchanged.
    * NumPy float columns are cast to float16 (if allowed), else float32,
      else float64, chosen by value range only.
    * Every other dtype (bool, datetime, category, nullable extension
      dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    allow_float16 : bool, default True
        When False, float columns are never cast below float32. float16 is
        selected purely on range and keeps only about 3 significant digits,
        so values can be rounded noticeably.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('MEMORY USAGE OF DATAFRAME IS: {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, strings, ...) are not
        # NumPy dtypes; min/max or the cast itself may not be defined for them.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            c_min, c_max = df[col].min(), df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating):
            c_min, c_max = df[col].min(), df[col].max()
            # Fall back to float64 when no smaller type covers the range
            # (also the case for all-NaN columns, where comparisons are False).
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)
        # Anything else (bool, datetime, timedelta, ...) has no numeric
        # downcast here and is kept as is.

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('MEMORY USAGE AFTER OPTIMIZATION IS: {:.2f} MB'.format(end_mem))
    print('DECREASED BY: {:.1f} %'.format((start_mem - end_mem) / start_mem * 100))

    return df

# Shrink both frames (train first, then test) before any further processing.
train, test = map(reduce_mem_usage, (train, test))

In [None]:
def info(data):
    """Print a structural summary of ``data``.

    Reports the row count, the dtype of every column and how many columns
    share each dtype, per-column and total missing values, the number of
    duplicated rows, and, when a ``target`` column exists, its number of
    distinct values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    n_rows = len(data)
    print(f'LENGTH OF DATA:\n{n_rows}\n')

    info_dtypes = pd.Series(data.dtypes.tolist(), dtype=object)
    dtype_counts = info_dtypes.value_counts()
    print(f'INFO_DTYPES:\n{info_dtypes}\n')
    print(f'VALUE_COUNTS:\n{dtype_counts}\n')
    print(f'TO_FRAME:\n{dtype_counts.to_frame()}\n')
    print(f'RESET_INDEX:\n{dtype_counts.to_frame().reset_index()}\n')
    # Name the columns explicitly: the default names produced by
    # value_counts()/reset_index() differ between pandas major versions.
    dtype_table = dtype_counts.rename_axis('TYPE').reset_index(name='COUNT')
    print(f'RENAME:\n{dtype_table}\n')

    missing_per_column = data.isna().sum()
    total_missing = missing_per_column.sum()
    print(f'DATA.ISNA.SUM:\n{missing_per_column}\n')
    print(f'DATA.ISNA.SUM.SUM:\n{total_missing}\n')

    if total_missing > 0:
        # Share of missing *cells*: data.size is rows x columns, so the
        # figure cannot exceed 100 %.
        missing_pct = round(total_missing / data.size * 100, 2)
        print(f'MISSING_VALUES:\n{total_missing} ({missing_pct} %)\n')
    else:
        print(f'MISSING VALUES: {total_missing}\n')

    n_duplicated = data.duplicated().sum()
    if n_duplicated > 0:
        duplicated_pct = round(n_duplicated / n_rows * 100, 2)
        print(f'DUPLICATED VALUES:\n{n_duplicated} ({duplicated_pct} %)\n')
    else:
        print(f'DUPLICATED VALUES:\n{n_duplicated}\n')

    # The test frame has no target column; skip it explicitly instead of
    # swallowing every exception.
    if 'target' in data.columns:
        n_classes = data['target'].nunique()
        print(f'DATA.TARGET.NUNIQUE:\n{n_classes}\n')

# Summarise both frames, separated by a dashed rule.
datasets = [
    ('TRAINING DATASET INFORMATION:\n', train),
    ('TEST DATASET INFORMATION:\n', test),
]
for position, (header, frame) in enumerate(datasets):
    if position:
        print(30 * '-')
    print(header)
    info(frame)

In [None]:
# Lift the row limit so the whole summary table is rendered.
pd.options.display.max_rows = None

# Summary statistics for every column except the last one (presumably the
# target; confirm), ordered by mean and shaded by value.
feature_summary = train.iloc[:, :-1].describe().T
feature_summary.sort_values('mean')[['mean', 'std', 'min', 'max']].style.background_gradient(cmap='Purples')

In [None]:
pd.options.display.max_rows = 60

# Class frequencies with explicit column names: 'index' holds the class label
# and 'target' holds its row count. The default names produced by
# value_counts().reset_index() changed between pandas major versions, so they
# are fixed here instead of being relied upon.
target_counts = train['target'].value_counts().rename_axis('index').reset_index(name='target')

plt.figure(figsize=(12, 4))
sns.set_style('white')
plt.title('TARGET DISTRIBUTION WITHOUT DUPLICATES DROPPING',
          fontname='monospace',
          fontsize=18,
          color='#ff008c',
          x=0.5,
          y=1.1)
ax = sns.barplot(data=target_counts,
                 x='target',
                 y='index',
                 palette='PiYG',
                 linestyle='-',
                 linewidth=1,
                 edgecolor='#001a19')
plt.xticks([])
plt.yticks(fontname='monospace', size=12, color='#0dff00')
plt.xlabel('')
plt.ylabel('')

for spine in ['right', 'top', 'left', 'bottom']:
    ax.spines[spine].set_visible(False)

# Annotate every bar with its share of all rows. The label is placed a fixed
# 900 count units to the right of the bar end (x axis is in row counts).
label_offset = 900
for patch in ax.patches:
    width = patch.get_width()
    plt.text(label_offset + width,
             patch.get_y() + 0.6 * patch.get_height(),
             f'{round(width / len(train) * 100, 2)} %',
             ha='center',
             va='center',
             fontname='monospace',
             fontsize=12,
             color='#ff00d4')

plt.show()

In [None]:
# Keep only the first occurrence of each fully identical row.
train = train.drop_duplicates()

In [None]:
# Class frequencies with explicit column names: 'index' holds the class label
# and 'target' holds its row count. The default names produced by
# value_counts().reset_index() changed between pandas major versions, so they
# are fixed here instead of being relied upon.
target_counts = train['target'].value_counts().rename_axis('index').reset_index(name='target')

plt.figure(figsize=(12, 4))
sns.set_style('white')
plt.title('TARGET DISTRIBUTION WITH DUPLICATES DROPPING',
          fontname='monospace',
          fontsize=18,
          color='#006161',
          x=0.5,
          y=1.1)
ax = sns.barplot(data=target_counts,
                 x='target',
                 y='index',
                 palette='Spectral',
                 linestyle='-',
                 linewidth=1,
                 edgecolor='#005927')
plt.xticks([])
plt.yticks(fontname='monospace', size=12, color='#e64e02')
plt.xlabel('')
plt.ylabel('')

for spine in ['right', 'top', 'left', 'bottom']:
    ax.spines[spine].set_visible(False)

# Annotate every bar with its share of all rows. The label is placed a fixed
# 900 count units to the right of the bar end (x axis is in row counts).
label_offset = 900
for patch in ax.patches:
    width = patch.get_width()
    plt.text(label_offset + width,
             patch.get_y() + 0.6 * patch.get_height(),
             f'{round(width / len(train) * 100, 2)} %',
             ha='center',
             va='center',
             fontname='monospace',
             fontsize=12,
             color='red')

plt.show()

In [None]:
# Integer code for every target class, ordered by descending frequency
# (0 = most frequent). `retarget` maps class -> code and `retarget_2` maps
# code -> class. The classes are read from the value_counts() index directly,
# which does not depend on the column names produced by reset_index().
# NOTE: this cell overwrites train['target'] with the codes, so running it a
# second time would rebuild both dicts from the codes and lose the class names.
classes_by_count = train['target'].value_counts().index
retarget = {target_class: code for code, target_class in enumerate(classes_by_count)}
retarget_2 = {code: target_class for target_class, code in retarget.items()}

train['target'] = train['target'].map(retarget)

# Draw the sample once so the embedding input, the labels given to UMAP and
# the plot hue all come from the same rows. The size is capped at the number
# of available rows.
umap_sample = train.sample(min(15000, len(train)), random_state=228)

umap = UMAP(n_components=2,
            n_neighbors=10,
            min_dist=0.99).fit_transform(umap_sample.drop('target', axis=1),
                                         umap_sample['target'])

plt.figure(figsize=(15, 12))
plt.title('TARGET UMAP', size=20, y=1.03, fontname='monospace')
scatter_umap = sns.scatterplot(x=umap[:, 0],
                               y=umap[:, 1],
                               hue=umap_sample['target'],
                               s=5,
                               edgecolor='none',
                               alpha=0.8,
                               palette='hsv',
                               legend='full')  # one legend entry per class code
plt.xticks([])
plt.yticks([])
for spine in ['right', 'top', 'left']:
    scatter_umap.spines[spine].set_visible(False)

scatter_umap.text(12, -23, 
                  '''n_components=2
n_neighbors=10
min_dist=0.99''',
                  fontname='monospace',
                  fontsize=12)

# Build the legend from the handles seaborn registered for each hue level and
# translate each level label (the class code as text) back to the class name.
# Passing only `labels=` would pair names with artists purely by position,
# which can attach a name to the wrong colour.
legend_handles, legend_codes = scatter_umap.get_legend_handles_labels()
code_to_name = {str(code): target_class for code, target_class in retarget_2.items()}
plt.legend(handles=legend_handles,
           labels=[code_to_name.get(code, code) for code in legend_codes],
           title='BACTERIA SPECIES',
           ncol=2,
           borderpad=1,
           frameon=False,
           fontsize=12,
           bbox_to_anchor=(0.5, 0))
plt.show()

In [None]:
# Names of the row-wise summary statistics appended as extra model inputs.
stat_features = ['std', 'min', 'max']

# Select the raw input columns by name rather than by position. A positional
# `columns[0:-1]` slice only works while 'target' is the last column, which
# stops being true once this cell has appended the statistic columns, so a
# re-run would pull 'target' into the feature list.
features = [col for col in train.columns
            if col != 'target' and col not in stat_features]

def statistics(df):
    """Add row-wise ``std``, ``min`` and ``max`` columns to ``df`` in place.

    The statistics are computed over the columns listed in the module-level
    ``features`` list as it stands when the function is called. Nothing is
    returned.
    """
    raw_values = df[features]
    df['std'] = raw_values.std(axis=1)
    df['min'] = raw_values.min(axis=1)
    df['max'] = raw_values.max(axis=1)

statistics(train)
statistics(test)

features += stat_features

# Encode the target as consecutive integers for the classifier.
le = LabelEncoder()
train['target'] = le.fit_transform(train['target'])

# Standardise with statistics learned on train only, then apply them to test.
sc = StandardScaler()

train[features] = sc.fit_transform(train[features])
test[features] = sc.transform(test[features])

X = train[features]
y = train['target']

In [None]:
# Keyword arguments unpacked into XGBClassifier(**xgb_params) in the
# cross-validation cell.
# NOTE(review): the long decimal values look like the result of a
# hyperparameter search (optuna is imported at the top of the notebook), but
# no search is shown here; confirm where they came from.
xgb_params = {
    'max_depth': 8,
    'learning_rate': 0.2478225904887278,
    'min_child_weight': 8,
    'gamma': 0.018329940112279165,
    'alpha': 0.00019394894279195157,
    'lambda': 0.06161761858777205,
    'colsample_bytree': 0.6721122683333417,
    'subsample': 0.6155733760919804,
    'n_estimators': 3000,  # upper bound; the CV cell fits with early_stopping_rounds=30
#     'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'random_state': 228,  # same seed value as used for NumPy/TensorFlow and the CV split
    'use_label_encoder': False,
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',  # metric reported on the eval_set passed to fit()
#     'predictor': 'gpu_predictor'
}

In [None]:
# One array of test-set class predictions and one validation accuracy per fold.
predictions, scores = [], []

skf = StratifiedKFold(n_splits=10, random_state=228, shuffle=True)
for i, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[trn_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = XGBClassifier(**xgb_params)
    # The validation fold serves both as the early-stopping monitor and as
    # the scoring set, so the reported accuracy is somewhat optimistic.
    # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on
    # the constructor instead of fit(); confirm against the installed version.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False, early_stopping_rounds=30)

    val_pred = model.predict(X_val)
    val_score = accuracy_score(y_val, val_pred)
    print(f'FOLD: {i + 1} ACCURACY SCORE: {round(val_score, 4)}')

    scores.append(val_score)
    # Predict on exactly the columns (and column order) the model was fitted
    # on, rather than on whatever columns `test` currently holds.
    predictions.append(model.predict(test[features]))

print(f'\nMEAN ACCURACY: {round(mean(scores), 4)}')

In [None]:
# Hard-voting ensemble: for every test row take the class code predicted by
# most folds. np.ravel flattens the result because, depending on the SciPy
# version, stats.mode returns it with or without a trailing length-1 axis.
fold_votes = np.column_stack(predictions)
ss['target'] = np.ravel(stats.mode(fold_votes, axis=1)[0])

# Translate the integer codes back to the original class names.
ss['target'] = ss['target'].map(retarget_2)

# index=False: `ss` was read without index_col, so its index is only a row
# counter and would otherwise be written as an extra unnamed first column.
ss.to_csv('submission_02.csv', index=False)