In [None]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from matplotlib import ticker
from scipy.stats import mode
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Cross-validation settings shared by every model trained below.
FOLDS = 5
RANDOM_STATE = 12

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Display options: no truncation of rows or columns, and floats rendered
# with the fixed-point '{:f}' format.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('float_format', '{:f}'.format),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the training set, the test set and the sample submission template.
train, test, submission = map(pd.read_csv, (
    '../input/tabular-playground-series-feb-2022/train.csv',
    '../input/tabular-playground-series-feb-2022/test.csv',
    '../input/tabular-playground-series-feb-2022/sample_submission.csv',
))

In [None]:
# Print pandas' summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:

# Report every column except the last one in the frame that has at most 100
# distinct values: its name, its positional index and the distinct count.
for col_idx, col_name in enumerate(train.columns[:-1]):
    n_distinct = train.iloc[:, col_idx].nunique()
    if n_distinct > 100:
        continue
    print(f'{col_name} {col_idx}: {n_distinct}')


In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Size of the training frame: rows, columns, non-null cells and missing cells.
train_rows, train_cols = train.shape
train_values = train.count().sum()
train_missing = sum(train.isna().sum())

print(f'\033[93mTRAINING DATA ROWS NUMBER: {train_rows}')
print(f'\033[96mTRAINING DATA COLS NUMBER: {train_cols}')
print(f'\033[94mTRAINING DATA VALUES NUMBER: {train_values}')
print(f'\033[95mTRAINING DATA MISSING VALUES NUMBER: {train_missing}')

In [None]:
# Descriptive statistics for the numeric columns of the training frame.
train.describe()

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Size of the test frame: rows, columns, non-null cells and missing cells.
test_rows, test_cols = test.shape
test_values = test.count().sum()
test_missing = sum(test.isna().sum())

print(f'\033[92mTEST DATA ROWS NUMBER: {test_rows}')
print(f'\033[96mTEST DATA COLS NUMBER: {test_cols}')
print(f'\033[95mTEST DATA VALUES NUMBER: {test_values}')
print(f'\033[91mTEST DATA MISSING VALUES NUMBER: {test_missing}')

In [None]:
# Descriptive statistics for the numeric columns of the test frame.
test.describe()

In [None]:
# Preview the sample submission template that the submission cells copy.
submission.head()

In [None]:
TARGET = 'target'

# Drop the `row_id` column from both frames. Re-assigning the result (rather
# than `inplace=True`) together with `errors='ignore'` keeps the cell safe to
# re-run: a second execution no longer raises KeyError once the column is gone.
train = train.drop(columns='row_id', errors='ignore')
test = test.drop(columns='row_id', errors='ignore')

# Every remaining column except the target is treated as a model feature.
FEATURES = [col for col in train.columns if col not in ('row_id', TARGET)]

In [None]:
# Count distinct values per feature over train and test together, so both
# frames agree on which features are treated as categorical.
combined = pd.concat([train[FEATURES], test[FEATURES]], axis=0)
unique_counts = combined.nunique()
del combined  # the stacked copy is only needed for the counts

# Fewer than 25 distinct values -> categorical, otherwise continuous.
cat_train_test_features = [col for col in FEATURES if unique_counts[col] < 25]
cont_train_test_features = [col for col in FEATURES if unique_counts[col] >= 25]

# ANSI colour codes must end with 'm'; without it the terminal shows no
# colour and swallows the first letter of the message.
print(f'\033[93mTOTAL FEATURES NUMBER: {len(FEATURES)}')
print(f'\033[95mCATEGORICAL FEATURES NUMBER: {len(cat_train_test_features)}')
print(f'\033[91mCONTINUOUS FEATURES NUMBER: {len(cont_train_test_features)}')

# Share of categorical vs. continuous features.
plt.pie([len(cat_train_test_features), len(cont_train_test_features)],
        labels=['CATEGORICAL', 'CONTINUOUS'],
        colors=['#c40500', '#12f9ff'],
        textprops={'fontsize': 16},
        autopct='%1.1f%%')
plt.show()

In [None]:
def EDA(features, rows, cols, width, height, train_color, test_color):
    """Plot train-vs-test density curves for a batch of features.

    One subplot is drawn per feature on a ``rows`` x ``cols`` grid, filled in
    row-major order. Each subplot overlays the KDE of the feature in the
    module-level ``train`` and ``test`` frames.

    Parameters
    ----------
    features : sequence of str
        Column names to plot. If there are fewer names than grid cells, the
        unused cells are hidden; names beyond ``rows * cols`` are not plotted.
    rows, cols : int
        Shape of the subplot grid.
    width, height : float
        Figure size passed to matplotlib as ``figsize=(width, height)``.
    train_color, test_color : str
        Colours of the train and test density curves.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row/column.
    fig, axes = plt.subplots(rows, cols, figsize=(width, height), squeeze=False)
    for position, ax in enumerate(axes.flat):
        if position >= len(features):
            # No feature left for this cell: hide it instead of raising IndexError.
            ax.set_visible(False)
            continue
        current_col = features[position]
        sns.kdeplot(x=train[current_col], ax=ax, color=train_color, label='TRAIN DATA', fill=True)
        sns.kdeplot(x=test[current_col], ax=ax, color=test_color, label='TEST DATA', fill=True)
        ax.legend()
        ax.set_ylabel('')
        ax.set_xlabel(current_col, fontsize=8)
        ax.tick_params(labelsize=5, width=0.5)
        ax.xaxis.offsetText.set_fontsize(6)
        ax.yaxis.offsetText.set_fontsize(4)

    # Must be a call: a bare `plt.show` only references the function.
    plt.show()

# Density plots for the continuous features, drawn in three batches; each
# entry is (features, rows, cols, width, height, train colour, test colour).
eda_batches = (
    (cont_train_test_features[:100], 20, 5, 25, 15 * 4, '#00bfc4', '#c400bf'),
    (cont_train_test_features[100:200], 20, 5, 25, 15 * 4, '#f9ff12', '#83ff12'),
    (cont_train_test_features[200:], 15, 5, 25, 45, '#8e12ff', '#83ff12'),
)
for eda_args in eda_batches:
    EDA(*eda_args)

In [None]:
print('\033[94mCATEGORICAL FEATURES NUMBER')
print('\033[92mALL FEATURE DISTRIBUTION WITH'
      ' LESS THAN 25 UNIQUE VALUES PLOTTED ABOVE'
      ' WITH CONTINUOUS FEATURE DISTRIBUTION')
print('\033[95mUNIQUE VALUE COUNT OF THE CATEGORICAL FEATURES:')

# One line per categorical feature: "<name> - <distinct values in train>".
for cat in cat_train_test_features:
    n_unique = train[cat].nunique()
    print(cat, n_unique, sep=' - ')

In [None]:
# Rows per target class, with the class labels moved out of the index into
# a regular column.
target_df = train[TARGET].value_counts().to_frame().reset_index()
target_df

In [None]:
# Name the two columns explicitly: the class label and its row count.
target_df.columns = [TARGET, 'count']
target_df

In [None]:
# Bar chart of rows per target class, coloured by the count.
fig = px.bar(target_df, x=TARGET, y='count', color='count', color_continuous_scale='ylgn')
fig.update_layout(template='ggplot2')

# Print each class's share of the training rows as a percentage.
total_rows = train.shape[0]
for target, class_count in zip(target_df['target'], target_df['count']):
    print('\033[95m' + str(target) + '  \033[91mCATEGORY PERCENTAGE: {:.2F} %'.format(class_count * 100 / total_rows))
fig.show()

In [None]:
def basic_FE(df, axis):
    """Add 'mean', 'std', 'min' and 'max' columns to ``df`` in place.

    Each statistic is computed over the columns listed in the module-level
    ``FEATURES`` along ``axis``. All four are computed before any of them is
    written back, so they do not influence one another.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column in ``FEATURES``; it is modified in place.
    axis : int
        Axis handed to the pandas reductions (the notebook calls this with 1,
        i.e. one value per row).

    Returns
    -------
    None
    """
    feature_values = df[FEATURES]
    aggregates = [
        ('mean', feature_values.mean(axis=axis)),
        ('std', feature_values.std(axis=axis)),
        ('min', feature_values.min(axis=axis)),
        ('max', feature_values.max(axis=axis)),
    ]

    for column_name, column_values in aggregates:
        df[column_name] = column_values
# Add row-wise (axis=1) summary statistics to both frames and register them
# as model features. The guard keeps the cell idempotent: running it again
# would otherwise compute the statistics over the earlier statistic columns
# and append duplicate names to FEATURES.
STAT_FEATURES = ['mean', 'std', 'min', 'max']
if not set(STAT_FEATURES).issubset(FEATURES):
    basic_FE(train, 1)
    basic_FE(test, 1)

    FEATURES.extend(STAT_FEATURES)

In [None]:
# Preview the training frame again, now with the added statistic columns.
train.head()

In [None]:
# Preview the test frame again, now with the added statistic columns.
test.head()

In [None]:
# Replace the target labels with integer codes. `encoder` is kept because the
# submission cells use `encoder.inverse_transform` to map predictions back.
# (LabelEncoder is imported with the other dependencies in the first cell.)
encoder = LabelEncoder()
train[TARGET] = encoder.fit_transform(train[TARGET])

In [None]:
lgb_params = {
    # LightGBM's multi-class objective is spelled 'multiclass'; an unknown
    # objective name makes training fail.
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    # Silence training logs through the model parameters: the `verbose`
    # keyword of `fit()` is not accepted by LightGBM 4.x.
    'verbosity': -1,
#     'device': 'gpu'
}

lgb_predictions = []  # one array of test-set class predictions per fold
lgb_scores = []       # validation accuracy per fold
lgb_fimp = []         # one single-column feature-importance frame per fold

# Stratified K-fold: train on K-1 folds, score on the held-out fold, and
# predict the test set with every fold's model.
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train[FEATURES], train[TARGET])):
    print(10 * '=', f'FOLD: {fold + 1}', 10 * '=')
    start_time = time.time()

    X_train, X_valid = train.iloc[train_idx][FEATURES], train.iloc[valid_idx][FEATURES]
    y_train, y_valid = train[TARGET].iloc[train_idx], train[TARGET].iloc[valid_idx]

    model = LGBMClassifier(**lgb_params)
    model.fit(X_train, y_train)

    preds_valid = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds_valid)
    lgb_scores.append(acc)
    run_time = time.time() - start_time

    print(f'FOLD: {fold + 1}, ACCURACY: {acc:.2f}, RUN TIME: {run_time:.2f}s')
    # Column is named after the zero-based fold index, e.g. '0_importance'.
    fim = pd.DataFrame(index=FEATURES,
                       data=model.feature_importances_,
                       columns=[f'{fold}_importance'])
    lgb_fimp.append(fim)
    test_preds = model.predict(test[FEATURES])
    lgb_predictions.append(test_preds)

print('MEAN ACCURACY: ', np.mean(lgb_scores))

In [None]:
# Rank features by mean importance over all folds and keep the 15 strongest.
# Ranking must happen before truncating: taking `.head(15)` first would keep
# the first 15 features in FEATURES order, not the most important ones.
lgbm_fis_all = pd.concat(lgb_fimp, axis=1)
lgbm_top_positions = np.argsort(lgbm_fis_all.mean(axis=1).values)[-15:]
# Ascending order puts the most important feature at the top of the barh chart.
lgbm_fis_df = lgbm_fis_all.iloc[lgbm_top_positions]
lgbm_fis_df.plot(kind='barh', figsize=(15, 10),
                 title='FEATURE IMPORTANCE ACROSS FOLDS')
plt.show()

In [None]:
catb_params = {
    'objective': 'MultiClass',
#     'task_type': 'GPU'
}

# Per-fold collectors: test-set predictions, validation accuracy and
# feature-importance frames.
catb_predictions, catb_scores, catb_fimp = [], [], []

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train[FEATURES], train[TARGET])):

    print(10 * '=', f'FOLD: {fold + 1}', 10 * '=')
    start_time = time.time()

    # Split features and target into this fold's training and validation parts.
    X_train = train.iloc[train_idx][FEATURES]
    X_valid = train.iloc[valid_idx][FEATURES]
    y_train = train[TARGET].iloc[train_idx]
    y_valid = train[TARGET].iloc[valid_idx]

    model = CatBoostClassifier(**catb_params)
    model.fit(X_train, y_train, verbose=0)

    # Score the held-out fold; the timer covers fitting and validation scoring.
    preds_valid = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds_valid)
    catb_scores.append(acc)
    run_time = time.time() - start_time

    print(f'FOLD: {fold + 1}, ACCURACY: {acc:.2f}, RUN TIME: {run_time:.2f}s')

    # Importance column is named after the zero-based fold index.
    fim = pd.DataFrame(model.feature_importances_,
                       index=FEATURES,
                       columns=[f'{fold}_importance'])
    catb_fimp.append(fim)

    test_preds = model.predict(test[FEATURES])
    catb_predictions.append(test_preds)

print('MEAN ACCURACY: ', np.mean(catb_scores))

In [None]:
# Rank features by mean importance over all folds and keep the 15 strongest.
# Ranking must happen before truncating: taking `.head(15)` first would keep
# the first 15 features in FEATURES order, not the most important ones.
catb_fis_all = pd.concat(catb_fimp, axis=1)
catb_top_positions = np.argsort(catb_fis_all.mean(axis=1).values)[-15:]
# Ascending order puts the most important feature at the top of the barh chart.
catb_fis_df = catb_fis_all.iloc[catb_top_positions]
catb_fis_df.plot(kind='barh', figsize=(15, 10),
                 title='FEATURE IMPORTANCE ACROSS FOLDS')
plt.show()

In [None]:
# Majority vote: stack the per-fold test predictions as columns and take the
# most frequent class in each row.
lgb_fold_votes = np.column_stack(lgb_predictions)
lgb_majority = np.squeeze(mode(lgb_fold_votes, axis=1)[0]).astype(int)

# Map the integer codes back to the original labels and write the file.
lgb_submission = submission.copy()
lgb_submission[TARGET] = encoder.inverse_transform(lgb_majority)
lgb_submission.to_csv('lgb_submission_01.csv', index=False)
lgb_submission.head()

In [None]:
# Majority vote: stack the per-fold test predictions as columns and take the
# most frequent class in each row.
catb_fold_votes = np.column_stack(catb_predictions)
catb_majority = np.squeeze(mode(catb_fold_votes, axis=1)[0]).astype(int)

# Map the integer codes back to the original labels and write the file.
catb_submission = submission.copy()
catb_submission[TARGET] = encoder.inverse_transform(catb_majority)
catb_submission.to_csv('catb_submission_01.csv', index=False)
catb_submission.head()

In [None]:
xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
#     'tree_method': 'gpu_hist',
#     'predictor': 'gpu_predictor'
}

# Per-fold collectors: test-set predictions, validation accuracy and
# feature-importance frames.
xgb_predictions, xgb_scores, xgb_fimp = [], [], []

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, valid_idx) in enumerate(skf.split(train[FEATURES], train[TARGET])):

    print(10 * '=', f'FOLD:{fold + 1}', 10 * '=')
    start_time = time.time()

    # Split features and target into this fold's training and validation parts.
    X_train = train.iloc[train_idx][FEATURES]
    X_valid = train.iloc[valid_idx][FEATURES]
    y_train = train.iloc[train_idx][TARGET]
    y_valid = train.iloc[valid_idx][TARGET]

    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train, verbose=0)

    # Score the held-out fold; the timer covers fitting and validation scoring.
    preds_valid = model.predict(X_valid)
    acc = accuracy_score(y_valid, preds_valid)
    xgb_scores.append(acc)
    run_time = time.time() - start_time

    print(f'FOLD: {fold + 1} ACCURACY: {acc:.2f}, RUN TIME: {run_time:.2f}s')

    test_preds = model.predict(test[FEATURES])
    xgb_predictions.append(test_preds)

    # Importance column is named after the zero-based fold index.
    fim = pd.DataFrame(model.feature_importances_,
                       index=FEATURES,
                       columns=[f'{fold}_importance'])
    xgb_fimp.append(fim)

print('MEAN ACCURACY: ', np.mean(xgb_scores))

In [None]:
# Rank features by mean importance over all folds and keep the 15 strongest.
# Ranking must happen before truncating: taking `.head(15)` first would keep
# the first 15 features in FEATURES order, not the most important ones.
xgb_fis_all = pd.concat(xgb_fimp, axis=1)
xgb_top_positions = np.argsort(xgb_fis_all.mean(axis=1).values)[-15:]
# Ascending order puts the most important feature at the top of the barh chart.
xgb_fis_df = xgb_fis_all.iloc[xgb_top_positions]
xgb_fis_df.plot(kind='barh',
                figsize=(15, 10),
                title='FEATURE IMPORTANCE ACROSS FOLDS')
plt.show()

In [None]:
# Majority vote: stack the per-fold test predictions as columns and take the
# most frequent class in each row.
xgb_fold_votes = np.column_stack(xgb_predictions)
xgb_majority = np.squeeze(mode(xgb_fold_votes, axis=1)[0]).astype(int)

# Map the integer codes back to the original labels and write the file.
xgb_submission = submission.copy()
xgb_submission[TARGET] = encoder.inverse_transform(xgb_majority)
xgb_submission.to_csv('xgb_submission_01.csv', index=False)
xgb_submission.head()