# Importing Libraries 

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.utils import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

# Read the Dataset

In [2]:
# Load the train and test CSVs (train first, then test) and preview the training frame.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
    )
)
train_df

Unnamed: 0,id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,target
0,0,0,0,1,0,1,0,0,0,0,...,0,0,21,0,0,0,0,0,0,Class_2
1,1,0,0,0,0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,2,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,13,2,0,Class_1
3,3,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,1,0,Class_4
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,Class_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,0,0,0,1,0,0,0,2,0,...,0,0,6,0,0,3,0,0,0,Class_1
99996,99996,0,0,0,0,0,0,0,1,2,...,0,2,0,0,0,10,0,0,0,Class_2
99997,99997,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,2,0,Class_3
99998,99998,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,4,0,Class_2


# Exploratory Data Analysis (Still in Progress)

## Checking for Missing Data 

In [3]:
# Count the missing entries in every column of the training frame.
train_df.isna().sum()

id            0
feature_0     0
feature_1     0
feature_2     0
feature_3     0
feature_4     0
feature_5     0
feature_6     0
feature_7     0
feature_8     0
feature_9     0
feature_10    0
feature_11    0
feature_12    0
feature_13    0
feature_14    0
feature_15    0
feature_16    0
feature_17    0
feature_18    0
feature_19    0
feature_20    0
feature_21    0
feature_22    0
feature_23    0
feature_24    0
feature_25    0
feature_26    0
feature_27    0
feature_28    0
feature_29    0
feature_30    0
feature_31    0
feature_32    0
feature_33    0
feature_34    0
feature_35    0
feature_36    0
feature_37    0
feature_38    0
feature_39    0
feature_40    0
feature_41    0
feature_42    0
feature_43    0
feature_44    0
feature_45    0
feature_46    0
feature_47    0
feature_48    0
feature_49    0
target        0
dtype: int64

## Target Value Distribution 

In [4]:
# Rows per target class, as a two-column frame: 'target' and 'Count'.
train_count_df = (
    train_df.groupby('target')['id']
    .count()
    .rename('Count')
    .reset_index()
)
train_count_df

Unnamed: 0,target,Count
0,Class_1,8490
1,Class_2,57497
2,Class_3,21420
3,Class_4,12593


In [5]:
# Pie chart of the class counts; each slice shows its label, raw count and percentage.
pie_target = (
    px.pie(train_count_df, names='target', values='Count')
    .update_layout(title_text='Target Value Distribution')
    .update_traces(texttemplate="%{label}: %{value} <br>(%{percent})", textposition='inside')
)
pie_target.show()

## Checking the type of data 

In [6]:
# Show the dtype pandas inferred for each column of the training frame.
train_df.dtypes

id             int64
feature_0      int64
feature_1      int64
feature_2      int64
feature_3      int64
feature_4      int64
feature_5      int64
feature_6      int64
feature_7      int64
feature_8      int64
feature_9      int64
feature_10     int64
feature_11     int64
feature_12     int64
feature_13     int64
feature_14     int64
feature_15     int64
feature_16     int64
feature_17     int64
feature_18     int64
feature_19     int64
feature_20     int64
feature_21     int64
feature_22     int64
feature_23     int64
feature_24     int64
feature_25     int64
feature_26     int64
feature_27     int64
feature_28     int64
feature_29     int64
feature_30     int64
feature_31     int64
feature_32     int64
feature_33     int64
feature_34     int64
feature_35     int64
feature_36     int64
feature_37     int64
feature_38     int64
feature_39     int64
feature_40     int64
feature_41     int64
feature_42     int64
feature_43     int64
feature_44     int64
feature_45     int64
feature_46   

In [7]:
# Empty accumulators: names of feature columns without / with negative values.
pos_col, neg_col = [], []

In [8]:
# Skip the first and last columns and sort every remaining column into
# pos_col (no value below zero) or neg_col (at least one value below zero).
for col in train_df.columns[1:-1]:
    has_negative = (train_df[col] < 0).any()
    (neg_col if has_negative else pos_col).append(col)

In [9]:
# Print both column lists, each under its heading, separated by a blank line.
print(
    '** Columns without negative values **\n',
    list(pos_col),
    '',
    '** Columns with negative values **\n',
    list(neg_col),
    sep='\n',
)

** Columns without negative values **

['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_33', 'feature_34', 'feature_36', 'feature_37', 'feature_40', 'feature_41', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49']

** Columns with negative values **

['feature_19', 'feature_30', 'feature_31', 'feature_32', 'feature_35', 'feature_38', 'feature_39', 'feature_42']


In [10]:
# Get all feature columns
feature_cols = train_df.columns[train_df.columns.str.startswith('feature_')]

# One record per feature whose test split contains values never seen in train.
# Records are collected in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
missmatch_records = []
for feature in feature_cols:
    # Sorted unique values for training and test data
    unique_vals_train = np.sort(train_df[feature].unique())
    unique_vals_test = np.sort(test_df[feature].unique())

    # Compare the contents rather than the counts: test can hold unseen values
    # even when train has more distinct values overall.
    unseen_in_train = np.setdiff1d(unique_vals_test, unique_vals_train)
    if unseen_in_train.size > 0:
        missmatch_records.append({
            'feature': feature,
            'train': unique_vals_train,
            'test': unique_vals_test,
        })

# Explicit columns keep the selection below valid even when nothing mismatches
missmatch_df = pd.DataFrame(missmatch_records, columns=['feature', 'train', 'test'])

# Display
missmatch_df[['feature', 'train', 'test']].style.set_caption('Missmatch in Unique Values between Train and Test').set_properties(subset=['test', 'train'], **{'width': '300px'})

Unnamed: 0,feature,train,test
0,feature_1,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 29 31],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31]
1,feature_3,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 26],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25]
2,feature_4,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31 32 33 34 35 37 38],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31 32 33 34 35 36 38]
3,feature_19,[-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21  22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45  46 47 48 49 50 51 52 53 54 55],[-2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21  22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 45 46  47 48 49 50 51 52 53 54 56 57]
4,feature_21,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 32 33 34 35 36],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 29 30 31 32 33 34 35]
5,feature_25,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 23],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 24]
6,feature_34,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 26]
7,feature_37,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
8,feature_40,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
9,feature_41,[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 30 32],[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23  24 25 26 27 28 29 31]


# Data Processing 

## Encoding the Target Variable

In [11]:
# Fit the encoder on the string class labels, print the label -> integer code
# mapping, then replace the target column with the integer codes.
le = LabelEncoder().fit(train_df['target'])
le_name_mapping = {
    class_name: class_code
    for class_name, class_code in zip(le.classes_, le.transform(le.classes_))
}
print(le_name_mapping)
train_df['target'] = le.transform(train_df['target'])

{'Class_1': 0, 'Class_2': 1, 'Class_3': 2, 'Class_4': 3}


## One Hot Encoding the Variables
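Because some features take values in the test set that never appear in the training set (see the mismatch table above), we concatenate train and test before fitting the encoder, so that both splits end up with the same one-hot columns.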

In [12]:
# Since train and test data have different unique values,
# we concatenate them to one giant dataset for the encoding process.
# The train row count is captured first so the split at the end does not
# depend on the order in which train_df and test_df are reassigned.
n_train_rows = len(train_df)
all_data = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

# One-hot encode the feature columns. They are cast to str first, presumably
# so the encoder treats every distinct value as a category (TODO confirm).
# NOTE(review): this relies on feature_cols still holding only the raw feature
# names; the training cell later rebinds it to include the one-hot columns, so
# this cell is not safe to re-run after that one.
ohex = ce.OneHotEncoder(handle_unknown='value', use_cat_names=True)
OH_features = pd.DataFrame(ohex.fit_transform(all_data[feature_cols].astype(str)))

display(OH_features.head())

# Append the one-hot columns next to the original columns
all_data = pd.concat([all_data, OH_features], axis=1)

# Split train and test set again by row position
train_df = all_data[:n_train_rows]
test_df = all_data[n_train_rows:].reset_index(drop=True)

Unnamed: 0,feature_0_0,feature_0_1,feature_0_9,feature_0_2,feature_0_3,feature_0_4,feature_0_7,feature_0_5,feature_0_6,feature_0_8,...,feature_49_12,feature_49_7,feature_49_13,feature_49_15,feature_49_14,feature_49_16,feature_49_18,feature_49_20,feature_49_19,feature_49_21
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Hyperparameters used for the CatBoost model in the next cell:
# 'learning_rate': 0.004492219404193344, 'max_depth': 9

In [14]:
# Every column whose name starts with 'feature_' is a model input. After the
# one-hot step this covers the raw features as well as their one-hot columns.
feature_cols = train_df.columns[train_df.columns.str.startswith('feature_')]

X = train_df[feature_cols]
y = train_df['target']

X_test = test_df[feature_cols]


N_SPLITS = 5
# Number of classes is taken from the data rather than hard-coded
N_CLASSES = y.nunique()

# Out-of-fold predictions for the train rows, and the running sum of the
# per-fold predictions for the test rows
y_oof_pred = np.zeros((len(X), N_CLASSES))
y_test_pred = np.zeros((len(X_test), N_CLASSES))

kf = StratifiedKFold(n_splits = N_SPLITS)
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    # Prepare training and validation data
    X_train = X.iloc[train_idx].reset_index(drop=True)
    X_val = X.iloc[val_idx].reset_index(drop=True)

    y_train = y.iloc[train_idx].reset_index(drop=True)
    y_val = y.iloc[val_idx].reset_index(drop=True)

    # NOTE(review): the validation fold doubles as the eval_set that selects
    # the best iteration (use_best_model=True), so the fold and OOF log losses
    # below are measured on data that influenced model selection.
    eval_set = [(X_val, y_val)]
    # Define model
    model = CatBoostClassifier(depth=9,
            max_ctr_complexity=15,
            iterations=10000,
            od_wait=1000, od_type='Iter',
            learning_rate= 0.004492219404193344,
            min_data_in_leaf=1,
            use_best_model=True,
            loss_function='MultiClass', random_state=42, task_type='GPU')
    model.fit(X_train, y_train, eval_set= eval_set, verbose=1000)

    # Calculate evaluation metric on this fold's validation rows
    y_val_pred = model.predict_proba(X_val)

    print(f"Fold {fold + 1} Log Loss: {log_loss(y_val, y_val_pred)}")

    # Store out-of-fold predictions and accumulate test predictions
    y_oof_pred[val_idx] = y_val_pred
    y_test_pred += model.predict_proba(X_test)


# Average the accumulated test predictions over the folds
y_test_pred = y_test_pred / N_SPLITS

# Calculate evaluation metric for out of fold validation set
print(f"Overall OOF Log Loss: {log_loss(y, y_oof_pred)}")

0:	learn: 1.3835643	test: 1.3835819	best: 1.3835819 (0)	total: 32ms	remaining: 5m 19s
1000:	learn: 1.0783736	test: 1.0999818	best: 1.0999818 (1000)	total: 25.5s	remaining: 3m 49s
2000:	learn: 1.0569836	test: 1.0953400	best: 1.0953400 (2000)	total: 50s	remaining: 3m 19s
3000:	learn: 1.0410080	test: 1.0937369	best: 1.0937369 (3000)	total: 1m 12s	remaining: 2m 49s
4000:	learn: 1.0273754	test: 1.0931018	best: 1.0931018 (4000)	total: 1m 36s	remaining: 2m 24s
5000:	learn: 1.0151645	test: 1.0928570	best: 1.0928548 (4998)	total: 1m 59s	remaining: 1m 59s
6000:	learn: 1.0034572	test: 1.0927972	best: 1.0927910 (5767)	total: 2m 20s	remaining: 1m 33s
bestTest = 1.092791016
bestIteration = 5767
Shrink model to first 5768 iterations.
Fold 1 Log Loss: 1.092791161953402
0:	learn: 1.3835685	test: 1.3835713	best: 1.3835713 (0)	total: 26.3ms	remaining: 4m 22s
1000:	learn: 1.0784086	test: 1.0997631	best: 1.0997631 (1000)	total: 25.1s	remaining: 3m 45s
2000:	learn: 1.0567739	test: 1.0951356	best: 1.0951342 

In [15]:
# Assemble the submission frame: 'id' first, then one probability column per
# class, write it to disk and preview the first rows.
submission_df = pd.DataFrame(
    y_test_pred,
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'],
)
submission_df.insert(0, 'id', test_df['id'])

submission_df.to_csv("submission.csv", index=False)
display(submission_df.head())

Unnamed: 0,id,Class_1,Class_2,Class_3,Class_4
0,100000,0.089003,0.60353,0.176758,0.13071
1,100001,0.090875,0.685775,0.141295,0.082054
2,100002,0.079931,0.628679,0.188429,0.102961
3,100003,0.079611,0.552953,0.269067,0.098369
4,100004,0.070766,0.62708,0.199684,0.10247
