In [None]:
import numpy as np
import pandas as pd
import os, gc
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Enlarge the default matplotlib figure.
plt.rcParams.update({"figure.figsize": [10, 7]})

# Let pandas render wider frames: more columns and a wider output line.
for option_name, option_value in (('display.max_columns', 500), ('display.width', 1000)):
    pd.set_option(option_name, option_value)

 Following https://www.kaggle.com/returnofsputnik/may-tabular-coreys-eda, I hypothesize that every feature is actually a category, not a numeric feature. If that's the case, we have to take a few steps:
 1. Check if there are any categories in the test set that are not in the train set
 2. Run LightGBM, tune the categorical hyperparameters (e.g. cat_l2)
 3. Run SHAP, see which features are most important; this may be helpful for feature engineering inspiration
 4. Feature engineer and re-run; does it have any impact?
 5. Run SHAP, see if anything else interesting comes out
 6. Rerun blindly on the full set with the same hyperparameters
 7. Potentially pseudolabel and rerun

In [None]:
# Both competition files are read from the same Kaggle input directory.
input_dir = '/kaggle/input/tabular-playground-series-may-2021'
tr = pd.read_csv(f'{input_dir}/train.csv')
te = pd.read_csv(f'{input_dir}/test.csv')

# Before we begin, for my sanity, let's just convert the target column into integers. 
Right now it's "Class_1", "Class_2", "Class_3", "Class_4", so the column is an object. Let's convert this to a numeric column.

In [None]:
# Class label -> integer code, plus the inverse lookup (integer code -> class label).
target_map = {f'Class_{code + 1}': code for code in range(4)}
reverse_target_map = {code: label for label, code in target_map.items()}

# Swap the string labels in the train frame for their integer codes.
encoded_target = tr['target'].map(target_map)
tr['target'] = pd.to_numeric(encoded_target)

In [None]:
# Every train column whose name contains 'feature_' is treated as a model input.
feature_cols = list(filter(lambda name: 'feature_' in name, tr.columns))

In [None]:
# Give the test rows a sentinel target so they stay identifiable after stacking.
te['target'] = -1

# Stack train on top of test and renumber the rows from 0.
tr_te = (
    pd.concat([tr, te], axis=0, sort=True)
    .reset_index(drop=True)
)
# True for rows that came from train (target is not the -1 sentinel).
tr_te['is_train'] = tr_te['target'].ne(-1)

# Step 1: Any categories in test that are not in train? Any categories in train that are not in test?

In [None]:
# For every feature, report the values that occur in only one of train / test,
# and for each such value how many rows of the combined frame carry it.
for col in feature_cols:
    train_values, test_values = set(tr[col]), set(te[col])
    test_vals_not_in_train = test_values - train_values
    train_vals_not_in_test = train_values - test_values
    # The backslashes are escaped: a bare '\T' is an invalid escape sequence, which
    # Python warns about. The printed text is unchanged.
    print(col, 'Test\\Train:', test_vals_not_in_train, 'Train\\Test:', train_vals_not_in_test)
    # Same row-count report for both one-sided sets (test-only first, then train-only).
    for one_sided_vals in (test_vals_not_in_train, train_vals_not_in_test):
        for val in one_sided_vals:
            print(val, 'Value has number of rows:', tr_te.loc[tr_te[col]==val].shape[0])

# So we see that the maximum number of rows for a missing value is 11. In order to increase regularization and robustness, let's overwrite any values that have fewer than 20 samples with -99, just to represent the idea of "rare value".

In [None]:
# Collapse rare values: in each feature, any value carried by fewer than
# RARE_MIN_ROWS rows of the combined train+test frame is replaced by RARE_VALUE_CODE
# in a new '<feature>_modified' column. The original column is left untouched.
RARE_MIN_ROWS = 20     # Values with fewer rows than this count as "rare"
RARE_VALUE_CODE = -99  # Sentinel written in place of every rare value

for col in tqdm(feature_cols):
    # Row-aligned occurrence count of each row's own value, computed in one pass
    # instead of one full-column comparison per unique value.
    rows_per_value = tr_te[col].map(tr_te[col].value_counts())
    tr_te[col+'_modified'] = tr_te[col].copy()
    # Missing values get a NaN count; NaN < RARE_MIN_ROWS is False, so they are kept as-is.
    tr_te.loc[rows_per_value < RARE_MIN_ROWS, col+'_modified'] = RARE_VALUE_CODE

# Additionally, because I believe these to all be categories, let's Label Encode them.

In [None]:
# Replace each '<feature>_modified' column by its integer category codes.
for base_name in tqdm(feature_cols):
    encoded_name = f'{base_name}_modified'
    as_category = tr_te[encoded_name].astype('category')
    tr_te[encoded_name] = as_category.cat.codes

# Okay, now let's perform LightGBM.

In [None]:
# Five shuffled, stratified folds with a fixed random_state.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=2021,
)

In [None]:
# Rows with the -1 sentinel target are test rows; everything else is train.
is_test_row = tr_te['target'] == -1
tr = tr_te.loc[~is_test_row].copy()
te = tr_te.loc[is_test_row].copy()

In [None]:
# Model inputs: the 50 rare-bucketed, label-encoded feature columns.
target_col = 'target'
cols_to_use = ['feature_' + str(x) + '_modified' for x in range(50)]

In [None]:
# Label the fold numbers in the train set
tr['fold_number'] = -1 # Initialize fold number
for fold_number, (train_index, valid_index) in enumerate(skf.split(tr[cols_to_use], tr[target_col])):
    # skf.split yields *positional* row indices while .loc is label-based, so translate
    # positions to index labels first. This stays correct even if tr's index is not 0..n-1.
    tr.loc[tr.index[valid_index], 'fold_number'] = fold_number

# Every train row must have landed in exactly one validation fold.
assert (tr['fold_number'] >= 0).all(), 'some train rows were not assigned to a fold'

In [None]:
# Sanity check: number of train rows that ended up in each fold.
fold_sizes = tr['fold_number'].value_counts()
fold_sizes

In [None]:
# Target class counts inside each fold.
target_by_fold = tr.groupby('fold_number')['target']
target_by_fold.value_counts()

# Loop through all 5 folds

In [None]:
# Train one LightGBM model per CV fold and collect:
#   * out-of-fold (OOF) class probabilities on the train rows,
#   * fold-averaged class probabilities on the test rows,
#   * fold-averaged gain importance per feature (in `gain_imps`).
fold_ids = [0, 1, 2, 3, 4]
n_folds = len(fold_ids)
categorical_idx = list(range(50)) # Positions (within cols_to_use) of the categorical features

# Start the prediction columns as floats: they receive float probabilities through
# .loc below, and newer pandas deprecates silently upcasting an integer column.
for class_num in range(4):
    tr[f'oof_predictions_class{class_num}'] = 0.0
    te[f'predictions_class{class_num}'] = 0.0

gain_imps = {col: 0.0 for col in cols_to_use} # Initialize to 0

# The `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train in LightGBM 4.0, so early stopping and logging are passed as callbacks.
# The logging callback is `log_evaluation` in LightGBM >= 3.3 and `print_evaluation` before.
make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for fold_number in fold_ids:
    train_mask = tr['fold_number'] != fold_number
    valid_mask = tr['fold_number'] == fold_number
    
    # Create the LightGBM Datasets
    dtrain = lgb.Dataset(data = tr.loc[train_mask, cols_to_use], 
                         label = tr.loc[train_mask, target_col],
                         categorical_feature = categorical_idx)

    dvalid = lgb.Dataset(data = tr.loc[valid_mask, cols_to_use], 
                         label = tr.loc[valid_mask, target_col],
                         categorical_feature = categorical_idx)

    np.random.seed(fold_number)
    # Define parameters
    # Note a different seed for each fold so the models can be different 
    # (Model Diversity = more robust predictions)
    params = {
        'objective': 'multiclass',
        'num_class': 4, # Only used in multiclass
        'metric': ['multi_logloss','multi_error'], 
        'boosting_type': 'gbdt',
        'num_leaves': 31, 
        'max_depth': 11, 
        'learning_rate': 0.05, 
        'feature_fraction': 0.5, 
        'seed': 2021 + fold_number,
        'cat_l2': 10,
        'cat_smooth': 10,
        'verbose':-1
    }
    
    print(f'=========== Fold {fold_number} ===========')
    # Train the model: at most 1000 rounds, stop after 100 rounds without improvement
    # on the validation set, log the metrics every 50 rounds.
    model = lgb.train(
        params=params,
        train_set = dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        categorical_feature=categorical_idx,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            make_log_callback(period=50),
        ],
    )
    
    
    # Make your predictions
    oof_preds = model.predict(tr.loc[valid_mask, cols_to_use])
    test_preds = model.predict(te[cols_to_use])
    for class_num in range(4):
        # OOF: each train row is predicted once, by the model that did not train on it
        tr.loc[valid_mask, f'oof_predictions_class{class_num}'] = oof_preds[:,class_num]
        # Test: accumulate here, divide by the number of folds after the loop
        te[f'predictions_class{class_num}'] += test_preds[:,class_num]
        
    
    # If it's the first fold, make SHAP predictions so we can explain model
    if fold_number == 0:
        print(f'=========== Performing SHAP Explainer ===========')
        # Commented this out because it takes too long...
        # Maybe SHAP is slow with categorical values?
        # shap_values = shap.Explainer(model).shap_values(tr[cols_to_use])
        
    # Save out the gain importances instead
    for col, imp in zip(cols_to_use, model.feature_importance(importance_type='gain')):
        gain_imps[col] += imp
    
# Turn the accumulated sums into per-fold averages
for class_num in range(4):
    te[f'predictions_class{class_num}'] /= n_folds
    
for col in cols_to_use:
    gain_imps[col] /= n_folds

In [None]:
# Tabulate the averaged gain importances, most important feature first.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html
importance_table = pd.DataFrame.from_dict(gain_imps, orient='index').reset_index(drop=False)
importance_table.columns = ['feat','imp']
importance_table = importance_table.sort_values('imp', ascending=False)

# Number of distinct values each feature takes in the combined train+test frame.
importance_table['nunique'] = [tr_te[feat_name].nunique() for feat_name in importance_table['feat']]

gain_imps_df = importance_table
gain_imps_df

# Let's look at the log loss?

In [None]:
from sklearn.metrics import log_loss

In [None]:
# Multiclass log loss of the out-of-fold probabilities against the true targets.
oof_cols = ['oof_predictions_class' + str(class_num) for class_num in range(4)]
log_loss(y_true=tr['target'], y_pred=tr[oof_cols])

# Can we improve this at all?
I remember in one competition that taking the counts of each feature ended up improving the cross-validation. Maybe we can do a similar thing?

In [None]:
# For each raw feature, add '<feature>_count': the share of train+test rows that
# carry the same value as the row (a normalized frequency, despite the column name).
for feature_idx in tqdm(range(50)):
    raw_col = 'feature_' + str(feature_idx)
    value_shares = tr_te[raw_col].value_counts(normalize=True)
    share_lookup = value_shares.to_dict()
    tr_te[raw_col + '_count'] = tr_te[raw_col].map(share_lookup)

In [None]:
# Row-wise summary statistics over the 50 per-feature frequency columns.
count_cols = ['feature_' + str(col_num) + '_count' for col_num in range(50)]
count_block = tr_te[count_cols]
row_reducers = {
    'avg_count': count_block.mean,
    'min_count': count_block.min,
    'max_count': count_block.max,
    'std_count': count_block.std,
}
for stat_col, reducer in row_reducers.items():
    tr_te[stat_col] = reducer(axis=1)

# Spread between the most and least frequent value in the row.
tr_te['max_minus_min_count'] = tr_te['max_count'] - tr_te['min_count']

In [None]:
# Print, for every raw feature, the value that occurs most often in train+test.
for feature_idx in tqdm(range(50)):
    raw_col = 'feature_%d' % feature_idx
    sorted_counts = tr_te[raw_col].value_counts()
    most_common_value = sorted_counts.index[0]
    print(raw_col, most_common_value)

In [None]:
# Flag, per raw feature, the rows whose value differs from 0
# (0 is taken to be the most common value of every feature).
for feature_idx in tqdm(range(50)):
    raw_col = 'feature_' + str(feature_idx)
    differs_from_zero = tr_te[raw_col].ne(0)
    tr_te[raw_col + '_uncommon'] = pd.to_numeric(differs_from_zero)

In [None]:
# Share of the 50 features on which the row carries a non-zero value.
uncommon_cols = ['feature_' + str(col_num) + '_uncommon' for col_num in range(50)]
uncommon_block = tr_te[uncommon_cols]
tr_te['avg_uncommonness'] = uncommon_block.mean(axis=1)

# Aight, let's split back out and retry

In [None]:
# Re-split the combined frame so train and test both carry the engineered columns.
is_test_row = tr_te['target'] == -1
tr = tr_te.loc[~is_test_row].copy()
te = tr_te.loc[is_test_row].copy()

In [None]:
# Display the combined train+test frame, including the engineered columns added above.
tr_te

In [None]:
# Model inputs for the second run: the 50 encoded features first, then the
# frequency features, their row-wise statistics and the "uncommon" flags.
count_stat_cols = ['avg_count','min_count','max_count','std_count','max_minus_min_count']
cols_to_use = (
    ['feature_' + str(x) + '_modified' for x in range(50)]
    + count_cols
    + count_stat_cols
    + uncommon_cols
    + ['avg_uncommonness']
)
target_col = 'target'

In [None]:
# Label the fold numbers in the train set
tr['fold_number'] = -1 # Initialize fold number
for fold_number, (train_index, valid_index) in enumerate(skf.split(tr[cols_to_use], tr[target_col])):
    # skf.split yields *positional* row indices while .loc is label-based, so translate
    # positions to index labels first. This stays correct even if tr's index is not 0..n-1.
    tr.loc[tr.index[valid_index], 'fold_number'] = fold_number

# Every train row must have landed in exactly one validation fold.
assert (tr['fold_number'] >= 0).all(), 'some train rows were not assigned to a fold'

In [None]:
# Target class counts inside each fold.
target_by_fold = tr.groupby('fold_number')['target']
target_by_fold.value_counts()

In [None]:
# Train one LightGBM model per CV fold and collect:
#   * out-of-fold (OOF) class probabilities on the train rows,
#   * fold-averaged class probabilities on the test rows,
#   * fold-averaged gain importance per feature (in `gain_imps`).
fold_ids = [0, 1, 2, 3, 4]
n_folds = len(fold_ids)
categorical_idx = list(range(50)) # Positions (within cols_to_use) of the categorical features

# Start the prediction columns as floats: they receive float probabilities through
# .loc below, and newer pandas deprecates silently upcasting an integer column.
for class_num in range(4):
    tr[f'oof_predictions_class{class_num}'] = 0.0
    te[f'predictions_class{class_num}'] = 0.0

gain_imps = {col: 0.0 for col in cols_to_use} # Initialize to 0

# The `early_stopping_rounds` / `verbose_eval` keyword arguments were removed from
# lgb.train in LightGBM 4.0, so early stopping and logging are passed as callbacks.
# The logging callback is `log_evaluation` in LightGBM >= 3.3 and `print_evaluation` before.
make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for fold_number in fold_ids:
    train_mask = tr['fold_number'] != fold_number
    valid_mask = tr['fold_number'] == fold_number
    
    # Create the LightGBM Datasets
    dtrain = lgb.Dataset(data = tr.loc[train_mask, cols_to_use], 
                         label = tr.loc[train_mask, target_col],
                         categorical_feature = categorical_idx)

    dvalid = lgb.Dataset(data = tr.loc[valid_mask, cols_to_use], 
                         label = tr.loc[valid_mask, target_col],
                         categorical_feature = categorical_idx)

    np.random.seed(fold_number)
    # Define parameters
    # Note a different seed for each fold so the models can be different 
    # (Model Diversity = more robust predictions)
    params = {
        'objective': 'multiclass',
        'num_class': 4, # Only used in multiclass
        'metric': ['multi_logloss','multi_error'], 
        'boosting_type': 'gbdt',
        'num_leaves': 31, 
        'max_depth': 11, 
        'learning_rate': 0.05, 
        'feature_fraction': 0.5, 
        'seed': 2021 + fold_number,
        'cat_l2': 10,
        'cat_smooth': 10,
        'verbose':-1
    }
    
    print(f'=========== Fold {fold_number} ===========')
    # Train the model: at most 1000 rounds, stop after 100 rounds without improvement
    # on the validation set, log the metrics every 50 rounds.
    model = lgb.train(
        params=params,
        train_set = dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        categorical_feature=categorical_idx,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            make_log_callback(period=50),
        ],
    )
    
    
    # Make your predictions
    oof_preds = model.predict(tr.loc[valid_mask, cols_to_use])
    test_preds = model.predict(te[cols_to_use])
    for class_num in range(4):
        # OOF: each train row is predicted once, by the model that did not train on it
        tr.loc[valid_mask, f'oof_predictions_class{class_num}'] = oof_preds[:,class_num]
        # Test: accumulate here, divide by the number of folds after the loop
        te[f'predictions_class{class_num}'] += test_preds[:,class_num]
        
    
    # If it's the first fold, make SHAP predictions so we can explain model
    if fold_number == 0:
        print(f'=========== Performing SHAP Explainer ===========')
        # Commented this out because it takes too long...
        # Maybe SHAP is slow with categorical values?
        # shap_values = shap.Explainer(model).shap_values(tr[cols_to_use])
        
    # Save out the gain importances instead
    for col, imp in zip(cols_to_use, model.feature_importance(importance_type='gain')):
        gain_imps[col] += imp
    
# Turn the accumulated sums into per-fold averages
for class_num in range(4):
    te[f'predictions_class{class_num}'] /= n_folds
    
for col in cols_to_use:
    gain_imps[col] /= n_folds

# And study the gain importances:

In [None]:
# Tabulate the averaged gain importances, most important feature first.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html
importance_table = pd.DataFrame.from_dict(gain_imps, orient='index').reset_index(drop=False)
importance_table.columns = ['feat','imp']
importance_table = importance_table.sort_values('imp', ascending=False)

# Number of distinct values each feature takes in the combined train+test frame.
importance_table['nunique'] = [tr_te[feat_name].nunique() for feat_name in importance_table['feat']]

gain_imps_df = importance_table
gain_imps_df.head(50)

In [None]:
# The 50 rows at the bottom of the importance-sorted table, i.e. the least important features.
gain_imps_df.iloc[-50:]

# Get our new log loss:

In [None]:
# Multiclass log loss of the out-of-fold probabilities against the true targets.
oof_cols = ['oof_predictions_class' + str(class_num) for class_num in range(4)]
log_loss(y_true=tr['target'], y_pred=tr[oof_cols])

# Save out predictions

In [None]:
# Build the submission: the id plus one probability column per class, renamed from
# predictions_class0..3 to Class_1..4, then written to submission.csv.
submission_names = {f'predictions_class{k}': f'Class_{k + 1}' for k in range(4)}
saveout = te[['id', *submission_names]].rename(columns=submission_names)
saveout.to_csv('submission.csv', index=False)

In [None]:
# View predictions: the frame that was written to submission.csv
# (id plus the Class_1..Class_4 probability columns).
saveout