In [None]:
import numpy as np
import pandas as pd
import string 
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from category_encoders import TargetEncoder

import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.filterwarnings("ignore")

In [None]:
# Run-wide settings: number of cross-validation folds and the seed shared by
# the fold splitter and the model.
N_FOLDS = 10
SEED = 2021

# Competition inputs: training rows, test rows and the submission template.
df_train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
sub = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')

# Handle on the label column; later cells read it as `target`.
target = df_train['target']

In [None]:
def summary(df):
    """Build a one-row-per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``Name``, ``dtypes``,
        ``Missing`` (null count), ``Uniques`` (distinct non-null values),
        ``First Value`` and ``Second Value`` (entries of the first and second
        rows by position; NaN when ``df`` has fewer rows than that).
    """
    overview = pd.DataFrame({
        'Name': df.columns.tolist(),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    # Positional access on purpose: a label lookup such as df.loc[0] raises
    # KeyError whenever the index is not the default 0..n-1 range.
    for position, label in enumerate(['First Value', 'Second Value']):
        overview[label] = df.iloc[position].values if len(df) > position else np.nan
    return overview

# Tabulate dtype, missing count, unique count and the first two rows' values
# for every training column, then print the table as plain text.
summary_data=summary(df_train)
print(summary_data)

In [None]:
# Column groups by dtype: object columns are treated as categorical,
# float64 columns as numeric.
fet_cat_list = list(df_train.select_dtypes(include='object').columns)
fet_num_list = list(df_train.select_dtypes(include='float64').columns)

# Split the categoricals into those with exactly two distinct values and the rest.
fet_cat_binay, fet_cat_no_bin = [], []
for cat_name in fet_cat_list:
    if df_train[cat_name].nunique() == 2:
        fet_cat_binay.append(cat_name)
    else:
        fet_cat_no_bin.append(cat_name)

# Hand-picked columns that are encoded with the long (one- and two-letter)
# code map further down; every other categorical uses the short map.
fet_cat_long = ['cat5','cat7','cat8','cat10']
long_lookup = set(fet_cat_long)
fet_cat_short = [cat_name for cat_name in fet_cat_list if cat_name not in long_lookup]

In [None]:
# Scale numerical features: fit the scaler on train only and reuse the fitted
# statistics for test.
scaler = StandardScaler()
# Assign the transformed ndarray directly. Wrapping it in a new DataFrame
# would give it a fresh 0..n-1 index and make the assignment align on index
# labels, silently producing NaN if the target frame's index ever differs.
df_train[fet_num_list] = scaler.fit_transform(df_train[fet_num_list])
df_test[fet_num_list] = scaler.transform(df_test[fet_num_list])

In [None]:
# Ordinal encoding: each category label is replaced by its position in a code
# book ('A' -> 0 ... 'Z' -> 25, then 'AA' -> 26, 'AB' -> 27, ...).
scii_letters_list_s = list(string.ascii_uppercase)
scii_letters_list_l = scii_letters_list_s + [
    ''.join(pair) for pair in itertools.product(scii_letters_list_s, repeat=2)
]

map_ord_short = {label: code for code, label in enumerate(scii_letters_list_s)}
map_ord_long = {label: code for code, label in enumerate(scii_letters_list_l)}

# Long columns get the one-/two-letter code book, the rest the single-letter
# one; the same mapping is applied to train and test.
for columns, code_book in ((fet_cat_long, map_ord_long), (fet_cat_short, map_ord_short)):
    for cat in columns:
        for frame in (df_train, df_test):
            frame[cat] = frame[cat].replace(code_book).astype(float)


df_train[fet_cat_list].head(3)

In [None]:
# Check correlation after ordinal encoding: absolute pairwise correlations of
# every column whose name contains 'cat', plus the target.
temp_col_list = [name for name in df_train.columns if 'cat' in name] + ['target']
corr = df_train[temp_col_list].corr().abs()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Correlation of each column with the target, sorted ascending.
target_corr = corr['target'].sort_values().abs()
print(target_corr)

# Hand-written split of the categorical columns into two groups.
# NOTE(review): presumably read off the printout above; the cut-off used is
# not recorded anywhere in the notebook.
cat_hi_corr = [
    'cat16', 'cat15', 'cat18', 'cat14', 'cat2', 'cat11', 'cat0',
    'cat17', 'cat13', 'cat4', 'cat6', 'cat1', 'cat9',
]
cat_lo_corr = ['cat12', 'cat8', 'cat3', 'cat7', 'cat5', 'cat10']

In [None]:
#Cat Feautures relations cat13,cat14,cat15, cat16,cat17 and cat18 seems to have some relations.
df_train['fet_ext0'] = df_train['cat16']+df_train['cat17'] + df_train['cat13']   
df_train['fet_ext1'] = df_train['cat15']+df_train['cat18'] + df_train['cat14']   
df_train['fet_ext2']    = df_train['fet_ext0']-df_train['fet_ext1']


df_test['fet_ext0'] = df_test['cat16']+df_test['cat17'] + df_test['cat13']   
df_test['fet_ext1'] = df_test['cat15']+df_test['cat18'] + df_test['cat14']   
df_test['fet_ext2']    = df_test['fet_ext0']-df_test['fet_ext1']



import matplotlib.pyplot as plt
%matplotlib inline

temp_col_list=['cat15','cat16','cat17','cat18','fet_ext0','fet_ext1','fet_ext2']
#for c in df_ordinal.columns:
for c in temp_col_list:
    plt.title(c)
    x = pd.DataFrame()
    x['target'] = target
    x[c] = df_train[c]
    y = x.groupby(c).target.mean().reset_index(drop=False)

    plt.scatter(y[c],y.target)
    plt.show()

temp_col_list.append('target')
corr = df_train[temp_col_list].corr()
corr.style.background_gradient(cmap='coolwarm')
# 'RdBu_r' & 'BrBG' are other good diverging colormaps

In [None]:
# Relations among the remaining categorical features: for every pair of
# columns, add their difference to df_train as '<col1>_<col2>' and correlate
# the originals, the differences and the target.
already_combined = {'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18'}
temp_col_list = [col for col in fet_cat_list if col not in already_combined]

new_cols = []
for left, right in itertools.combinations(temp_col_list, 2):
    pair_name = f'{left}_{right}'
    df_train[pair_name] = df_train[left] - df_train[right]
    new_cols.append(pair_name)

new_cols.append('target')
corr = df_train[temp_col_list + new_cols].corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Ordinal features: both hand-picked correlation groups plus the extra
# combined feature fet_ext2.
ordinal_cols = cat_hi_corr + cat_lo_corr + ['fet_ext2']

# Scale ordinal features: fit on train only, reuse the fitted statistics for test.
scaler = StandardScaler()
# Assign the transformed ndarray directly. Wrapping it in a new DataFrame
# would give it a fresh 0..n-1 index and make the assignment align on index
# labels, silently producing NaN if the target frame's index ever differs.
df_train[ordinal_cols] = scaler.fit_transform(df_train[ordinal_cols])
df_test[ordinal_cols] = scaler.transform(df_test[ordinal_cols])

In [None]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score


# LightGBM settings for the binary target, evaluated with AUC.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0,
# which is not set here - confirm whether row subsampling was intended.
params = {
    'metric': 'auc',
    'is_unbalance': True,
    'objective': 'binary',
    'reg_alpha': 7,
    'reg_lambda': 7,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'learning_rate': 0.01,
    'max_depth': 12,
    'num_leaves': 72,
    'min_child_samples': 20,
    'random_state': SEED,
    'n_estimators': 10000,
}

# Model inputs: scaled ordinal columns followed by scaled numeric columns.
trainig_features = ordinal_cols + fet_num_list
X_full = df_train[trainig_features]
X_test = df_test[trainig_features]

# A single estimator instance is refit in every fold.
model = LGBMClassifier(**params)
oof = np.zeros(len(df_train))  # out-of-fold probabilities, filled fold by fold
preds = 0                      # running average of the test probabilities

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_full)):
    X_train, y_train = X_full.iloc[train_idx], target.iloc[train_idx]
    X_valid, y_valid = X_full.iloc[valid_idx], target.iloc[valid_idx]

    # NOTE(review): LightGBM 4.x dropped the `early_stopping_rounds` and
    # `verbose` keywords of fit() in favour of callbacks - check the installed
    # version before upgrading.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=200,
        verbose=1000,
    )
    oof[valid_idx] = model.predict_proba(X_valid)[:, 1]
    # Each fold contributes an equal share of the final test prediction.
    preds += model.predict_proba(X_test)[:, 1] / kf.n_splits



In [None]:
# ROC AUC of the out-of-fold probabilities against the true labels.
print(roc_auc_score(y_true=target, y_score=oof))

In [None]:
# Feature importances of `model` as it stands after the cross-validation loop
# (the same instance is refit in every fold, so these come from the last fit).
importance_pairs = sorted(zip(model.feature_importances_, trainig_features))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value','Feature'])
ranked_imp = feature_imp.sort_values(by="Value", ascending=False)

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=ranked_imp)
plt.title('LightGBM Features Importance')
plt.tight_layout()
plt.show()


In [None]:
# Put the fold-averaged test probabilities into the submission template and
# write it next to the notebook.
sub['target'] = preds
sub.to_csv(path_or_buf='submission.csv', index=False)