In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import dask.dataframe as dd
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import cuml
import cudf  # looks and feels like Pandas, but runs on the GPU
from cuml.preprocessing.TargetEncoder import TargetEncoder
from cuml.preprocessing.model_selection import train_test_split
from cuml.metrics import accuracy_score,roc_auc_score

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt

In [None]:
# Both competition CSVs sit in the same Kaggle input folder.
train_dir, test_dir = (
    '/kaggle/input/tabular-playground-series-mar-2021/' + file_name
    for file_name in ('train.csv', 'test.csv')
)

In [None]:
%%time
# Read the training CSV through dask and materialise it with .compute();
# the %%time magic above reports how long the cell takes.
df_dd = dd.read_csv(train_dir).compute()

In [None]:
%%time
# Read the same training CSV through cudf; the %%time magic above reports
# how long the cell takes.
df_cudf = cudf.read_csv(train_dir)

In [None]:
# Read both splits with pandas. `id` is removed from both frames; the test
# ids are kept in `ids` for the submission files.
train_df = pd.read_csv(train_dir).drop(columns='id')
test_df = pd.read_csv(test_dir)
ids = test_df.pop('id')

In [None]:
# Peek at the first rows of the training frame.
train_df.head()

In [None]:
# Summary statistics of the training frame.
train_df.describe()

# EDA

In [None]:
# Tally how many training rows fall into each target class.
dct = {}
for trgt in train_df['target']:
    dct[trgt] = dct.get(trgt, 0) + 1

In [None]:
# Donut chart of the class balance of `target`.
plt.figure(figsize = (7,7))

# Slices are taken in sorted class order and every label is derived from the
# same key as its slice, so a label can never end up on the other class.
# The percentage uses the actual row count instead of a hard-coded divisor.
n_total = sum(dct.values())
classes = sorted(dct)
plt.pie([dct[c] for c in classes],
        labels = [f'{c} : {100 * dct[c] / n_total:.2f} %' for c in classes])

# A white disc over the centre turns the pie into a donut.
my_circle = plt.Circle( (0,0), 0.7, color='white')
fig = plt.gcf()
fig.gca().add_artist(my_circle)
plt.show()

In [None]:
# Feature names follow a fixed scheme: cat0..cat18 and cont0..cont10.
categorical_columns = ['cat' + str(i) for i in range(19)]
numerical_columns = ['cont' + str(i) for i in range(11)]


In [None]:
# Per-class density of every continuous feature, one panel per feature.
num_rows, num_cols = 6,2
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(16, 24))
f.suptitle('Distribution of Features', fontsize=26)

# (target value, line colour) pairs drawn on top of each other in each panel.
class_colors = ((0, "m"), (1, "b"))
for index, column in enumerate(train_df[numerical_columns].columns):
    i, j = divmod(index, num_cols)
    for target_value, color in class_colors:
        sns.kdeplot(train_df.loc[train_df['target'] == target_value, column],
                    color = color, shade = True, ax = axes[i,j])

# Remove the bottom-right panel of the grid.
f.delaxes(axes[5, 1])
plt.tight_layout()
plt.show()

In [None]:
# Lower-triangle heatmap of the pairwise feature/target correlations.
plt.figure(figsize = (30,35))
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps the string-valued columns out of `corr()` instead of relying on
# pandas to drop them implicitly (newer pandas raises on them).
corr = train_df.select_dtypes(include='number').corr()
# Hide the upper triangle and the diagonal, which only mirror the lower half.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask = mask, cmap = 'spring', vmax = .3, center = 0,
            square = True, linewidths = .5, annot=True, annot_kws={"fontsize":18})
plt.show()

In [None]:
 # Class-wise level counts for every categorical feature, one figure each.
for column in categorical_columns:
    # Keep at most the 50 most frequent levels so that high-cardinality
    # features stay readable.
    top_levels = list(train_df[column].value_counts().index[:50])

    plt.figure(figsize = (18, 8))

    # A single grouped chart with `hue` puts both classes on one explicitly
    # ordered category axis. Two independent bar plots would each use their
    # own level order, so bars and tick labels would not line up.
    sns.countplot(data = train_df, x = column, hue = 'target',
                  order = top_levels, palette = 'spring')

    plt.title(column, fontsize=15)
plt.show()

# Feature Engineering

In [None]:
from copy import deepcopy

# For every categorical feature with many levels, list the levels that occur
# in only one of the two target classes.
df_0 = train_df.loc[train_df['target'] == 0]
df_1 = train_df.loc[train_df['target'] == 1]

for column in categorical_columns:
    # Only the set of observed (non-null) levels matters here, so take the
    # distinct values directly; no counting, sorting or copying is needed.
    target_0_values = set(df_0[column].dropna().unique())
    # Features with fewer than 10 distinct levels in class 0 are skipped.
    if len(target_0_values) < 10:
        continue

    target_1_values = set(df_1[column].dropna().unique())

    print('-------------------   {}   ---------------------'.format(column))
    # Sorted so the printed lists are stable from run to run.
    print('Unique values for class 0: {}'.format(sorted(target_0_values - target_1_values)))
    print('Unique values for class 1: {}'.format(sorted(target_1_values - target_0_values)))


In [None]:
# Levels to fold into one representative level, per feature:
#   feature -> (level to keep, levels that are replaced by it)
# The table is defined once and applied to both frames, so the train and test
# treatment cannot drift apart.
rare_level_merges = {
    # Fix cat5 variable
    'cat5': ('B', ['AG', 'CB', 'BP', 'ZZ', 'BM', 'BX', 'AK', 'B']),
    # Fix cat8 variable
    'cat8': ('P', ['AC', 'P']),
    # Fix cat10 variable
    'cat10': ('MI', ['HF', 'KK', 'GD', 'JE', 'KD', 'MA', 'BA', 'DT',
                     'LK', 'GR', 'KU', 'MW', 'LR', 'ME', 'CN', 'JF',
                     'DA', 'JC', 'IU', 'GV', 'ED', 'EB', 'IL', 'EF',
                     'BD', 'GG', 'CM', 'CH', 'EG', 'FA', 'KN', 'IM',
                     'DU', 'IN', 'HI', 'DX', 'IP', 'DM', 'CF', 'MO',
                     'DL', 'KI', 'FW', 'GH', 'MP', 'MR', 'BO', 'IY',
                     'CQ', 'GF', 'AF', 'CX', 'MQ', 'GJ', 'FF', 'LT',
                     'AJ', 'IQ', 'HY', 'LH', 'DN', 'MK', 'GY', 'BS',
                     'DK', 'AW', 'JU', 'BX', 'CT', 'EH', 'ML', 'EN',
                     'MU', 'MI']),
}

for frame in (train_df, test_df):
    for column, (merged_level, rare_levels) in rare_level_merges.items():
        # Values that are not in `rare_levels` (missing values included) are
        # left untouched.
        frame[column] = frame[column].mask(frame[column].isin(rare_levels), merged_level)

I used **Target encoding** which works by averaging the target value by category.

* Target encoding is a fast way to get the most out of your categorical variables with little effort. The idea is quite simple. Say you have a categorical variable $X$ and a target $y$ – $y$ can be binary or continuous, it doesn't matter. For each distinct value in $X$, you compute the average of the corresponding values in $y$. Then you replace each $X_i$ with the corresponding mean.

Here, the **stratify** parameter makes a split so that the proportion of each value in the resulting sample is the same as its proportion in the data passed to **stratify**.

For example, if y is a binary categorical variable with values 0 and 1, made up of 25% zeros and 75% ones, then stratify=y makes sure that your random split also has 25% zeros and 75% ones.

In [None]:
# Target-encode every string-valued feature.
encoder = TargetEncoder()

# Move the features, the labels and the test frame into cudf containers.
X = cudf.DataFrame(train_df.drop(columns=["target"]))
y = cudf.Series(train_df["target"])
test_df = cudf.DataFrame(test_df)

object_columns = [name for name in train_df.columns if train_df[name].dtype == 'object']
for col in object_columns:
    print(col)
    # Fit on the training column, then reuse that fit for the test column
    # before the encoder is refitted on the next feature.
    X[col] = encoder.fit_transform(X[col], y)
    test_df[col] = encoder.transform(test_df[col])

# Modelling

In [None]:
from lightgbm import LGBMClassifier
import xgboost
import lightgbm
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# LGBM

In [None]:
# Number of cross-validation folds and the stratified splitter built from it.
k_fold = 8
skf = StratifiedKFold(n_splits=k_fold)

In [None]:
# `objective` is set explicitly: `lightgbm.train` does not infer a binary
# objective from the labels, and the fold predictions below are averaged as
# class probabilities.
params = {'objective': 'binary',
 'learning_rate': 0.05,
 'metric': 'auc',
 'num_leaves': 708,
 'max_depth': 31,
 'reg_alpha': 11.308,
 'reg_lambda': 15.091,
 'colsample_bytree': 0.233,
 'force_col_wise': True,
 'cat_smooth': 39.657}

# Upper bound on boosting rounds, passed once as the `train` argument so it
# cannot conflict with a second value inside `params`; early stopping is
# expected to end training sooner.
num_boost_round = 10000

# The booster predicts from raw feature data, not from a `lightgbm.Dataset`,
# so the test frame is converted once outside the fold loop.
test_features = test_df.to_pandas()
# Fold-averaged test predictions of the cross-validated boosters.
preds_lgbm_cv = np.zeros(len(test_features))

for i, (train_index, test_index) in enumerate(skf.split(X, y.to_array())):
    print('[Fold %d/%d]' % (i + 1, k_fold))
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into LGBM format
    d_train = lightgbm.Dataset(X_train.to_pandas(), label = y_train.to_pandas())
    d_valid = lightgbm.Dataset(X_valid.to_pandas(), label = y_valid.to_pandas())

    mdl = lightgbm.train(params, d_train, num_boost_round, valid_sets = [d_train, d_valid],
                         early_stopping_rounds=300, verbose_eval=500)

    # Predicting... each fold contributes 1/k_fold of the final average.
    preds_lgbm_cv += mdl.predict(test_features) / k_fold

In [None]:
# LGBMClassifier with default hyper-parameters, fitted and scored on the
# X_train / X_valid split that is currently in scope.
lgbm = LGBMClassifier()

valid_features = X_valid.as_matrix()
lgbm.fit(X_train.as_matrix(),
         y_train.to_array(),
         eval_set = (valid_features, y_valid.to_array()),
         verbose = True)
# Keep column 1 of the predicted probabilities (presumably the positive class).
predictions = lgbm.predict_proba(valid_features)[:,1]

auc = roc_auc_score(y_valid, predictions)

print(f'LGBM Score: {auc}')

In [None]:
# Score the test frame with the fitted classifier, keeping column 1 of the
# predicted probabilities.
preds_lgbm = lgbm.predict_proba(test_df.as_matrix())[:,1]

In [None]:
# Display the test predictions.
preds_lgbm

In [None]:
# Pair every test id with its LGBM prediction and write the submission file.
df_sub = dict(id=ids, target=preds_lgbm)
df_predictions = cudf.DataFrame(df_sub).set_index(['id'])

df_predictions.to_csv('/kaggle/working/predictions_lgbm.csv')

# XGBoost

In [None]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the un-normalised Gini score of `pred` with respect to `actual`.

    The rows are ordered by descending prediction (ties keep their original
    order); the score is derived from the cumulative sum of `actual` in that
    order.

    Parameters
    ----------
    actual : sequence of numbers
        Observed target values.
    pred : sequence of numbers
        Predicted scores, same length as `actual`.
    cmpcol, sortcol : int
        Unused; kept so existing callers keep working.

    Returns
    -------
    float
        The Gini score. The computation divides by the sum of `actual`, so
        the result is not finite when that sum is zero.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert( len(actual) == len(pred) )
    # Columns: 0 = actual, 1 = prediction, 2 = original position (tie-breaker).
    # The builtin `float` is used as dtype; the `np.float` alias is not
    # available in current NumPy releases.
    table = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction first,
    # original position second.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    totalLosses = table[:,0].sum()
    giniSum = table[:,0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(actual, pred):
    """Return `gini(actual, pred)` divided by `gini(actual, actual)`.

    The denominator is the score obtained when the targets themselves are
    used as predictions.
    """
    model_score = gini(actual, pred)
    reference_score = gini(actual, actual)
    return model_score / reference_score

def gini_xgb(preds, dtrain):
    """Evaluation callback returning ('gini', normalised Gini score).

    `dtrain` must provide `get_label()`; the labels it returns are scored
    against `preds` with `gini_normalized`.
    """
    return 'gini', gini_normalized(dtrain.get_label(), preds)

In [None]:
# Number of cross-validation folds and the stratified splitter built from it.
k_fold = 8
skf = StratifiedKFold(n_splits=k_fold)

In [None]:
# Submission frame: one row per test id, with a `target` column that the fold
# loop adds predictions to. The accumulator is created as float zeros;
# `np.zeros_like(ids)` would take over the dtype of the id column instead.
sub = pd.DataFrame({'id': ids, 'target': np.zeros(len(ids), dtype=float)})

In [None]:
params =   {'objective': 'binary:logistic',
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'eval_metric': 'auc',
        'random_state': 1,
        'max_depth': 12,
        'learning_rate': 0.03,
        'min_child_weight': 20,
        'gamma': 0.1,
        'alpha': 0.2,
        'lambda': 9,
        'colsample_bytree': 0.2,
        'subsample': 0.8}

# The test matrix is the same for every fold, so it is built once.
d_test = xgboost.DMatrix(test_df)
# Start the accumulator from zero so that re-running this cell does not add a
# second set of fold predictions on top of an earlier run.
sub['target'] = 0.0

for i, (train_index, test_index) in enumerate(skf.split(X, y.to_array())):
    print('[Fold %d/%d]' % (i + 1, k_fold))
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgboost.DMatrix(X_train, y_train)
    d_valid = xgboost.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 1,600 rounds (with early stopping after 70)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgboost.train(params, d_train, 1600, watchlist, early_stopping_rounds=70, feval=gini_xgb, maximize=True, verbose_eval=500)

    # Predicting... each fold contributes 1/k_fold of the final average.
    # NOTE(review): `ntree_limit` / `best_ntree_limit` are tied to the
    # installed xgboost version - confirm before upgrading the library.
    p_test = mdl.predict(d_test, ntree_limit=mdl.best_ntree_limit)
    sub['target'] += p_test/k_fold

In [None]:
# Pair every test id with its accumulated XGBoost prediction and write the
# submission file.
target_ = sub['target']
df_sub = dict(id=ids, target=target_)
df_predictions = cudf.DataFrame(df_sub).set_index(['id'])

df_predictions.to_csv('/kaggle/working/predictions_xgb_.csv')