In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, roc_auc_score

In [None]:
# Directory holding the competition CSV files.
dirname = '/kaggle/input/porto-seguro-safe-driver-prediction'
train_path = os.path.join(dirname, 'train.csv')
test_path = os.path.join(dirname, 'test.csv')

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Report the dimensions of both frames.
print(f'shape of train: {train.shape}')
print(f'shape of test: {test.shape}')

In [None]:
# Number of rows per target class in the training data.
train.target.value_counts()

In [None]:
# Bar chart of the target class counts. The column is passed through the
# explicit x=/data= keywords (as createCountPlot does) because seaborn no
# longer treats a lone positional Series as the x variable.
sns.countplot(x='target', data=train);

There is a huge class imbalance between the positive & negative target values.

In [None]:
# Group the column names by suffix: "_cat", "_bin", and everything else.
cat_cols = [name for name in train.columns if name.endswith('_cat')]
bin_cols = [name for name in train.columns if name.endswith('_bin')]
other_cols = [name for name in train.columns if not name.endswith(('_cat', '_bin'))]

Separating the columns into "categorical", "binary" & other types for better understanding.

In [None]:
# Report every categorical column that contains negative values, with the
# count and the share of training rows affected.
for c in cat_cols:
    neg_val = int((train[c] < 0).sum())
    if neg_val <= 0:
        continue
    print(f'Missing value with -1 inserted in categorical columns {c}: {neg_val} -- {(neg_val * 100)/len(train):.2f}%')

As mentioned in the data description, -1 was inserted wherever a value is missing, so -1 represents a null value.
Here we find the missing values in the categorical columns of the training set.

In [None]:
# Same report for the columns that are neither "_cat" nor "_bin".
for c in other_cols:
    neg_val = int((train[c] < 0).sum())
    if neg_val <= 0:
        continue
    print(f'Missing value with -1 inserted in numeric columns {c}: {neg_val} -- {(neg_val * 100)/len(train):.2f}%')

Missing values & their percentages in the columns other than categorical & binary.

In [None]:
# Exclude these two categorical columns from all further processing.
excluded_cat_cols = {'ps_car_03_cat', 'ps_car_05_cat'}
cat_cols = [name for name in cat_cols if name not in excluded_cat_cols]

Removing columns with ~50% & above of missing values

In [None]:
# Stack train on top of test. ignore_index=True gives the combined frame one
# fresh 0..n-1 index; without it both inputs keep their own 0-based index and
# every label would appear twice, which makes label-based alignment fragile.
df_full = pd.concat([train, test], axis=0, ignore_index=True)
# Keep only the retained columns, ordered categorical, other, binary.
df_full = df_full[cat_cols + other_cols + bin_cols]

Concatenating "train" & "test" set for doing cleanup and scaling process

In [None]:
# Columns in which negative values are treated as missing and imputed.
cat_miss_cols = ['ps_ind_02_cat','ps_ind_04_cat','ps_ind_05_cat',
                 'ps_car_01_cat','ps_car_02_cat','ps_car_07_cat','ps_car_09_cat']
oth_miss_cols = ['ps_car_11','ps_car_12','ps_car_14','ps_reg_03']

# Categorical columns: fill with the most frequent *observed* level. The
# statistic is taken over the non-negative rows only, so the sentinel itself
# can never be chosen as the fill value.
for col in cat_miss_cols:
    is_missing = df_full[col] < 0
    df_full.loc[is_missing, col] = df_full.loc[~is_missing, col].mode()[0]

# Numeric columns: fill with the mean of the observed values. Including the
# negative sentinels in the mean would pull the fill value downwards.
# Series.mask upcasts integer columns to float as needed, and assigning the
# raw array sidesteps any index alignment.
for col in oth_miss_cols:
    is_missing = df_full[col] < 0
    observed_mean = df_full.loc[~is_missing, col].mean()
    df_full[col] = df_full[col].mask(is_missing, observed_mean).to_numpy()

Imputing missing values in categorical columns with mode function.
Imputing missing values in numerical columns with mean function.

In [None]:
# Split the column names of df_full into families by the fragment they contain.
all_columns = list(df_full.columns)
ind_cols = [name for name in all_columns if '_ind' in name]
reg_cols = [name for name in all_columns if '_reg' in name]
car_cols = [name for name in all_columns if '_car' in name]
calc_cols = [name for name in all_columns if '_calc' in name]

# "_ind" family: categorical, binary, and the remainder.
ind_cat_cols = [name for name in ind_cols if '_cat' in name]
ind_bin_cols = [name for name in ind_cols if '_bin' in name]
ind_oth_cols = [name for name in ind_cols
                if name not in ind_cat_cols and name not in ind_bin_cols]

# "_reg" family: used as a single group.
reg_oth_cols = reg_cols

# "_car" family: categorical and the remainder.
car_cat_cols = [name for name in car_cols if '_cat' in name]
car_oth_cols = [name for name in car_cols if name not in car_cat_cols]

# "_calc" family: binary and the remainder.
calc_bin_cols = [name for name in calc_cols if '_bin' in name]
calc_oth_cols = [name for name in calc_cols if name not in calc_bin_cols]

def createSum(newcol, cols, df=None):
    """Add a column holding the row-wise sum of a group of columns.

    Parameters
    ----------
    newcol : str
        Name of the column to create (or overwrite).
    cols : iterable of str
        Names of the columns to add together. An empty iterable yields a
        column of zeros.
    df : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df_full``
        so existing two-argument calls keep working.

    Returns
    -------
    pandas.Series
        The newly written float column ``df[newcol]``.
    """
    if df is None:
        df = df_full
    # Accumulate on plain arrays: starts from float zeros (so the result is
    # always float) and does not depend on the frame's index labels.
    total = np.zeros(df.shape[0])
    for c in cols:
        total = total + df[c].to_numpy()
    df[newcol] = total
    return df[newcol]

# One aggregate feature per column group: (new column name, source columns).
sum_feature_specs = [
    ('ps_ind_all_cat_sum', ind_cat_cols),
    ('ps_ind_all_bin_sum', ind_bin_cols),
    ('ps_ind_all_oth_sum', ind_oth_cols),
    ('ps_reg_all_oth_sum', reg_oth_cols),
    ('ps_car_all_cat_sum', car_cat_cols),
    ('ps_car_all_oth_sum', car_oth_cols),
    ('ps_calc_all_bin_sum', calc_bin_cols),
    ('ps_calc_all_oth_sum', calc_oth_cols),
]
for sum_name, source_cols in sum_feature_specs:
    createSum(sum_name, source_cols)

# Show the last engineered column as the cell output.
df_full['ps_calc_all_oth_sum']

1. Further separating the columns based on name fragments such as "_ind", "_reg", "_car" & "_calc".
2. Within each of those groups we further separate the columns into "category", "binary" & "other".
3. For each resulting group we create a new column holding the sum of all the columns in that group.

In [None]:
# Names of the aggregate sum features.
new_add_cols = [
    'ps_ind_all_cat_sum', 'ps_ind_all_bin_sum', 'ps_ind_all_oth_sum', 'ps_reg_all_oth_sum',
    'ps_car_all_cat_sum', 'ps_car_all_oth_sum', 'ps_calc_all_bin_sum', 'ps_calc_all_oth_sum',
]
target = ['target']

# Final column layout: every feature group, then the sum features, then the label.
selected_columns = (
    ind_cat_cols
    + ind_bin_cols
    + ind_oth_cols
    + reg_oth_cols
    + car_cat_cols
    + car_oth_cols
    + calc_bin_cols
    + calc_oth_cols
    + new_add_cols
    + target
)
df_full_new = df_full[selected_columns]

Selecting all the group of columns with newly added columns.

In [None]:
# df_full was built as train stacked on top of test, so the first len(train)
# rows are the training rows. Deriving the boundary from the data avoids a
# hard-coded row count that silently breaks if the input changes.
n_train_rows = len(train)
df_train = df_full_new.iloc[:n_train_rows]
df_test = df_full_new.iloc[n_train_rows:]

# Shuffle with a fixed seed so the sampled majority-class rows are the same on
# every run.
df_train = df_train.sample(frac=1, random_state=42)

# Undersample the majority class (target == 0) down to the number of positive
# rows, computed from the data rather than hard-coded.
train_new_1 = df_train.loc[df_train['target'] == 1]
train_new_0 = df_train.loc[df_train['target'] == 0].iloc[:len(train_new_1)]
train_new = pd.concat([train_new_0, train_new_1])

# Shuffle again so the two classes are interleaved, and renumber the rows.
df_train = train_new.sample(frac=1, random_state=42).reset_index(drop=True)

1. Splitting the full concatenated dataset back into train & test.
2. Shuffling the training data using the dataframe "sample" method.
3. To handle the class imbalance, we manually undersample the majority class so that the positive & negative classes have equal counts.

In [None]:
# Class counts after undersampling.
df_train['target'].value_counts()

Now we have equal count of positive & negative class.

In [None]:
def createDensityPlot(df, cols):
    """Draw per-class kernel density plots for a list of columns.

    For every column in ``cols`` one subplot is drawn with two KDE curves: the
    rows where ``df['target'] == 0`` and the rows where ``df['target'] == 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``target`` column and every column in ``cols``.
    cols : sequence of str
        Columns to plot, laid out two per row.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_grid_cols = 2
    # Keep the 3x2 layout for up to six columns and add rows beyond that, so a
    # longer list no longer overflows the grid.
    n_grid_rows = max(3, -(-len(cols) // n_grid_cols))
    fig = plt.figure(figsize=(18, 14 * n_grid_rows / 3))
    for i, j in enumerate(cols):
        ax = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
        sns.kdeplot(df.loc[df['target'] == 0, j], label="Target==0", ax=ax)
        sns.kdeplot(df.loc[df['target'] == 1, j], label="Target==1", ax=ax)
        ax.set_xlabel(j)
        ax.set_ylabel('Density')
        # The labels only become visible once a legend is requested.
        ax.legend()
    plt.show()

In [None]:
# Columns to inspect, in batches of up to six per figure.
all_oth_cols1 = ['ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_ind_01']
all_oth_cols2 = ['ps_ind_03', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_calc_01']
all_oth_cols3 = ['ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07']
all_oth_cols4 = ['ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13']
all_oth_cols5 = ['ps_calc_14', 'ps_car_11_cat']

# One figure per batch, drawn from the balanced training frame.
for density_batch in (all_oth_cols1, all_oth_cols2, all_oth_cols3, all_oth_cols4, all_oth_cols5):
    createDensityPlot(df_train, density_batch)

In [None]:
# Using the density plots we can get an idea of whether the discrete & continuous columns show any pattern that helps separate the positive & negative classes.

In [None]:
# Columns picked after inspecting the density plots.
selected_oth_cols = [
    'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15',
    'ps_ind_01', 'ps_ind_03', 'ps_ind_15',
    'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
    'ps_car_11_cat',
]

From the density plots we choose the columns that show the most difference between the 2 classes.

In [None]:
def createCountPlot(df, cols):
    """Draw count plots of a list of columns, split by target class.

    Each column in ``cols`` gets one subplot showing the count of every value,
    with bars coloured by ``df['target']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``target`` column and every column in ``cols``.
    cols : sequence of str
        Columns to plot, laid out two per row.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_grid_cols = 2
    # Keep the 3x2 layout for up to six columns and add rows beyond that, so a
    # longer list no longer overflows the grid.
    n_grid_rows = max(3, -(-len(cols) // n_grid_cols))
    fig = plt.figure(figsize=(15, 10 * n_grid_rows / 3))
    for i, j in enumerate(cols):
        ax = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
        sns.countplot(x=j, data=df, hue='target', ax=ax)
    plt.show()

In [None]:
# Categorical columns to inspect, six per figure.
all_cat_col1 = [
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_04_cat',
    'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
]
all_cat_col2 = [
    'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat',
    'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
]

for cat_batch in (all_cat_col1, all_cat_col2):
    createCountPlot(df_train, cat_batch)

Here creating count plot on "categorical" columns for better understanding the class difference on these columns. 

In [None]:
# Binary columns to inspect, up to six per figure.
all_bin_cols1 = [
    'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
    'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin',
]
all_bin_cols2 = [
    'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin',
    'ps_ind_17_bin', 'ps_ind_18_bin',
]
all_bin_cols3 = [
    'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin',
    'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin',
]

for bin_batch in (all_bin_cols1, all_bin_cols2, all_bin_cols3):
    createCountPlot(df_train, bin_batch)

Here creating count plot on "binary" columns for understanding the class difference on these columns.

In [None]:
# Categorical columns picked from the count plots; these are one-hot encoded later.
selected_cat_cols = [
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_04_cat', 'ps_car_06_cat',
    'ps_car_09_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
]
# Binary columns picked from the count plots.
selected_bin_cols = [
    'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
    'ps_ind_09_bin', 'ps_ind_16_bin', 'ps_ind_17_bin',
]

Selected the columns that show a difference between the classes.

In [None]:
# One-hot encode the selected categorical columns, dropping each column's
# first level.
df_train_new = pd.get_dummies(data=df_train, columns=selected_cat_cols, drop_first=True)
df_test_new = pd.get_dummies(data=df_test, columns=selected_cat_cols, drop_first=True)

# The two frames are encoded independently, so a category level present in
# only one of them would leave them with different dummy columns, and selecting
# the training feature list from the test frame would fail. Align the test
# frame to the training layout: dummy columns missing from test become
# all-zero, and columns the training frame does not have are dropped.
df_test_new = df_test_new.reindex(columns=df_train_new.columns, fill_value=0)

Encoding the categorical columns as a standard procedure for training & test set.

In [None]:
# Pairwise correlation between all feature columns (label excluded).
correlation = df_train_new.drop(['target'], axis=1).corr()

# Keep only the strict upper triangle so each pair is inspected once.
# The builtin bool is used for the mask dtype: the np.bool alias no longer
# exists in current NumPy releases.
upper_mask = np.triu(np.ones(correlation.shape), k=1).astype(bool)
upper = correlation.where(upper_mask)

# A column is flagged when its absolute correlation with any earlier column
# exceeds 0.9.
to_drop = [col for col in upper.columns if (upper[col].abs() > 0.9).any()]
print(f'collinear columns count: {len(to_drop)}')

Checking collinearity between columns, using an absolute-correlation threshold of 0.9.

In [None]:
# Columns that are not fed to the scaler or the models.
excluded_cols = ['id', 'target', 'ps_reg_all_oth_sum', 'ps_car_all_cat_sum']

df_labels = df_train_new['target']
df_cols = [name for name in df_train_new.columns if name not in excluded_cols]

# Fit the scaler on the training features and standardise them.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_train_new[df_cols])

# Rebuild a labelled frame from the scaled array and re-attach the target.
df_new = pd.DataFrame(df_scaled, columns=df_cols)
df_new['target'] = df_labels.values

Scaling the training data as a standard step.

In [None]:
# Apply the scaler fitted on the training features to the same test columns.
test_features = df_test_new[df_cols]
test_new = scaler.transform(test_features)
df_new_test = pd.DataFrame(test_new, columns=df_cols)

Transforming the test set based on training data.

In [None]:
# Label vector and feature matrix for model fitting.
y = df_new['target']
X = df_new.drop(columns=['target'])

Separating the training data into 'X' & 'y'.

In [None]:
# Drop the large intermediates that are no longer needed and ask the garbage
# collector to reclaim them. gc is already imported in the notebook's first
# cell, so it is not imported again here.
del (
    df_full,
    df_scaled,
    df_new,
    df_train,
    train_new_0,
    train_new_1,
    train_new,
    test_new,
    train,
    test,
    df_train_new,
    df_test_new,
    correlation,
)
gc.collect()

In [None]:
# Recursive feature elimination with cross-validation: remove one feature per
# step and score each subset by ROC AUC over 5 stratified folds.
rfe_cv_splitter = StratifiedKFold(5)
log_reg = LogisticRegression()
rfecv = RFECV(
    estimator=log_reg,
    step=1,
    cv=rfe_cv_splitter,
    scoring='roc_auc',
)
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

Using sklearn's "Recursive feature elimination" to select the optimal set of useful columns.

In [None]:
# Plot number of features VS. cross-validation scores
# Newer scikit-learn exposes the per-subset mean score through cv_results_;
# grid_scores_ is only present on older releases, so fall back to it there.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
# RFECV above is scored with 'roc_auc', so label the axis accordingly.
plt.ylabel("Cross validation score (ROC AUC)")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

In [None]:
# Keep only the columns that recursive feature elimination marked as selected.
X = X.loc[:, rfecv.support_]

In [None]:
# 10-fold stratified cross-validation with a fixed shuffle seed.
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1001)

# NOTE(review): `silent` and the `verbose` / `early_stopping_rounds` keywords
# passed to fit() below belong to the older LightGBM scikit-learn API; newer
# releases expect callbacks instead. Confirm against the installed version.
lgb_cla = LGBMClassifier(nthread=4,
                         n_estimators=10000,
                         learning_rate=0.01,
                         num_leaves=34,
                         colsample_bytree=0.9497036,
                         subsample=0.8715623,
                         max_depth=8,
                         reg_alpha=0.041545473,
                         reg_lambda=0.0735294,
                         min_split_gain=0.0222415,
                         min_child_weight=39.3259775,
                         silent=-1,
                         verbose=-1, )

# Out-of-fold predictions for the training rows, averaged test predictions,
# and the per-fold feature importances.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(df_new_test.shape[0])
fold_importances = []

feats = X.columns

for n_fold, (train_idx, test_idx) in enumerate(folds.split(X, y)):
    train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
    test_x, test_y = X.iloc[test_idx], y.iloc[test_idx]

    # Everything below must run once per fold. Outside the loop only the last
    # fold would be trained, and sub_preds would hold a single model's
    # prediction divided by the number of folds.
    lgb_cla.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)],
                eval_metric= 'auc', verbose= 200, early_stopping_rounds= 500)

    # Score the held-out rows of this fold with the best iteration found.
    oof_preds[test_idx] = lgb_cla.predict_proba(test_x, num_iteration=lgb_cla.best_iteration_)[:, 1]
    # Each fold contributes 1/n_splits, so the total is the fold average.
    sub_preds += lgb_cla.predict_proba(df_new_test[feats], num_iteration=lgb_cla.best_iteration_)[:, 1] / folds.n_splits

    fold_importance_df = pd.DataFrame({
        "feature": feats,
        "importance": lgb_cla.feature_importances_,
        "fold": n_fold + 1,
    })
    fold_importances.append(fold_importance_df)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(test_y, oof_preds[test_idx])))

# Concatenate once at the end instead of growing a frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)
# Overall AUC of the out-of-fold predictions across all training rows.
print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))

In [None]:
# Reload the test file (the earlier frame was deleted), attach the averaged
# predictions, and write the id/target pairs as the submission file.
test_csv_path = os.path.join(dirname, 'test.csv')
test = pd.read_csv(test_csv_path)
test['target'] = sub_preds

submission_columns = ['id', 'target']
test[submission_columns].to_csv('submission_02.csv', index=False)