In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, roc_auc_score

In [None]:
# Directory that holds the competition CSV files.
dirname = '/kaggle/input/porto-seguro-safe-driver-prediction'
train_path = os.path.join(dirname, 'train.csv')
test_path = os.path.join(dirname, 'test.csv')

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Report the (rows, columns) of both frames.
for label, frame in (('train', train), ('test', test)):
    print(f'shape of {label}: {frame.shape}')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics per column, transposed so that each column becomes a row.
train.describe().T

In [None]:
# Bar chart of the number of rows per target class.
# The column is named via keywords: recent seaborn releases accept only
# `data` positionally, so passing the Series as the first argument no longer
# means "x".
sns.countplot(x='target', data=train)

In [None]:
# Number of rows per target class.
train['target'].value_counts()

In [None]:
# Group the training columns by their name suffix.
cat_cols = [name for name in train.columns if name.endswith('_cat')]
bin_cols = [name for name in train.columns if name.endswith('_bin')]

# Every column that carries neither the '_cat' nor the '_bin' suffix.
suffixed_cols = set(cat_cols) | set(bin_cols)
other_cols = [name for name in train.columns if name not in suffixed_cols]

In [None]:
# Stack train on top of test so both receive identical preprocessing.
# ignore_index=True gives the combined frame a fresh unique index; without it
# the row labels of train and test repeat, which makes label-based access on
# df_full ambiguous.
df_full = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# For each categorical column, count the training rows holding a negative
# value and print the count together with its share of all training rows.
n_rows = len(train)
for col in cat_cols:
    n_negative = int((train[col] < 0).sum())
    if n_negative == 0:
        continue
    print(f'Missing value with -1 inserted in categorical columns {col}: {n_negative} -- {(n_negative * 100)/n_rows:.2f}%')

In [None]:
# Same report for the non-suffixed columns: count negative entries per column
# first, then print only the columns that have at least one.
negative_counts = {col: int((train[col] < 0).sum()) for col in other_cols}
for col, n_negative in negative_counts.items():
    if n_negative > 0:
        print(f'Missing value with -1 inserted in numeric columns {col}: {n_negative} -- {(n_negative * 100)/len(train):.2f}%')

In [None]:
# Remove two categorical columns from the working list.
# NOTE(review): presumably excluded because of their high share of negative
# (missing) values - confirm against the report printed by the earlier cell.
excluded_cat_cols = {'ps_car_03_cat', 'ps_car_05_cat'}
cat_cols = [name for name in cat_cols if name not in excluded_cat_cols]

In [None]:
# Restrict the combined frame to the retained columns, ordered
# categorical -> other -> binary.
selected_cols = [*cat_cols, *other_cols, *bin_cols]
df_full = df_full[selected_cols]

In [None]:
# Categorical columns whose negative entries get imputed.
cat_miss_cols = [
    'ps_ind_02_cat',
    'ps_ind_04_cat',
    'ps_ind_05_cat',
    'ps_car_01_cat',
    'ps_car_02_cat',
    'ps_car_07_cat',
    'ps_car_09_cat',
]
# Non-categorical columns whose negative entries get imputed.
oth_miss_cols = [
    'ps_car_11',
    'ps_car_12',
    'ps_car_14',
    'ps_reg_03',
]

In [None]:
# Replace negative (missing) codes in each listed categorical column with the
# most frequent *observed* value. The mode is taken over the non-negative
# entries only, so the missing-value sentinel can never be chosen as the fill
# value.
for col in cat_miss_cols:
    is_missing = df_full[col] < 0
    if not is_missing.any():
        continue  # nothing to impute in this column
    df_full.loc[is_missing, col] = df_full.loc[~is_missing, col].mode()[0]

In [None]:
# Replace negative (missing) entries in each listed column with the mean of
# the *observed* values. Averaging over the non-negative entries only keeps
# the missing-value sentinel from dragging the mean down.
for col in oth_miss_cols:
    # Cast first so the fractional mean can be stored whatever the column's
    # original dtype was.
    df_full[col] = df_full[col].astype(float)
    is_missing = df_full[col] < 0
    if not is_missing.any():
        continue  # nothing to impute in this column
    df_full.loc[is_missing, col] = df_full.loc[~is_missing, col].mean()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of every column from the generated indicators.
df_full = pd.get_dummies(
    df_full,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Keep the labels aside, then standardise every column except 'id' and 'target'.
df_labels = df_full['target']
df_cols = [name for name in df_full.columns if name not in ('id', 'target')]

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_full[df_cols])

# Wrap the scaled array back into a frame with the original feature names.
df_new = pd.DataFrame(df_scaled, columns=df_cols)
df_new = df_new.reset_index(drop=True)

In [None]:
# Re-attach the labels positionally (as a plain array, bypassing index alignment).
df_new['target'] = df_labels.to_numpy()

In [None]:
# Split the combined frame back into its train and test parts.
# The boundary is derived from the data instead of a hard-coded row count:
# train rows were concatenated first and are the ones with a label.
# NOTE(review): assumes no training row has a missing target and that test
# rows carry NaN in 'target' - confirm.
n_train = int(df_new['target'].notna().sum())
train = df_new.iloc[:n_train]
test = df_new.iloc[n_train:]

In [None]:
# Balance the classes by random undersampling of the target == 0 rows.
# - The number of negatives kept is tied to the number of positives instead
#   of a hard-coded count.
# - The draw is seeded, and `train` itself is not reassigned, so re-running
#   this cell reproduces exactly the same sample.
train_new_1 = train.loc[train['target'] == 1]
train_new_0 = train.loc[train['target'] == 0].sample(n=len(train_new_1), random_state=42)

# Combine both classes and shuffle so the rows are not ordered by class.
train_new = pd.concat([train_new_0, train_new_1])
train_new = train_new.sample(frac=1, random_state=42).reset_index(drop=True)
train_new.head()

In [None]:
# Feature-only copy of the test rows with a fresh 0..n-1 index.
test_features_only = test.drop(columns=['target'])
test_new = test_features_only.reset_index(drop=True)

In [None]:
import gc

# Drop the names bound to the large intermediate frames, then trigger a
# garbage-collection pass.
del df_full, train, test
gc.collect()

In [None]:
# Pairwise correlation matrix of the features (label column excluded).
feature_frame = train_new.drop(columns='target')
correlation = feature_frame.corr()

In [None]:
# Keep only the strict upper triangle of the correlation matrix so each
# feature pair is inspected once and the diagonal is ignored.
# The mask uses the builtin `bool`: the `np.bool` alias was removed from NumPy
# and raises AttributeError on current releases.
upper_mask = np.triu(np.ones(correlation.shape, dtype=bool), k=1)
upper = correlation.where(upper_mask)

# A column is flagged when its absolute correlation with any earlier column
# exceeds 0.9.
to_drop = [col for col in upper.columns if (upper[col].abs() > 0.9).any()]
print(f'collinear columns count: {len(to_drop)}')

In [None]:
# Separate the label column from the feature matrix.
y = train_new['target']
X = train_new.drop(columns='target')

In [None]:
# Hold out 30% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Recursive feature elimination with 10-fold stratified cross-validation,
# removing one feature per step and scoring each subset by accuracy.
log_reg = LogisticRegression()
rfecv = RFECV(
    estimator=log_reg,
    step=1,
    cv=StratifiedKFold(n_splits=10),
    scoring='accuracy',
)
rfecv.fit(X_train, y_train)

print(f"Optimal number of features : {rfecv.n_features_:d}")

In [None]:
# Plot number of features VS. cross-validation scores.
# Newer scikit-learn exposes the per-subset mean score through `cv_results_`;
# `grid_scores_` only exists on older releases, so it is used as a fallback.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
# RFECV was scored with accuracy (a fraction), not a count of correct predictions.
plt.ylabel("Cross validation score (accuracy)")
# With step=1 the i-th score corresponds to i selected features.
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

In [None]:
# Names of the features that RFECV did not select.
X_train.columns[~rfecv.support_]

In [None]:
# Rebuild the feature matrix from the RFECV-selected features only.
# The support mask was fitted on the feature columns (without 'target'), so it
# is applied to exactly that column list rather than to train_new.columns by
# position, which would only line up while 'target' is the last column.
feature_cols = train_new.columns.drop('target')
X = train_new[feature_cols[rfecv.support_]]
y = train_new['target']

# Same seed and test size as the earlier split, so the partition of rows is unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# 10-fold stratified cross-validation of a strongly regularised logistic
# regression on the training split, reporting the F1 score per fold.
log_reg = LogisticRegression(C=0.0001)

cv = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
score_avg = []
# The index arrays get their own names so they cannot be confused with the
# `train` / `test` data frames used elsewhere in the notebook.
for fit_idx, val_idx in cv.split(X_train, y_train):
    log_reg.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
    prediction = log_reg.predict(X_train.iloc[val_idx])
    # scikit-learn metrics take (y_true, y_pred) in that order.
    score_avg.append(f1_score(y_train.iloc[val_idx], prediction))

# Per-fold scores followed by their mean.
print(pd.DataFrame(data=score_avg))
print(sum(score_avg)/len(score_avg))

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    """Print evaluation metrics of a fitted classifier on a hold-out set.

    Parameters
    ----------
    model : object exposing ``predict`` and ``predict_proba``.
    X_test : features passed to both prediction methods.
    y_test : true labels compared against the predictions.

    Prints the ROC AUC (from the second ``predict_proba`` column), the F1
    score, the accuracy, and the "base accuracy", i.e. the share of
    ``y_test`` equal to 0. Returns nothing.
    """
    positive_proba = model.predict_proba(X_test)[:, 1]
    predicted = model.predict(X_test)

    metrics = {
        'roc': roc_auc_score(y_test, positive_proba),
        'f1': f1_score(y_test, predicted),
        'accuracy': np.mean(predicted == y_test),
        # Accuracy obtained by always predicting class 0.
        'base': np.mean(y_test == 0),
    }

    print(f"ROC score: {metrics['roc']:.4f}")
    print(f"F1 score: {metrics['f1']}")
    print(f"Accuracy: {100 * metrics['accuracy']:.2f}%")
    print(f"Base accuracy: {100 * metrics['base']:.2f}%")

In [None]:
# Report the hold-out metrics of the logistic regression in its current fitted state.
evaluate_model(log_reg, X_test, y_test)

In [None]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1001)

# Use every feature of the balanced training sample (not the RFECV subset).
X = train_new.drop(columns=['target'])
y = train_new['target']

# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` > 0 - confirm whether row bagging was intended here.
lgb_cla = LGBMClassifier(n_jobs=4,
                         n_estimators=10000,
                         learning_rate=0.02,
                         num_leaves=34,
                         colsample_bytree=0.9497036,
                         subsample=0.8715623,
                         max_depth=8,
                         reg_alpha=0.041545473,
                         reg_lambda=0.0735294,
                         min_split_gain=0.0222415,
                         min_child_weight=39.3259775,
                         verbose=-1)

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the test rows, and the per-fold feature importances.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(test_new.shape[0])
fold_importances = []

feats = X.columns

for n_fold, (train_idx, test_idx) in enumerate(folds.split(X, y)):
    train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
    test_x, test_y = X.iloc[test_idx], y.iloc[test_idx]

    # Everything below has to run once per fold: fitting outside the loop
    # would train on the last fold only while sub_preds is still divided by
    # n_splits.
    # Early stopping and periodic logging are passed as callbacks, the
    # mechanism supported by current LightGBM releases.
    lgb_cla.fit(train_x, train_y,
                eval_set=[(train_x, train_y), (test_x, test_y)],
                eval_metric='auc',
                callbacks=[early_stopping(stopping_rounds=200),
                           log_evaluation(period=200)])

    best_iter = lgb_cla.best_iteration_
    oof_preds[test_idx] = lgb_cla.predict_proba(test_x, num_iteration=best_iter)[:, 1]
    # Each fold contributes 1/n_splits of the final test prediction.
    sub_preds += lgb_cla.predict_proba(test_new[feats], num_iteration=best_iter)[:, 1] / folds.n_splits

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": lgb_cla.feature_importances_,
        "fold": n_fold + 1,
    }))
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(test_y, oof_preds[test_idx])))

# Build the combined importance table once instead of growing it per fold.
feature_importance_df = pd.concat(fold_importances, axis=0)

In [None]:
# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Draw a horizontal bar plot of the 40 features with the highest mean importance.

    ``feature_importance_df_`` must provide 'feature' and 'importance'
    columns. Features are ranked by their mean importance; all rows belonging
    to the top 40 are then plotted, ordered by descending importance.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance[:40].index

    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    ordered = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=ordered)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()
display_importances(feature_importance_df)

In [None]:
# Reload the raw test file to recover the 'id' column, attach the averaged
# predictions and write the two-column submission file.
test = pd.read_csv(os.path.join(dirname, 'test.csv'))
test = test.assign(target=sub_preds)
test.loc[:, ['id', 'target']].to_csv('submission_01.csv', index=False)