In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn

# Seed shared by the stochastic steps further down the notebook.
seed = 12

# Never truncate the column list when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Raw competition files.
train_data = pd.read_csv(r"/kaggle/input/santander-customer-transaction-prediction/train.csv")
test_data = pd.read_csv(r"/kaggle/input/santander-customer-transaction-prediction/test.csv")

# Feature matrices: everything except the row identifier (and, for train, the label).
train_X = train_data.drop(['ID_code', 'target'], axis=1)
test_X = test_data.drop(['ID_code'], axis=1)

# Label column of the training set.
train_y = train_data['target']


In [None]:
# random forest feature importance

from sklearn.ensemble import RandomForestClassifier
import gc

gc.collect()

print('start fitting random forest model')
forest = RandomForestClassifier(random_state=seed, n_jobs=-1, max_depth=4)
%timeit forest.fit(train_X, train_y)

feature_importance = forest.feature_importances_

In [None]:
# Map each feature name to its random-forest importance, ordered from most to
# least important. Names are taken from train_X.columns (the frame the forest
# was fitted on) instead of being rebuilt as 'var_<position>', so the mapping
# stays correct even if columns are renamed, dropped or reordered upstream.
assert len(train_X.columns) == len(feature_importance)

feature_importance_map = dict(
    sorted(
        zip(train_X.columns, feature_importance),
        key=lambda item: item[1],
        reverse=True,  # sorted() is stable, so ties keep their column order
    )
)

In [None]:
import seaborn as sns

# Bar chart of the most important features according to the random forest.
# feature_importance_map is already sorted by descending importance, so the
# first entries are the top-ranked features.
top_n = 50
top_features = list(feature_importance_map.items())[:top_n]
top_names = [name for name, _ in top_features]
top_scores = [score for _, score in top_features]

fig, ax = plt.subplots(figsize=(20,6))

sns.barplot(x=top_names, y=top_scores, ax=ax)
# Label the figure so it can be read on its own when skimming the notebook.
ax.set_title("Top %d features by random forest importance" % len(top_names))
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
# Rotate the feature names so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.show()

In [None]:
## Let's plot the 5 best variables selected by the random forest model
# Iterating a dict yields its keys; the map is sorted by descending importance,
# so the first five keys are the five top-ranked features.
cols = list(feature_importance_map)[:5]
cols

In [None]:
## violin

# Two stacked panels: training data on top, test data below.
fig, ax = plt.subplots(2,1, figsize=(20,20))
train_ax, test_ax = ax

# Long format: one row per (target, variable, value) for the selected columns.
train_df = pd.melt(
    train_data[[*cols, 'target']],
    id_vars=['target'],
    var_name='Vars',
    value_name='Values',
)
# Split violins: each half shows one target class of the same variable.
sns.violinplot(
    x="Vars", y="Values", hue='target', data=train_df,
    split=True, inner="quart", ax=train_ax,
)
train_ax.set_title("violin plot for train data")

# The test set has no target column, so each variable gets one plain violin.
test_df = pd.melt(test_data[cols], var_name='Vars', value_name='Values')
sns.violinplot(x="Vars", y="Values", data=test_df, inner="quart", ax=test_ax)
test_ax.set_title("violin plot for test data")
plt.show()

The kernel density estimates look different between the training and test sets, but the quartiles and mean are close.  
We also notice that, in the violin plot for the training data, the distributions for target=1 and target=0 differ noticeably from each other, so these 5 variables should be useful when fitting models.

In [None]:
## quintile based binning and one-hot encoding
# Each selected column is replaced by one-hot indicators of the quintile its
# value falls into. The result keeps every other column of train_X in its
# original order, followed by the indicator columns (<col>_0 ... <col>_4)
# in the order of `cols`. train_X itself is left untouched.
n_bins = 5

train_dummies = []
for col in cols:
    # Integer bin index (0 .. n_bins-1) based on the column's own quantiles.
    bin_idx = pd.qcut(train_X[col], n_bins, labels=False)
    train_dummies.append(pd.get_dummies(bin_idx, prefix=col))

# Concatenate once, with axis passed by keyword: pandas 2.x rejects a
# positional axis argument to pd.concat, and concatenating inside the loop
# would copy the whole frame on every iteration.
train_X_copy = pd.concat([train_X.drop(columns=cols)] + train_dummies, axis=1)

In [None]:
# Preview the first five rows of the binned / one-hot encoded training features.
train_X_copy.head(n=5)

In [None]:
# The code below is copied from the weighted logistic regression notebook
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate weights for the positive class: 10 values log-spaced from 1 to 100.
scale = np.logspace(start=0, stop=2, num=10)
# Search space: class 0 keeps weight 1, class 1 takes each candidate weight.
grid = {"class_weight": [{0:1, 1:weight} for weight in scale]}

l2_model = LogisticRegression(penalty='l2', random_state=25, solver='sag')
# Pick the class weighting with the best cross-validated ROC AUC; refit=True
# retrains the best configuration so grid_search can predict directly.
grid_search = GridSearchCV(
    estimator=l2_model,
    param_grid=grid,
    scoring="roc_auc",
    n_jobs=-1,
    refit=True,
    verbose=2,
)

print(grid)
print("-----------------Start Fitting Weighted Logistic Regression Model With L-2 regularization-----------------")
grid_search.fit(train_X_copy, train_y)

# Class predictions of the refit model on the data it was trained on.
train_pred = grid_search.predict(train_X_copy)

In [None]:
## compute the confusion matrix for binary classification problem
def confusion_matrix(y, y_pred, threshold=None):
    """Compute the 2x2 confusion matrix of a binary classification.

    Parameters
    ----------
    y : array-like
        True labels, expected to be 0 or 1. Entries with any other value are
        not counted in any cell.
    y_pred : array-like
        Predicted labels, or scores when ``threshold`` is given. Must have the
        same length as ``y``.
    threshold : number, optional
        When provided, ``y_pred`` is binarised as ``y_pred >= threshold``
        before being compared with ``y``.

    Returns
    -------
    numpy.ndarray
        Float array of shape (2, 2) laid out as::

            TP | FN
            -------
            FP | TN

    Raises
    ------
    AssertionError
        If ``y`` and ``y_pred`` have different lengths.
    """
    assert(len(y) == len(y_pred))

    # Work on plain positional arrays: indexing a pandas Series with y[i] is
    # label-based and would fail (or pick the wrong rows) for a non-default index.
    y_true = np.asarray(y).ravel()
    y_hat = np.asarray(y_pred).ravel()

    if threshold is not None:
        y_hat = y_hat >= threshold

    correct = y_hat == y_true
    actual_pos = y_true == 1
    actual_neg = y_true == 0

    mat = np.zeros((2,2))
    mat[0,0] = np.count_nonzero(correct & actual_pos)    # TP
    mat[0,1] = np.count_nonzero(~correct & actual_pos)   # FN
    mat[1,0] = np.count_nonzero(~correct & actual_neg)   # FP
    mat[1,1] = np.count_nonzero(correct & actual_neg)    # TN

    return mat

In [None]:
# Confusion matrix of the model's class predictions on the training data.
confusion_mat = confusion_matrix(y=train_y, y_pred=train_pred)
confusion_mat

In [None]:
# confusion_mat is laid out as [[TP, FN], [FP, TN]].
(tp, fn), (fp, tn) = confusion_mat

# True positive rate (recall): share of actual positives that were detected.
# The denominator is TP + FN (all actual positives), not TP + TN.
tpr = tp / (tp + fn)
# False positive rate: share of actual negatives flagged as positive.
fpr = fp / (fp + tn)
# Precision: share of predicted positives that are actually positive.
precision = tp / (tp + fp)
print("True positive rate: " + str(tpr))
print("False positive rate: " + str(fpr))
print("Precision: " + str(precision))

In [None]:
## quintile based binning and one-hot encoding for test data
# The test set must be binned with the quintile edges of the TRAINING data.
# Binning it with its own quantiles would give the indicator columns a
# different meaning from the ones the model was fitted on.
n_bins = 5  # must match the number of bins used for the training features

test_dummies = []
for col in cols:
    # Quintile edges of the training column.
    _, edges = pd.qcut(train_X[col], n_bins, retbins=True)
    # Open the outer edges so test values outside the training range fall into
    # the first / last bin instead of becoming NaN.
    edges[0], edges[-1] = -np.inf, np.inf
    bin_idx = pd.cut(test_X[col], bins=edges, labels=False)
    # Fixed categories guarantee all n_bins indicator columns exist (named
    # <col>_0 ... <col>_4) even if some bin is empty in the test data.
    bin_idx = bin_idx.astype(pd.CategoricalDtype(categories=range(n_bins)))
    test_dummies.append(pd.get_dummies(bin_idx, prefix=col))

# Same column layout as the training features: untouched columns first, then
# the indicator columns in the order of `cols`. axis is passed by keyword
# because pandas 2.x rejects a positional axis argument to pd.concat.
test_X_copy = pd.concat([test_X.drop(columns=cols)] + test_dummies, axis=1)

In [None]:
### submission
### https://www.kaggle.com/dansbecker/submitting-from-a-kernel

# Class predictions of the refit grid-search model for every test row.
# NOTE(review): the grid search is tuned on roc_auc; if the competition is also
# scored on AUC, submitting grid_search.predict_proba(test_X_copy)[:, 1] instead
# of hard labels would presumably score better - confirm before changing.
test_pred = grid_search.predict(test_X_copy)

# One row per test ID with its predicted target.
submission_columns = {'ID_code': test_data['ID_code'], 'target': test_pred}
my_submission = pd.DataFrame(submission_columns)
# Any filename works; 'submission' is used here.
my_submission.to_csv('submission.csv', index=False)