In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%config Completer.use_jedi = False

In [None]:
# Read the competition's train and test splits from the Kaggle input folder.
INPUT_DIR = "../input/santander-customer-transaction-prediction-dataset"
data_train = pd.read_csv(INPUT_DIR + "/train.csv")
data_test = pd.read_csv(INPUT_DIR + "/test.csv")

In [None]:
# Report every training column that has no counterpart in the test data.

for col in data_train.columns:
    if col in data_test.columns:
        continue
    print("`{}` is not present in test data".format(col))

All columns except `target` are present in the test data, so we don't need to drop any columns from the training data.

In [None]:
data_train.head()

In [None]:
data_train.info()

All variables in the training data are continuous except `ID_code` and `target`. There are no categorical features.

In [None]:
data_test.info()

In [None]:
# Row/column counts of the training frame, then the test frame.
for frame in (data_train, data_test):
    print(frame.shape)

In [None]:
# Class balance of the `target` label, shown as a pie chart.
# value_counts() is computed once and reused for both the labels and the sizes;
# it returns counts in descending order, so the first wedge is the majority class.
target_counts = data_train.target.value_counts()
idx = target_counts.index
vals = target_counts.values
fig, ax = plt.subplots()
# Pull only the first wedge out of the pie. Building the tuple from the data
# keeps the call valid for any number of classes (pie() requires
# len(explode) == len(vals)).
explode = tuple(0.1 if i == 0 else 0 for i in range(len(vals)))
ax.pie(vals, labels=idx, explode=explode, autopct='%1.1f%%')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax.set_title('Santander target labels')
plt.show()

As we can see, the training data is heavily imbalanced: only about 10% of the rows have label `1`. This may require further action.

In [None]:

# Absolute pairwise Pearson correlations between the numeric columns.
# `ID_code` is a string identifier; restricting to numeric dtypes keeps
# DataFrame.corr() from raising on pandas versions that no longer drop
# non-numeric columns silently.
corr = data_train.select_dtypes(include="number").corr().abs()
# Zero out only the diagonal (each column's correlation with itself).
# Masking on `corr == 1` would also hide genuinely perfectly-correlated
# (e.g. duplicated) column pairs, which is exactly what this check should reveal.
corr = corr.mask(np.eye(len(corr), dtype=bool), 0)
# Flatten to a (col_a, col_b) -> |r| Series, strongest pairs first.
# Each unordered pair appears twice because the matrix is symmetric.
s = corr.unstack().sort_values(ascending=False)


In [None]:
print(s.head())

As we can see, the maximum absolute correlation is on the order of 0.08, which is close to zero, so there is no noticeable linear correlation between any pair of columns.

In [None]:
data_train.columns

In [None]:
# Rank the feature columns by the magnitude of their skewness, largest first.
# `ID_code` and `target` are not features and are left out.
skewList = sorted(
    (
        [colName, abs(data_train[colName].skew())]
        for colName in data_train.columns
        if colName not in ('ID_code', 'target')
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

skewdf = pd.DataFrame.from_records(skewList, columns=['colName', 'Skewness'])

In [None]:
print(skewdf.head(10))

The maximum absolute skewness is about 0.34, which lies within the range -0.5 to 0.5, so the distributions are approximately symmetric. Let's plot `var_44` to confirm this.

In [None]:
# Histogram + density estimate of `var_44` to check its symmetry visually.
fig, ax = plt.subplots(figsize=(4, 4))

# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot replace it); kept here so the cell still runs on the
# seaborn version this notebook was written against. Confirm before upgrading.
sns.distplot(data_train["var_44"], ax=ax, color='r')
ax.set_title('Distribution of var_44', fontsize=14)
# Series.min()/max() are vectorised and skip NaN, unlike the built-in
# min()/max(), which iterate the Series element by element.
ax.set_xlim([data_train["var_44"].min(), data_train["var_44"].max()])
# plt.show() matches the other plotting cells; Figure.show() needs a GUI
# backend and does not render anything under `%matplotlib inline`.
plt.show()

In [None]:
# Box plot of `var_44` split by target class.
var = 'var_44'
# Only the two columns the plot needs, in (target, feature) order.
tmp = data_train[['target', var]]
f, ax = plt.subplots(figsize=(8, 6))
# Draw on the Axes created above explicitly instead of relying on pyplot's
# "current axes". The return value of sns.boxplot is that same Axes, so it is
# no longer bound to a misleading name such as `fig`.
sns.boxplot(x='target', y=var, data=tmp, ax=ax)
# Clamp the y-axis to the observed value range of the feature.
ax.set_ylim(data_train[var].min(), data_train[var].max())
# plt.show() matches the other plotting cells; Figure.show() needs a GUI
# backend and does not render anything under `%matplotlib inline`.
plt.show()

Thus we don't have any issue with skewness. 

In [None]:
# Missing-value summary: per-column null count and null fraction,
# each sorted from most to least affected.
null_mask = data_train.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
# count() on the boolean mask is the number of rows, so this is the null share.
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head()

There is no missing data either.

In [None]:
# Count distinct ID_code values; compare with the row count printed earlier
# to see whether the column uniquely identifies each row.
print("Number of unique values in ID_code: ", data_train.ID_code.nunique())

All values in the `ID_code` column are unique, so it only serves as a row identifier (it could be used as an index) and can be dropped before modelling.

In [None]:
# Subset of training rows labelled target == 0, and its dimensions.
is_negative = data_train['target'] == 0
non_transaction_df = data_train.loc[is_negative]
non_transaction_df.shape

In [None]:
# Subset of training rows labelled target == 1, and how many there are.
is_positive = data_train['target'] == 1
transaction_df = data_train.loc[is_positive]
transaction_df.shape[0]

In [None]:
len(transaction_df)

In [None]:
# The classes are heavily imbalanced, so build a balanced training frame by
# random under-sampling: keep every target == 1 row and an equally sized
# random subset of the target == 0 rows.

# Shuffle the rows (frac=1 samples every row) so that the slice of negatives
# taken below is a random subset. The seed is fixed so the balanced sample is
# reproducible, and sort_index() first restores the file order (read_csv's
# default integer index), so re-running this cell yields the same shuffle
# instead of shuffling an already shuffled frame.
data_train = data_train.sort_index().sample(frac=1, random_state=42)

# All positive rows, plus the first len(transaction_df) negatives of the
# shuffled frame (positional slice).
transaction_df = data_train.loc[data_train['target'] == 1]
non_transaction_df = data_train.loc[data_train['target'] == 0].iloc[:len(transaction_df)]

print("Shape of transaction df: ", transaction_df.shape)
print("Shape of non transaction df: ", non_transaction_df.shape)

# NOTE: despite its name this frame is class-balanced (50/50), not normally
# distributed; the name is kept so any later reference keeps working.
normal_distributed_df = pd.concat([transaction_df, non_transaction_df])
# Shuffle again so the two classes are interleaved rather than stacked.
new_df = normal_distributed_df.sample(frac=1, random_state=42)

print("Balanced data set dimension: ", new_df.shape)

In [None]:
# Remove the ID_code identifier column from both frames before modelling.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
new_df = new_df.drop(columns="ID_code", errors="ignore")
data_test = data_test.drop(columns="ID_code", errors="ignore")

In [None]:
new_df.shape

In [None]:
data_test.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold, StratifiedKFold
import numpy as np

# Separate the label from the features of the balanced frame.
y = new_df['target']
X = new_df.drop('target', axis=1)

# Five stratified folds, taken in row order (no extra shuffling).
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
separator = '-' * 100

# Every iteration rebinds the original_* names, so once the loop finishes they
# hold the train/test partition of the LAST fold only; earlier folds are just
# printed for inspection.
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain = X.iloc[train_index]
    original_Xtest = X.iloc[test_index]
    print(separator)
    print("\nShape of original_Xtrain: ", original_Xtrain.shape)
    print("\nShape of original_Xtest: ", original_Xtest.shape)
    print(separator)
    original_ytrain = y.iloc[train_index]
    original_ytest = y.iloc[test_index]

# Convert the retained (last-fold) partition to plain NumPy arrays.
original_Xtrain, original_Xtest = original_Xtrain.values, original_Xtest.values
original_ytrain, original_ytest = original_ytrain.values, original_ytest.values

# Compare the class proportions of the train and test parts; stratification
# should make them (nearly) equal.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)

print(separator)

print('Label Distributions: \n')
print(train_counts_label / len(original_ytrain))
print(test_counts_label / len(original_ytest))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation score curves for an estimator.

    Opens a new pyplot figure, runs ``learning_curve`` on ``estimator`` and
    draws, for both the training and the validation scores, the mean across
    axis 1 of the score arrays as a line with a shaded band of +/- one
    standard deviation.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.
    title : str
        Figure title.
    X, y : array-like
        Features and labels forwarded to ``learning_curve``.
    ylim : sequence, optional
        Unpacked into ``plt.ylim(*ylim)`` when given; otherwise the y-axis
        limits are left to matplotlib.
    cv, n_jobs, train_sizes
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the figure still current.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) for each curve, training first.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +/- one-std bands first, then the mean lines, in the same
    # order for both passes.
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

In [None]:
# Learning curve of a default logistic regression on the balanced data,
# scored over 10 random 80/20 splits (seeded for reproducibility).
log_reg = LogisticRegression()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
title = "Santander training results"
# The trailing semicolon stops the notebook from echoing the repr of the
# pyplot module that plot_learning_curve returns.
plot_learning_curve(log_reg, title, X.values, y.values, ylim=None, cv=cv,
                    n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5));