In [138]:
import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

from scipy.stats import skew # the target will be log-transformed and so will be some features

from time import time
import warnings
warnings.filterwarnings('ignore')


In [139]:
# Read the data from the data file
tic = time()
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
toc = time()
print('time to read data into DataFrame = ', toc-tic, 'sec.') ## 16.8 sec

time to read data into DataFrame =  17.63637137413025 sec.


In [140]:
print(train_df.head(2))
tic = time()
train_df.index = train_df.id
test_df.index = test_df.id
train_df.drop('id', axis = 1, inplace=True)
test_df.drop('id', axis = 1, inplace=True)
toc = time()
print('time to change index to id and drop id column = ', toc-tic, 'sec.') ## 2.0 sec

   id  target  ps_ind_01  ps_ind_02_cat  ps_ind_03  ps_ind_04_cat  \
0   7       0          2              2          5              1   
1   9       0          1              1          7              0   

   ps_ind_05_cat  ps_ind_06_bin  ps_ind_07_bin  ps_ind_08_bin       ...        \
0              0              0              1              0       ...         
1              0              0              0              1       ...         

   ps_calc_11  ps_calc_12  ps_calc_13  ps_calc_14  ps_calc_15_bin  \
0           9           1           5           8               0   
1           3           1           1           9               0   

   ps_calc_16_bin  ps_calc_17_bin  ps_calc_18_bin  ps_calc_19_bin  \
0               1               1               0               0   
1               1               1               0               1   

   ps_calc_20_bin  
0               1  
1               0  

[2 rows x 59 columns]
time to change index to id and drop id column =  

In [141]:
tic = time()
X_all = pd.concat( (train_df.drop('target', axis = 1), test_df) )
y_train = train_df.target
del train_df # release workspace
del test_df # release workspace
toc = time()
print('time to concat train and test X dsta = ', toc-tic, 'sec.') # 3.63 sec
print(X_all.shape)
print(y_train.shape)

time to concat train and test X dsta =  4.212993144989014 sec.
(1488028, 57)
(595212,)


In [142]:
# Need to change the dtype of all variables whose names end in _cat
# Extract numerical columns
def parse_feature_names(df):
    cat_columns = []
    num_columns = []
    col_names = df.columns
    for col in col_names:
        split_list = col.split('_')
        if 'cat' in split_list:
            cat_columns.append(col)
        if ('cat' not in split_list) and ('bin' not in split_list):
            num_columns.append(col)
    
    return cat_columns, num_columns
                              
cat_features, num_features = parse_feature_names(X_all)  # We are leaving binary features as 0/1      

In [143]:
# Figuring out if any columns need to be log-transformed
skew_vals = X_all[num_features].apply(lambda x: skew(x.dropna())) #compute skewness
skew_vals_filtered1 = skew_vals[ skew_vals > 1 ] # The threshold is arbitrary but large enough
# This results in some binary type data to also show up as highly skewed. Eliminate them:
skew_vals_filtered = skew_vals_filtered1[ skew_vals_filtered1 < 10]
print(skew_vals_filtered) # index of skew_vals_filtered are the names of those features

ps_reg_02    1.280007
ps_car_12    1.092367
ps_car_13    1.697220
dtype: float64


In [144]:
%matplotlib
import seaborn as sns
# sns.set(color_codes=True)
# sns.distplot( X_all['ps_reg_02'] )

Using matplotlib backend: MacOSX


In [145]:
# define poisson function, parameter lamb (can't use reserved word lambda) is the fit parameter
# from scipy.misc import factorial
# from scipy.optimize import curve_fit
# def poisson(k, lamb):
#     return (lamb**k/factorial(k)) * np.exp(-lamb)
# def exponential(k, lamb):
#     return lamb * np.exp(-lamb * k)
# # fit with curve_fit

# hist = np.histogram(X_all['ps_reg_02'], bins=40, density = True)
# s = pd.Series( X_all['ps_reg_02'].value_counts() )
# s = s.sort_index()
# bin_edges = s.index.ravel()
# hist =  s.values.ravel()
# area = 0
# for i in range(len(bin_edges)):
#     if i < len(bin_edges) - 1:
#         area += ( hist[i]  + hist [i+1] ) * ( bin_edges[i+1] - bin_edges[i] ) /2
#     else:
#         area += hist[i] * ( bin_edges[i] - bin_edges[i-1] ) /2
# hist = hist/area
# params, cov_matrix = curve_fit(exponential, bin_edges, hist)

# # %matplotlib

# width = 0.2
# hist_ = pd.DataFrame( {'histogram' : [bin_edges, hist] ,})
# fit_ = pd.DataFrame( { 'fit' : [bin_edges, exponential(bin_edges, params)] ,})

# plt.bar(bin_edges, hist, width = width )
# plt.plot(bin_edges, exponential(bin_edges, params), color = 'r', lw = 3)
# plt.show()


In [146]:
# replace categorical features by one-hot columns - ignoring the NaNs
X_all = pd.get_dummies(X_all)

# Check for NaNs
print('number of feartures with NaN = ', X_all.isnull().any().sum())
# What to do? A simple solution is to impute by mean
X_all = X_all.fillna(X_all.mean())
print('number of features with NaN = ', X_all.isnull().any().sum())
print("number of y's with NaN = ", y_train.isnull().any().sum())

number of feartures with NaN =  0
number of features with NaN =  0
number of y's with NaN =  0


In [147]:
# Is target imbalanced?
print( 'PCT of zeros in y_train :',  ( ( y_train == 0 ).sum() )/ y_train.shape )
print( 'PCT of ones in y_train :',  ( ( y_train == 1 ).sum() )/ y_train.shape )
print('shape of y_train: ', y_train.shape )

PCT of zeros in y_train : [ 0.96355248]
PCT of ones in y_train : [ 0.03644752]
shape of y_train:  (595212,)


In [148]:
# The data is highly imbalanced - We should try to balance the data
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT
# from sklearn.svm import LinearSVC
# from sklearn.model_selection import train_test_split

# from imblearn import over_sampling as os
# from imblearn import pipeline as pl
# from imblearn.metrics import (geometric_mean_score,
#                               make_index_balanced_accuracy)
# print(__doc__)

# RANDOM_STATE = 42

# # Generate the training set
# X, y = X_all[:y_train_raw.shape[0]], y_train_raw

# pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
#                             LinearSVC(random_state=RANDOM_STATE))

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     random_state=RANDOM_STATE)


# # Train the classifier with balancing
# pipeline.fit(X_train, y_train)


# # Test the classifier and get the prediction
# y_pred_bal = pipeline.predict(X_test)

# # LinearSVC on the original data
# clf = LinearSVC(random_state=RANDOM_STATE)
# clf.fit(X_train, y_train)
# y_pred_original = clf.predict(X_test)

# ##
# print( 'PCT of zeros in y_pred_bal :',  ( ( y_test == 0 ).sum() )/ y_test.shape )
# print( 'PCT of zeros in y_pred_bal :',  ( ( y_pred_bal == 0 ).sum() )/ y_pred_bal.shape )
# print( 'PCT of zeros in y_pred_original :',  ( ( y_pred_original == 0 ).sum() )/ y_pred_original.shape )

In [149]:
# # Train the classifier with balancing

# from sklearn import svm

# # tic = time()
# # pipeline = pl.make_pipeline(os.SMOTE(random_state=RANDOM_STATE),
# #                             svm.SVC(random_state=RANDOM_STATE))
# # pipeline.fit(X_train, y_train)
# # # Test the classifier and get the prediction
# # y_pred_bal = pipeline.predict(X_test)

# # toc = time()
# # print('time to smote, train, predict svc_rbf', toc - tic, 'sec.')

# # SVC on the original data
# tic = time()
# clf = svm.SVC(random_state=RANDOM_STATE)
# clf.fit(X_train, y_train)
# y_pred_original = clf.predict(X_test)
# toc = time()
# print('time to train, predict svc_rbf', toc - tic, 'sec.')
# ##
# print( 'PCT of zeros in y_pred_bal :',  ( ( y_test == 0 ).sum() )/ y_test.shape )
# print( 'PCT of zeros in y_pred_bal :',  ( ( y_pred_bal == 0 ).sum() )/ y_pred_bal.shape )
# print( 'PCT of zeros in y_pred_original :',  ( ( y_pred_original == 0 ).sum() )/ y_pred_original.shape ) 

In [150]:
# y_pred = clf.predict(X_test)
# plt.scatter(y_test, y_pred)
# plt.show

<function matplotlib.pyplot.show>

In [151]:
np.bincount(y)

array([573518,  21694])

In [152]:
print( 'PCT of zeros in y_test :',  ( ( y_test == 0 ).sum() )/ y_test.shape )
print( 'PCT of zeros in y_pred :',  ( ( y_pred == 0 ).sum() )/ y_pred.shape )

PCT of zeros in y_test : [ 0.96107527]
PCT of zeros in y_pred : [ 1.]


In [156]:
## Train so that False Positives are minimized
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
# Choose parameters
RS = 42  # random_state
TS = 0.5  # test_size

X, y = X_all[:y_train.shape[0]], y_train

# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TS, random_state=RS)

# Create a simple classifier
clf = svm.LinearSVC(class_weight="balanced", random_state=RS)

param_grid = {'C' : [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000],}
clf = GridSearchCV(clf, param_grid, scoring="recall")

clf.fit(X_train, y_train)
y_pred = clf.decision_function(X_test)

precision, recall, _ = precision_recall_curve(y_test, y_pred)

average_precision = average_precision_score(y_test, y_pred)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(average_precision))
plt.show()

In [161]:
clf.best_params_

{'C': 0.001}

In [162]:
clf.best_score_ 

0.4554307116104868

In [164]:
clf = svm.LinearSVC(class_weight="balanced", random_state=RS)
param_grid = {'C' : [0.00001, 0.00003, 0.0001, 0.0003, 0.001, 0.003],}
clf = GridSearchCV(clf, param_grid, scoring="recall")

clf.fit(X_train, y_train)
y_pred = clf.decision_function(X_test)

precision, recall, _ = precision_recall_curve(y_test, y_pred)

average_precision = average_precision_score(y_test, y_pred)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(average_precision))
plt.show()

In [165]:
clf.best_params_

{'C': 3e-05}

In [166]:
clf.best_score_

0.47415730337078654

In [167]:
clf = svm.LinearSVC(class_weight="balanced", random_state=RS)
param_grid = {'C' : [0.000001, 0.000003, 0.00001, 0.00003, 0.0001, 0.0003],}
clf = GridSearchCV(clf, param_grid, scoring="recall")

clf.fit(X_train, y_train)
y_pred = clf.decision_function(X_test)

precision, recall, _ = precision_recall_curve(y_test, y_pred)

average_precision = average_precision_score(y_test, y_pred)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(average_precision))
plt.show()

In [168]:
clf.best_params_

{'C': 1e-06}

In [169]:
clf.best_score_

0.48014981273408247

In [172]:
clf = svm.LinearSVC(class_weight="balanced", random_state=RS)
param_grid = {'C' : [1.0e-10, 1.0e-6],}
clf = GridSearchCV(clf, param_grid, scoring="recall")

clf.fit(X_train, y_train)
y_pred = clf.decision_function(X_test)

precision, recall, _ = precision_recall_curve(y_test, y_pred)

average_precision = average_precision_score(y_test, y_pred)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AUC={0:0.2f}'.format(average_precision))
plt.show()
print('C_best = ', clf.best_params_)
print('recall_best = ', clf.best_score_)

In [173]:
clf.best_params_

{'C': 1e-06}

In [174]:
clf.best_score_

0.48014981273408247

In [176]:
y_pred[:10] 

array([-0.03087269,  0.01394985, -0.01601431, -0.01118446,  0.017896  ,
        0.00387498, -0.0095496 , -0.01610037, -0.00520305, -0.04586   ])

In [177]:
y_pred[y_pred>0.5].any()

False

In [178]:
y_test[y_test>0.5].any()

True

In [179]:
from collections import Counter
print( Counter(X_all.dtypes.values) )

Counter({dtype('int64'): 47, dtype('float64'): 10})
