In [12]:
## In this notebook, we fit a penalised ridge regression using a few 

## Packages

import os
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
# import preprocessing from sklearn
from sklearn import preprocessing
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV

In [13]:
## Directories and paths

# Set directories
print(os.getcwd())
dirRawData = "../input/"
dirPData   = "../PData/"
dirPOutput = "../POutput/"

/home/jovyan/Projects/Fraud/PCode


In [14]:
## Functions

f_name = dirPData + '01_df_250k.pickle'

with (open(f_name, "rb")) as f:
    dict_ = pickle.load(f)

df_train = dict_['df_train']
df_test  = dict_['df_test']

del f_name, dict_

f_name = dirPData + '01_vars.pickle'

with open(f_name, "rb") as f:
    dict_ = pickle.load(f)

vars_ind_numeric     = dict_['vars_ind_numeric']
vars_ind_hccv        = dict_['vars_ind_hccv']
vars_ind_categorical = dict_['vars_ind_categorical']
vars_notToUse        = dict_['vars_notToUse']
var_dep              = dict_['var_dep']

del f_name, dict_

In [15]:
## Data Preperation

In [16]:
# define y variable for train set
y_train=df_train[var_dep]

In [17]:
# selecting categorical and numeric features to use and storing variable name in corresponding lists 
vars_ind_use=['f27','f29', 'e04','e02','b04']
vars_ind_cat_use=['f27','f29','b04']
vars_ind_numeric_use=['e02','e04']

In [18]:
# selecting variables we want from train set
df_train_use=df_train[vars_ind_use]

# selecting variables we want from test set
df_test_use=df_test[vars_ind_use]

In [19]:
## One-hot categorical variables for the train set

In [20]:
# loop to onehot all categorical variables we want to use for training
# since categorical variables aren't automatically one-hotted in sklearn we do this manually

vars_ind_onehot = []

df_all_onehot_train = df_train_use.copy()

for col in vars_ind_cat_use:
    print(col)
    
    # use pd.get_dummies 
    df_oh = pd.get_dummies(df_train[col], drop_first=False)
    
    # Find the column name of the most frequent category
    col_mostFreq = df_oh.sum(axis = 0).idxmax()
    
    # Drop the column of the most frequent category
    df_oh = df_oh.drop(col_mostFreq, axis=1)
        
    # Rename the columns to have the original variable name as a prefix
    oh_names = col + '_' + df_oh.columns.astype(str)
    df_oh.columns = oh_names
    
    df_all_onehot_train = pd.concat([df_all_onehot_train, df_oh], axis = 1, sort = False)

    del df_all_onehot_train[col]
    vars_ind_onehot.extend(oh_names)

f27
f29
b04


In [22]:
## One-hot categorical variables for the test set

# loop to onehot all categorical variables we want to use for test set
# since categorical variables aren't automatically one-hotted in sklearn we do this manually

vars_ind_onehot_test = []
df_all_onehot_test = df_test_use.copy()

for col in vars_ind_cat_use:
    print(col)
    
    # use pd.get_dummies on  df_all[col] 
    df_oh = pd.get_dummies(df_test[col], drop_first=False)
    
    # Find the column name of the most frequent category
    col_mostFreq = df_oh.sum(axis = 0).idxmax()
    
    # Drop the column of the most frequent category
    df_oh = df_oh.drop(col_mostFreq, axis=1)
        
    # Rename the columns to have the original variable name as a prefix
    oh_names = col + '_' + df_oh.columns.astype(str)
    df_oh.columns = oh_names
    
    df_all_onehot_test = pd.concat([df_all_onehot_test, df_oh], axis = 1, sort = False)

    del df_all_onehot_test[col]
    vars_ind_onehot_test.extend(oh_names)

f27
f29
b04


In [None]:
## Splines

In [23]:
# adding splines to the numeric variables we randomly chose
# some numeric variables may have non-linear effects so we spline them
vars_ind_tospline = df_train_use[vars_ind_numeric_use].columns.tolist()

# set variables to spline with unique values greater than 0 and because we want to spline all numeric variables
vars_ind_tospline = df_train_use[vars_ind_numeric_use].columns[(df_train_use[vars_ind_numeric_use].nunique() > 0)].tolist()

In [24]:
# defining the spline function
def fn_tosplines(x):
    # removing zeros to avoid issues where lots of values are zero
    x_nonzero = x[x != 0]
    ptiles = np.percentile(x_nonzero, [10, 20, 40, 60, 80, 90]) # choosing the percentile split
    ptiles = np.unique(ptiles)
    print(var, ptiles)
    df_ptiles = pd.DataFrame({var: x}) # converting it to a dataframe
    for idx, ptile in enumerate(ptiles):
        df_ptiles[var + '_' + str(idx)] = np.maximum(0, x - ptiles[idx])
    return(df_ptiles)

In [25]:
# applying the function on the selected variables and concatinating it on the training set
for var in vars_ind_tospline:
    df_ptiles = fn_tosplines(df_train[var])
    df_all_onehot_train.drop(columns=[var], inplace=True)
    vars_ind_numeric_use.remove(var)
    df_all_onehot_train = pd.concat([df_all_onehot_train, df_ptiles], axis=1, sort=False)
    vars_ind_numeric_use.extend(df_ptiles.columns.tolist())

e02 [11. 20. 41. 60. 80. 89.]
e04 [12. 18. 39. 60. 81. 89.]


In [26]:
# applying the function on the selected variables and concatinating it on the test set
for var in vars_ind_tospline:
    df_ptiles = fn_tosplines(df_test[var])
    df_all_onehot_test.drop(columns=[var], inplace=True)
    vars_ind_numeric_use.remove(var)
    df_all_onehot_test = pd.concat([df_all_onehot_test, df_ptiles], axis=1, sort=False)
    vars_ind_numeric_use.extend(df_ptiles.columns.tolist())

e02 [13. 23. 41. 60. 81. 90.]
e04 [ 7. 18. 34. 54. 76. 87.]


In [27]:
## Updating Independent Variables after OneHot and splines

In [28]:
vars_ind = df_all_onehot_train.columns.to_list()
vars_ind

['f27_A',
 'f27_B',
 'f27_C',
 'f27_D',
 'f27_E',
 'f27_X',
 'f29_A',
 'f29_B',
 'f29_C',
 'f29_D',
 'f29_F',
 'f29_G',
 'b04_B',
 'b04_C',
 'b04_D',
 'b04_E',
 'b04_F',
 'b04_G',
 'b04_H',
 'b04_I',
 'e02',
 'e02_0',
 'e02_1',
 'e02_2',
 'e02_3',
 'e02_4',
 'e02_5',
 'e04',
 'e04_0',
 'e04_1',
 'e04_2',
 'e04_3',
 'e04_4',
 'e04_5']

In [30]:
## Ridge regression without setting the optimal lambda

# Column 'f29_A' was removed because it did not appear in the onehot encoding of the test set
df_all_onehot_train=df_all_onehot_train.drop('f29_A',axis=1)

# Defining the dependent and independent variables
X_train = df_all_onehot_train
y_train = y_train

X_train

Unnamed: 0,f27_A,f27_B,f27_C,f27_D,f27_E,f27_X,f29_B,f29_C,f29_D,f29_F,...,e02_3,e02_4,e02_5,e04,e04_0,e04_1,e04_2,e04_3,e04_4,e04_5
0,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,44,32.0,26.0,5.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,44,32.0,26.0,5.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,44,32.0,26.0,5.0,0.0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,...,34.0,14.0,5.0,65,53.0,47.0,26.0,5.0,0.0,0.0
4,0,0,0,0,0,0,0,0,1,0,...,34.0,14.0,5.0,65,53.0,47.0,26.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,0,0,0,0,0,0,0,0,1,0,...,10.0,0.0,0.0,12,0.0,0.0,0.0,0.0,0.0,0.0
249996,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0
249997,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0
249998,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
%%time
# Fitting the model using 10-fold Cross-Validation with default parameters 
# To see performance of basic model

ridgeCV_ = RidgeCV(fit_intercept=True
                   ,normalize=True # normalising the data
                   ,cv=10)

ridgeCV_.fit(X=X_train, y=y_train)

CPU times: user 10.4 s, sys: 8.92 s, total: 19.3 s
Wall time: 6.34 s


RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=10, normalize=True)

In [None]:
ridgeCV_.coef_ # model coefficients

In [None]:
ridgeCV_.predict(df_all_onehot_train) # model train predictions

In [None]:
# this model 

ridgeCV_.intercept_ # model intercept

In [None]:
### Ridge regression

#We fit a grid for different values of lambda ranging from 0-1 at a step of 0.01.  For each penalised model we fit, we will calculate and store the Mean Absolute (prediction) Error on the train data.

In [33]:
# find optimal lambda using grid

# import libraries and modules for grid and repeatedKfold
from numpy import arange
from pandas import read_csv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge

In [34]:
model = Ridge()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = arange(0, 1, 0.01)
# define search
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X=X_train, y=y_train)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -0.493
Config: {'alpha': 0.43}


In [None]:
results.best_score_ # best mae found is 0.493 with and an alpha of 0.43

In [35]:
predictions=results.predict(df_all_onehot_test) #use model to predict results on test set

In [36]:
predictions.min() # some predictions negative due to nature of linear regression

-0.028127490578694236

In [37]:
# we clip predictions so that they are betweeen 0 and 1 as they respresent probabilities
clipped = np.clip(predictions, 0, 1)
clipped.max()

0.7558342527076346

In [None]:
df_test

In [39]:
df_sub_ridge= pd.DataFrame() # create empty df

df_sub_ridge['unique_id']=df_test["unique_id"] # add unique id from test set

df_sub_ridge['Predicted']=pd.DataFrame(clipped) # add predictions 

df_sub_ridge.to_csv('../POutput/df_sub_ridge1.csv', index=False) # covert df to csv for kaggle submission