In [1]:
import pandas as pd
import numpy as np

#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [4]:
# Load the training data from train.csv (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='train.csv')

In [5]:
# Remove the columns in which null values dominate
df = df.drop(columns=['ps_reg_03', 'ps_car_03_cat', 'ps_car_05_cat'])

In [6]:
# Record the float64 column labels now, before later operations get a chance
# to change column dtypes; the list is reused further down.
# Update this if any float cols are deleted.
float_cols = df.select_dtypes(include='float64').columns

print('Number of float columns: ', float_cols.size)

Number of float columns:  9


In [7]:
# Flag every row that holds at least one -1 (the value this notebook treats as null).
# The mask is computed once and reused for both partitions below.
has_null = (df == -1).any(axis=1)

#storing rows with null values in sep. dataframe for later use
# .copy() gives an independent frame rather than a filtered selection of df
df_any_null = df[has_null].copy()
print('Number of rows with atleast one null value: ', len(df_any_null))

#df with no null values.
# .copy() so that later in-place edits (e.g. drop(..., inplace=True)) operate on an
# independent frame and cannot trigger SettingWithCopyWarning
df = df[~has_null].copy()

Number of rows with atleast one null value:  53352


In [8]:
# Remove the binary columns in which a single level dominates
df = df.drop(columns=['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_13_bin'])

In [9]:
# Remove the categorical column in which a single level dominates
df = df.drop(columns=['ps_car_10_cat'])

In [10]:
# Remove ps_ind_12_bin because it is correlated with another column, ps_ind_14
df = df.drop(columns=['ps_ind_12_bin'])

In [11]:
# Labels of every column carrying the '_cat' tag
col_names = [name for name in df.columns if '_cat' in name]

# Only the tagged columns with more than two distinct values get dummy variables
multi_level_cols = [name for name in col_names if len(df[name].unique()) > 2]

# get_dummies swaps each listed parent column for its indicator columns
df = pd.get_dummies(df, columns=multi_level_cols, prefix=multi_level_cols)

# the helper list is not needed beyond this cell
del multi_level_cols

In [12]:
# Normalise every float column: subtract its mean, then divide by its range (max - min).
# fcols_summ keeps the mean / max / min of each column so the same transform
# can be applied to the test file.
fcols_summ = {}
for col in float_cols:
    series = df[col]
    stats = {'mean': series.mean(), 'max': series.max(), 'min': series.min()}
    df[col] = (series - stats['mean']) / (stats['max'] - stats['min'])
    fcols_summ[col] = stats

In [13]:
# Number of rows in df whose target equals 1
int((df['target'] == 1).sum())

19150

In [14]:
# Number of rows in df_any_null whose target equals 1
int((df_any_null['target'] == 1).sum())

2544

In [15]:
# Preview the first rows of df (the bare name `f` that stood here is never defined
# and would raise NameError on a fresh kernel)
df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_03,ps_ind_04_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_14,...,ps_car_11_cat_95,ps_car_11_cat_96,ps_car_11_cat_97,ps_car_11_cat_98,ps_car_11_cat_99,ps_car_11_cat_100,ps_car_11_cat_101,ps_car_11_cat_102,ps_car_11_cat_103,ps_car_11_cat_104
0,7,0,2,5,1,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9,0,1,7,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13,0,5,9,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,0,0,2,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,17,0,0,0,1,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### Open question: should the rows with null values be added back?

In [3]:
# Split the data into training and test sets (20% held out for test).
# 'id' and 'target' are excluded from the features; 'target' is the label.
# train_test_split shuffles by default.
(training_features, test_features,
 training_target, test_target) = train_test_split(
    df.drop(columns=['id', 'target']),
    df['target'],
    test_size=0.2,
    random_state=12,
)

In [4]:
# Split the training portion again into train and validation sets (10% for validation).
# train_test_split shuffles by default.
x_train, x_val, y_train, y_val = train_test_split(
    training_features,
    training_target,
    test_size=0.1,
    random_state=12,
)

In [5]:
# The notebook-level SMOTE import is commented out, so import it here;
# without it this cell raises NameError after a kernel restart.
from imblearn.over_sampling import SMOTE

#SMOTE, as the dataset is imbalanced
# NOTE(review): `ratio`, `kind` and `fit_sample` look like the legacy imbalanced-learn
# API; newer releases appear to have renamed them - confirm the installed version.
sm = SMOTE(random_state=12, ratio='minority', kind='borderline1')
x_train_res, y_train_res = sm.fit_sample(x_train, y_train)

# sanity check: exactly one label per resampled row
assert len(y_train_res) == np.shape(x_train_res)[0]

print('After SMOTE x type: ', type(x_train_res))
print('After SMOTE x shape: ', np.shape(x_train_res))

After SMOTE x type:  <class 'numpy.ndarray'>
After SMOTE x shape:  (72, 200)


In [6]:
# SMOTE is for continuous variables, but this data has both continuous and
# categorical variables, so SMOTE-NC should be used, as described in the
# following paper: https://www.jair.org/media/953/live-953-2037-jair.pdf
#
# Alternatively, I chose to round off the categorical variables to the nearest integer.
cols = df.drop(['id','target'], axis=1).columns

#cat_idx is list of index position of cols that are categorical
# (columns tagged '_bin' or '_cat'; positions follow the order of `cols`)
cat_idx = [pos for pos, col in enumerate(cols) if ('_bin' in col) or ('_cat' in col)]

# np.round works element-wise on the whole sub-array, so no per-column
# apply_along_axis loop is needed; unlike apply_along_axis it also accepts
# an empty selection (cat_idx == []) instead of raising ValueError.
x_train_res[:, cat_idx] = np.round(x_train_res[:, cat_idx])