In [1]:
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('train.csv')

In [3]:
#remove the columns in which null values dominate
mostly_null_cols = ['ps_reg_03', 'ps_car_03_cat', 'ps_car_05_cat']
df = df.drop(mostly_null_cols, axis=1)

In [4]:
#snapshot the int64/float64 column names now, before later operations can change column dtypes
#keep float_cols in sync by hand if any float column is dropped afterwards
int_cols = df.select_dtypes(include=['int64']).columns
float_cols = df.select_dtypes(include=['float64']).columns.tolist()

print('Number of float columns: ', len(float_cols))

Number of float columns:  9


In [5]:
#a value of -1 marks a null; build the "row contains a null" mask once and reuse it
#for both halves of the split instead of scanning the whole frame twice
row_has_null = (df == -1).any(axis = 1)

#storing rows with null values in sep. dataframe for later use
df_any_null = df[row_has_null]
print('Number of rows with atleast one null value: ', len(df_any_null))

#df with no null values.
df = df[~row_has_null]

Number of rows with atleast one null value:  53352


In [6]:
#remove the binary columns in which a single level dominates
dominated_bin_cols = ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin']
df = df.drop(dominated_bin_cols, axis=1)

In [7]:
#remove the categorical columns in which a single level dominates
dominated_cat_cols = ['ps_car_10_cat', 'ps_car_11_cat']
df = df.drop(dominated_cat_cols, axis=1)

In [8]:
#remove the ordinal column in which a single level dominates
dominated_ord_cols = ['ps_ind_14']
df = df.drop(dominated_ord_cols, axis=1)

In [9]:
#final cat, bin and ord column names
cat_col_names = [col for col in df.columns if '_cat' in col]
bin_col_names = [col for col in df.columns if '_bin' in col]

#a column is ordinal when it is not categorical, not binary, not id/target and not float
#(set lookup replaces the repeated list scans and the former no-op placeholder branch)
non_ordinal_names = set(float_cols) | {'id', 'target'}
ord_col_names = [
    col for col in df.columns
    if '_cat' not in col and '_bin' not in col and col not in non_ordinal_names
]

print(len(ord_col_names))

15


In [10]:
#one-hot encode every categorical column that has more than two distinct values
multi_level_cats = [name for name in cat_col_names if len(df[name].unique()) > 2]
#get_dummies replaces each listed parent column with its indicator columns
df = pd.get_dummies(df, columns=multi_level_cats, prefix=multi_level_cats)

del multi_level_cats

###### Treating the ordinal columns as continuous columns

In [12]:
#mean-centre and range-scale the float columns and the ordinal columns
#cont_cols_summ keeps each column's mean, max and min so the test file can be scaled the same way
#NOTE(review): the statistics are taken over the full frame, before any train/test split
cont_cols_summ = {}
for col in float_cols + ord_col_names:
    values = df[col]
    stats = {'mean': values.mean(), 'max': values.max(), 'min': values.min()}
    df[col] = (values - stats['mean']) / (stats['max'] - stats['min'])
    cont_cols_summ[col] = stats

#### Reorder the columns

In [32]:
#feature column names (everything except id and target), to be sorted below
final_cols = df.columns.drop(['id', 'target']).tolist()

In [33]:
#this function sorts properly str 10 will come after str 9
def natural_sort(l):
    """Return a new list with the strings of ``l`` in natural (human) order.

    Runs of ASCII digits are compared as integers and all other text is
    compared case-insensitively, so ``'x9'`` sorts before ``'x10'``.

    Parameters
    ----------
    l : iterable of str
        The strings to sort; the input itself is not modified.

    Returns
    -------
    list of str
        The sorted strings.
    """
    digit_run = re.compile(r'([0-9]+)')

    def alphanum_key(key):
        # Splitting on a capturing group yields text, digits, text, ... so the
        # digit runs always sit at the odd indices. Converting by position
        # (rather than by str.isdigit) keeps every key type-aligned and avoids
        # int() failing on non-ASCII digit characters such as '²'.
        return [
            int(chunk) if position % 2 else chunk.lower()
            for position, chunk in enumerate(digit_run.split(key))
        ]

    return sorted(l, key=alphanum_key)

#put the feature names in natural order so numeric parts compare as numbers (9 before 10)
final_cols = natural_sort(final_cols)

In [29]:
#id and target first, then the naturally sorted feature columns
column_order = ['id', 'target'] + final_cols
df = df[column_order]

In [36]:
# list every column that has a value outside the -1 to +1 range
out_of_range_cols = [name for name in df.columns
                     if df[name].max() > 1 or df[name].min() < -1]
for name in out_of_range_cols:
    print(name)

id


#### UnderSampling

In [41]:
#splitting data into training and test sets
#shuffle for train_test_split by default is true
#DataFrame.as_matrix()/Series.as_matrix() were removed in pandas 1.0; .values works on old and new versions.
#The explicit float64 cast keeps the feature matrix numeric even if the dummy columns are bool
#(newer pandas); with integer dummy columns the mixed frame should already convert to float64.
feature_matrix = df.drop(['id','target'], axis=1).astype(np.float64).values
target_vector = df['target'].values
training_features, x_test, training_target, y_test = train_test_split(feature_matrix,
                                                                      target_vector,
                                                                      test_size = .2,
                                                                      random_state=12)

In [42]:
#carve a validation set (10%) out of the training portion; the rest is the final train set
#train_test_split shuffles by default
x_train, x_val, y_train, y_val = train_test_split(
    training_features,
    training_target,
    test_size=.1,
    random_state=12,
)

In [44]:
#shapes of the train / validation / test feature matrices
print(x_train.shape, x_val.shape, x_test.shape)

(390139, 97) (43349, 97) (108372, 97)


In [45]:
#class counts of the target in the train / validation / test sets
#(Counter is imported with the other imports at the top of the notebook)
print(Counter(y_train),Counter(y_val),Counter(y_test))

Counter({0: 376305, 1: 13834}) Counter({0: 41794, 1: 1555}) Counter({0: 104611, 1: 3761})


In [None]:
#persist the three splits together with the feature order and the scaling statistics
payload = [(x_train, y_train), (x_val, y_val), (x_test, y_test), final_cols, cont_cols_summ]
with open('us_data.pickle', 'wb') as out_file:
    pickle.dump(payload, out_file)