In [1]:
import re
import pandas as pd
import numpy as np

#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.utils import shuffle

import pickle

In [2]:
# sorts strings properly
def natural_sort(l):
    '''
    Returns a new list with the strings of l in natural order: runs of the digits
    0-9 are compared as integers and all other text is compared case-insensitively,
    so 'x2' sorts before 'x10'.

    l: iterable of strings
    '''
    def alphanum_key(key):
        # re.split with one capture group alternates text / digit-run chunks, so the
        # digit runs are exactly the odd positions. Deciding by position instead of
        # str.isdigit() keeps chunks such as '²' (isdigit() is True, int() fails)
        # as text and keeps every key's element types aligned for comparison.
        chunks = re.split('([0-9]+)', key)
        return [int(chunk) if pos % 2 else chunk.lower()
                for pos, chunk in enumerate(chunks)]

    return sorted(l, key=alphanum_key)

In [3]:
class data:
    '''
    data class has the dataframe, keeps track of its bin, cat, ord etc. col names and
    has summary of cont. and ord. cols

    Attributes:
        df: dataframe read from the csv file
        float_cols: names of the float64 columns
        cat_col_names: names of the columns containing '_cat'
        bin_col_names: names of the columns containing '_bin'
        ord_col_names: every other column except 'id' and 'target'
        column_summary: {col: {'median', 'mean', 'max', 'min', 'range'}} for the
            float and ord columns, computed once in __init__ (it is not refreshed
            when df is modified later)
    '''
    def __init__(self, filename, missing_value=None):
        '''
        filename: path of csv file to be read
        missing_value: optional sentinel that is left out of the column_summary
            statistics; with the default None only NaN values are left out.

        NOTE(review): the pipeline function in this notebook treats -1 as the missing
        marker, but the csv is read as is, so with missing_value=None any -1 entries
        take part in the median/mean/min/range - confirm whether that is intended.
        '''
        self.df = pd.read_csv(filename)

        #don't use cols_ anywhere except in init
        self.cols_ = self.df.columns.tolist()

        self.float_cols = self.df.select_dtypes(include=['float64']).columns.tolist()
        #cat and bin column names
        self.cat_col_names = [col for col in self.cols_ if '_cat' in col]
        self.bin_col_names = [col for col in self.cols_ if '_bin' in col]

        #ord columns are whatever is left after the groups above and id/target
        grouped = set(self.cat_col_names) | set(self.bin_col_names) | set(self.float_cols) | {'id', 'target'}
        self.ord_col_names = [col for col in self.cols_ if col not in grouped]

        # integer columns and float columns summary (mean, median, min, max)
        self.column_summary = {}
        for col in self.float_cols + self.ord_col_names:
            values = self.df[col].dropna()
            if missing_value is not None:
                values = values[values != missing_value]
            t_d = {}
            t_d['median'] = values.median()
            t_d['mean'] = values.mean()
            t_d['max'] = values.max()
            t_d['min'] = values.min()
            t_d['range'] = t_d['max'] - t_d['min']
            self.column_summary[col] = t_d

    def remove_cols(self, col_name_list_):
        '''
        removes the columns from dataframe and their names from every column-name list
        col_name_list_: is a list of columns to be removed

        Raises KeyError (from DataFrame.drop) if a column is not in the dataframe.
        '''
        col_names = list(col_name_list_)
        self.df = self.df.drop(columns=col_names)
        # A column can sit in two lists at once (e.g. a float64 '_cat' column is in
        # both cat_col_names and float_cols), so every list is checked instead of
        # stopping at the first match.
        for names in (self.cat_col_names, self.float_cols, self.bin_col_names, self.ord_col_names):
            for col_name in col_names:
                if col_name in names:
                    names.remove(col_name)

    def sort_df(self):
        '''
        reorders the dataframe columns by natural_sort of their names
        '''
        ordered_cols = natural_sort(self.df.columns.tolist())
        self.df = self.df[ordered_cols]

    def get_dummies(self):
        '''
        replaces the cat columns with dummy columns prefixed by the original column name
        '''
        self.df = pd.get_dummies(self.df, columns=self.cat_col_names, prefix=self.cat_col_names)

In [5]:
# Create the training data object before pipeline() is called:
# pipeline reads the column summary of this global `train`.
train = data(filename='train.csv')

In [4]:
def pipeline(temp_data, dummy_var=False, reference=None):
    '''
    Drops a fixed list of columns, fills and rescales the float/ord columns and
    optionally creates dummy variables; temp_data is modified in place.

    temp_data: it should be a data object
    dummy_var: if True, temp_data.get_dummies() is called after the rescaling
    reference: data object whose column_summary supplies the median/min/range used
               for filling and scaling; defaults to the notebook-level `train`, so
               `train` has to exist when this argument is omitted
    '''
    if reference is None:
        reference = train
    summary = reference.column_summary

    drop_cols = ['ps_car_03_cat','ps_car_05_cat', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin','ps_ind_13_bin',
                'ps_car_10_cat', 'ps_ind_14']
    temp_data.remove_cols(drop_cols)

    # I'm treating ord cols as cont. variables for this problem.
    # -1 (meaning nan for this problem) is replaced with the reference median, then
    # each column is scaled as (value - min) / range with the reference statistics.
    # for categorical variables we will make all the dummy variables zero
    for col in temp_data.float_cols + temp_data.ord_col_names:
        stats = summary[col]
        # Assign the result back instead of calling replace(..., inplace=True) on
        # df[col]: the in-place call on a selected column does not update the
        # dataframe when pandas Copy-on-Write is active.
        filled = temp_data.df[col].replace(-1, stats['median'])
        temp_data.df[col] = (filled - stats['min']) / stats['range']
    if dummy_var:
        temp_data.get_dummies()
    temp_data.sort_df()

In [6]:
# Run the preprocessing on the training data itself, including the dummy variables.
pipeline(temp_data=train, dummy_var=True)

In [7]:
# Separate the processed rows by label: df0 holds target == 0, df1 holds target == 1.
df0 = train.df.loc[train.df['target'] == 0]

df1 = train.df.loc[train.df['target'] == 1]

In [10]:
# Feature matrices (everything except id/target) and label vectors per class.
# DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values returns the same
# ndarray. The float64 cast keeps the feature matrices numeric on pandas versions
# where get_dummies produces bool columns (a float/bool mix would otherwise come
# back as an object array); copy=False avoids a copy when it is already float64.
x0 = df0.drop(['id','target'], axis=1).values.astype(np.float64, copy=False)
y0 = df0['target'].values

x1 = df1.drop(['id','target'], axis=1).values.astype(np.float64, copy=False)
y1 = df1['target'].values

In [11]:
# Keep 90% of the target-0 rows for training; the other 10% are held out
# and divided into validation/test parts in a later cell.
x_train, x_temp0, y_train, y_temp0 = train_test_split(
    x0, y0,
    test_size=0.1,
    random_state=42,
)

In [12]:
# Shapes of the training features and of the held-out target-0 features/labels.
print(x_train.shape, x_temp0.shape, y_temp0.shape)

(516166, 213) (57352, 213) (57352,)


In [13]:
# Divide the held-out target-0 rows: 70% validation, 30% test.
x_val0, x_test0, y_val0, y_test0 = train_test_split(
    x_temp0, y_temp0,
    test_size=0.3,
    random_state=42,
)

In [14]:
# Divide all target-1 rows the same way: 70% validation, 30% test.
x_val1, x_test1, y_val1, y_test1 = train_test_split(
    x1, y1,
    test_size=0.3,
    random_state=42,
)

In [17]:
# The test set is the target-0 test rows stacked on top of the target-1 test rows.
x_test = np.vstack((x_test0, x_test1))
y_test = np.hstack((y_test0, y_test1))

# Shuffle features and labels together; the fixed seed (same value as the splits
# above) makes the row order, and therefore the pickled test set, reproducible.
x_test, y_test = shuffle(x_test, y_test, random_state=42)

In [18]:
# Shapes of every array that goes into the pickle.
# NOTE(review): the recorded output of this cell lists x_train with 23715 rows while
# the earlier shape printout lists 516166, and no visible cell reassigns x_train -
# the saved outputs look stale; re-run from a fresh kernel to confirm.
print(
    x_train.shape, y_train.shape,
    x_val0.shape, y_val0.shape,
    x_val1.shape, y_val1.shape,
    x_test.shape, y_test.shape,
)

(23715, 213) (23715,) (40146, 213) (40146,) (15185, 213) (15185,) (23715, 213) (23715,)


In [8]:
'''
#stratified shuffle split will split data into train, test and validation sets with the same
#ratio of target classes
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.4, random_state=0)
a = sss.split(x_vt, y_vt)
for train_index, test_index in a:
    x_val, x_test = x_vt[train_index], x_vt[test_index]
    y_val, y_test = y_vt[train_index], y_vt[test_index] 
'''

In [None]:
# Persist the splits in a fixed order: train, target-0 validation, target-1 validation, test.
# The list gets its own name so that it does not rebind `data`, which is the name of
# the class defined earlier in this notebook (data(...) would stop working afterwards).
splits = [(x_train, y_train), (x_val0, y_val0), (x_val1, y_val1), (x_test, y_test)]
with open('data.pickle', 'wb') as handle:
    pickle.dump(splits, handle)