In [1]:
import pickle
import time
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from imblearn.over_sampling import SMOTE

from tensorflow.contrib.layers import flatten

In [2]:
# Load the raw training table from the CSV file (path is relative to the
# notebook's working directory).
train_csv = 'train.csv'
df = pd.read_csv(train_csv)

In [None]:
# These three columns are dominated by null values, so they are removed
# before any further processing.
null_heavy_cols = ['ps_reg_03', 'ps_car_03_cat', 'ps_car_05_cat']
df = df.drop(null_heavy_cols, axis='columns')

In [None]:
# Snapshot the float64 column names now: later operations may change column
# dtypes, and the normalisation step iterates over exactly this list.
# Keep it in sync by hand if any float column is dropped afterwards.
float_cols = df.select_dtypes(include=[np.float64]).columns

n_float_cols = len(float_cols)
print('Number of float columns: ', n_float_cols)

In [None]:
# A cell equal to -1 is treated as a null. Build the row mask once and use it
# for both halves of the split.
has_null = (df == -1).any(axis=1)

# Rows containing at least one null are parked in their own frame for later use.
df_any_null = df[has_null]
print('Number of rows with atleast one null value: ', len(df_any_null))

# From here on `df` holds only fully populated rows.
df = df[~has_null]

In [None]:
# Binary indicators that are dominated by a single level are dropped.
single_level_bin_cols = ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_13_bin']
df = df.drop(single_level_bin_cols, axis='columns')

In [None]:
# Categorical column that is dominated by a single level is dropped.
single_level_cat_cols = ['ps_car_10_cat']
df = df.drop(single_level_cat_cols, axis='columns')

In [None]:
# ps_ind_12_bin is correlated with ps_ind_14, so only the latter is kept.
correlated_cols = ['ps_ind_12_bin']
df = df.drop(correlated_cols, axis='columns')

In [None]:
# Every column whose name marks it as categorical.
col_names = [name for name in df.columns if '_cat' in name]

# Only categoricals with more than two distinct values are one-hot encoded;
# two-level ones are left as they are.
multi_level_cats = [name for name in col_names if len(df[name].unique()) > 2]

# get_dummies replaces each listed parent column with its indicator columns,
# each prefixed with the parent column's name.
df = pd.get_dummies(df, columns=multi_level_cats, prefix=multi_level_cats)

del multi_level_cats

In [None]:
# Mean-centre every float column and scale it by its (max - min) range.
# The per-column mean / max / min are kept in `fcols_summ` so the same
# transform can be applied to the test file later.
# NOTE(review): these statistics are computed on the whole frame, i.e. before
# the train/test split performed further down -- confirm that is intended.
fcols_summ = {}
for col in float_cols:
    column = df[col]
    stats = {'mean': column.mean(), 'max': column.max(), 'min': column.min()}
    df[col] = (column - stats['mean']) / (stats['max'] - stats['min'])
    fcols_summ[col] = stats

In [3]:
# Group the feature columns by family (ind, reg, car, calc) so that related
# features sit next to each other along the axis the convolutions slide over.
all_cols = list(df.columns)
ind_cols = [name for name in all_cols if 'ind' in name]
reg_cols = [name for name in all_cols if 'reg' in name]
car_cols = [name for name in all_cols if 'car' in name]
calc_cols = [name for name in all_cols if 'calc' in name]

cols_order = ind_cols + reg_cols + car_cols + calc_cols
print('No of Columns: ', len(cols_order))

# 'id' and 'target' lead, followed by the grouped feature columns.
df = df[['id', 'target'] + cols_order]

In [None]:
# Hold out 20% of the rows as a test set. train_test_split shuffles by
# default; the fixed random_state keeps the split reproducible.
feature_frame = df.drop(['id', 'target'], axis=1)
target_series = df['target']

(training_features, test_features,
 training_target, test_target) = train_test_split(feature_frame,
                                                  target_series,
                                                  test_size=.2,
                                                  random_state=12)

In [None]:
# Carve a 10% validation set out of the training portion (shuffled by
# default, reproducible through the fixed random_state).
validation_split = train_test_split(training_features,
                                    training_target,
                                    test_size=.1,
                                    random_state=12)
x_train_res, x_val, y_train_res, y_val = validation_split

In [None]:
# The target is imbalanced, so the minority class is oversampled with
# borderline-1 SMOTE. Only the training portion is resampled.
sm = SMOTE(random_state=12, ratio='minority', kind='borderline1')
x_train, y_train = sm.fit_sample(x_train_res, y_train_res)

# Features and labels must still line up one-to-one after resampling.
resampled_shape = np.shape(x_train)
assert resampled_shape[0] == len(y_train)

print('After SMOTE x type: ', type(x_train))
print('After SMOTE x shape: ', resampled_shape)

In [None]:
'''
Smote is for continuous variables but as we have both cont. and categorical var. we should use SMOTE-NC as described in the 
following papper: https://www.jair.org/media/953/live-953-2037-jair.pdf

Alternatively, I chose to round off the categorical variables to the nearest integer.
'''
cols = df.drop(['id','target'], axis=1).columns

# Positions (within the feature matrix) of every binary / categorical column.
cat_idx = [pos for pos, name in enumerate(cols)
           if ('_bin' in name) or ('_cat' in name)]

# SMOTE interpolates between samples, so synthetic rows can hold fractional
# values in these columns; snap them back to the nearest integer.
# np.round is applied to one column (axis 0 slice) at a time.
cat_block = x_train[:, cat_idx]
x_train[:, cat_idx] = np.apply_along_axis(np.round, axis=0, arr=cat_block)

In [5]:
# Sample counts for the (resampled) training set and the validation set.
n_train, n_validation = len(y_train), len(y_val)

# Test-set count is left disabled.
# NOTE(review): no `y_test` is defined in the visible cells; the held-out
# labels from the first split are named `test_target`.
#n_test = len(y_test)

In [15]:
# Training hyper-parameters: Adam step size, number of passes over the
# training data, and mini-batch size.
learning_rate, epochs, batch_size = 0.001, 1, 9000

In [8]:
# Convert the validation features / labels from pandas objects to plain
# NumPy arrays so they can be indexed with np.newaxis and fed to TensorFlow.
# np.asarray is used instead of .as_matrix(): as_matrix is deprecated and was
# removed in pandas 1.0, and np.asarray is also a no-op on an ndarray, so
# re-running this cell no longer raises AttributeError.
x_val = np.asarray(x_val)
y_val = np.asarray(y_val)

In [8]:
# Quick sanity check of the training matrix before it is reshaped.
train_type, train_shape = type(x_train), np.shape(x_train)
print('type of trainging set', train_type)
print('Initial shape of training set: ', train_shape)

<class 'numpy.ndarray'>
(90, 200)


In [21]:
def _add_height_and_channel_axes(arr):
    """Insert a length-1 axis after the first axis and another at the end.

    A 2-D (samples, features) array becomes (samples, 1, features, 1), which
    matches the [None, 1, 200, 1] input placeholder of the network.
    """
    return arr[:, np.newaxis, :, np.newaxis]


# Reshape both the training and the validation features for the conv layers.
x_train = _add_height_and_channel_axes(x_train)
print('New shape of training set: ', np.shape(x_train))

x_val = _add_height_and_channel_axes(x_val)

In [8]:
# Network inputs: a batch of 1 x 200 single-channel feature "images" and the
# matching integer class ids.
x = tf.placeholder(dtype=tf.float32, shape=[None, 1, 200, 1])
y_ = tf.placeholder(dtype=tf.int32, shape=[None])

# One-hot (two-class) encoding of the labels, used by the loss.
y = tf.one_hot(indices=y_, depth=2)

# The learning rate is fed at run time rather than baked into the graph.
l = tf.placeholder(dtype=tf.float32)

In [3]:
def conv_(x, wts, bias, stride=1, padding='VALID'):
    """Convolution + bias + ReLU.

    Args:
        x: input tensor passed to tf.nn.conv2d.
        wts: convolution filter tensor.
        bias: bias tensor added to the convolution output.
        stride: step along the third input axis; the other strides are 1.
        padding: padding mode forwarded to tf.nn.conv2d.

    Returns:
        The ReLU-activated result.
    """
    strides = [1, 1, stride, 1]
    convolved = tf.nn.conv2d(x, wts, strides=strides, padding=padding)
    return tf.nn.relu(tf.nn.bias_add(convolved, bias))

def NN_lay(x, wts, bias):
    """Fully connected layer: relu(x @ wts + bias).

    Args:
        x: input tensor (left operand of the matrix product).
        wts: weight matrix (right operand of the matrix product).
        bias: tensor added to the product.

    Returns:
        The ReLU-activated result.
    """
    pre_activation = tf.add(tf.matmul(x, wts), bias)
    return tf.nn.relu(pre_activation)

In [4]:
def _weights(shape):
    """Trainable tensor initialised from a truncated normal (mean 0, stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, mean=0, stddev=0.1))


def _biases(size):
    """Trainable bias vector of the given length, initialised to zeros."""
    return tf.Variable(tf.zeros([size]))


# Convolution filters are [height=1, width=3, in_channels, out_channels].
# Conv 1: 1 -> 8 channels (applied with stride 1 when the graph is built).
cnw_1 = _weights([1, 3, 1, 8])
cnb_1 = _biases(8)

# Conv 2: 8 -> 16 channels (applied with stride 2).
cnw_2 = _weights([1, 3, 8, 16])
cnb_2 = _biases(16)

# Conv 3: 16 -> 24 channels (applied with stride 2 in the graph-building cell).
cnw_3 = _weights([1, 3, 16, 24])
cnb_3 = _biases(24)

# Fully connected head: 96 -> 128 -> 64 -> 32 -> 2.
nnwts_1 = _weights([96, 128])
nnb_1 = _biases(128)

nnwts_2 = _weights([128, 64])
nnb_2 = _biases(64)

nnwts_3 = _weights([64, 32])
nnb_3 = _biases(32)

nnwts_4 = _weights([32, 2])
nnb_4 = _biases(2)

In [5]:
# Convolutional feature extractor. The widths in the comments are for the
# 200-wide input declared by the `x` placeholder.

# Stage 1: conv (stride 1) -> width 198, then 2-wide max-pool -> width 99.
stage_1 = conv_(x, cnw_1, cnb_1)
stage_1 = tf.nn.max_pool(stage_1, ksize=[1, 1, 2, 1], strides=[1, 1, 2, 1],
                         padding='VALID')

# Stage 2: conv (stride 2) -> width 49, zero-pad one position on each side of
# the width axis -> 51, then 3-wide max-pool -> width 17.
stage_2 = conv_(stage_1, cnw_2, cnb_2, stride=2, padding='VALID')
width_padding = tf.convert_to_tensor([[0, 0], [0, 0], [1, 1], [0, 0]])
stage_2 = tf.pad(stage_2, width_padding)
stage_2 = tf.nn.max_pool(stage_2, ksize=[1, 1, 3, 1], strides=[1, 1, 3, 1],
                         padding='VALID')

# Stage 3: conv (stride 2) -> width 8, then 2-wide max-pool -> width 4.
stage_3 = conv_(stage_2, cnw_3, cnb_3, stride=2, padding='VALID')
logits = tf.nn.max_pool(stage_3, ksize=[1, 1, 2, 1], strides=[1, 1, 2, 1],
                        padding='VALID')

In [6]:
# Collapse the conv output to one feature vector per sample.
logits = flatten(logits)

# Three ReLU-activated hidden layers, applied in order.
for hidden_wts, hidden_bias in ((nnwts_1, nnb_1),
                                (nnwts_2, nnb_2),
                                (nnwts_3, nnb_3)):
    logits = NN_lay(logits, hidden_wts, hidden_bias)

# Output layer: raw two-class scores, no activation.
logits = tf.add(tf.matmul(logits, nnwts_4), nnb_4)

In [9]:
# Objective: softmax cross-entropy against the one-hot labels, averaged over
# the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y,
                                                           logits=logits)
cost = tf.reduce_mean(per_example_loss)

# Adam, with the step size supplied through the `l` placeholder at run time.
adam = tf.train.AdamOptimizer(learning_rate=l)
optimizer = adam.minimize(cost)

In [10]:
# Precision / recall of the positive class (label 1).
#
# The metrics are computed from the integer class ids (`y_`) and the arg-max
# of the logits. Feeding the one-hot labels and the rounded softmax instead
# makes every correct sample count as one true positive and every wrong one
# as one false positive plus one false negative, so both numbers collapse to
# plain accuracy.
#
# Each tf.metrics.* call returns a (value, update_op) pair and keeps its
# counters in local variables, which must be initialised with
# tf.local_variables_initializer() before they are evaluated.
predicted_class = tf.argmax(logits, axis=1)
rec = tf.metrics.recall(labels=y_, predictions=predicted_class)
prec = tf.metrics.precision(labels=y_, predictions=predicted_class)

In [11]:
# Single op that initialises every variable the session needs.
# tf.metrics.precision / tf.metrics.recall store their counters in LOCAL
# variables, which global_variables_initializer() alone does not cover;
# evaluating the metrics would otherwise fail on an uninitialised value.
init = tf.group(tf.global_variables_initializer(),
                tf.local_variables_initializer())

# Checkpoint prefix and saver used to persist the trained weights.
# NOTE(review): the 'New_wts' directory is presumably expected to exist
# before saving -- confirm for the TensorFlow version in use.
save_file = 'New_wts/model'
saver = tf.train.Saver()

In [25]:
# Train with mini-batch Adam, report the training loss and the validation
# precision / recall every 10 batches, then save the weights.
with tf.Session() as sess:
    sess.run(init)

    # tf.metrics.* accumulate their counts in local variables. Build the
    # reset op once (so the graph does not grow inside the loop) and run it
    # before each evaluation: this both initialises the counters and makes
    # every report reflect only the current pass over the validation set.
    reset_metrics = tf.local_variables_initializer()
    val_feed = {x: x_val, y_: y_val}

    n_samples = len(y_train)
    # Ceiling division: a trailing partial batch -- or a training set smaller
    # than batch_size -- is still trained on instead of being skipped.
    no_of_batches = -(-n_samples // batch_size)
    print('No of batches: ', no_of_batches)

    for epoch in range(epochs):
        strt_tym = time.time()
        # New random order every epoch; each sample is visited exactly once.
        # Indexing through a permutation avoids copying the whole training set.
        order = np.random.permutation(n_samples)
        for offset in range(no_of_batches):
            idx = order[offset * batch_size:(offset + 1) * batch_size]
            batch_x, batch_y = x_train[idx], y_train[idx]

            # One run call performs the update and fetches the batch loss,
            # instead of a second forward pass just to compute the loss.
            _, loss = sess.run([optimizer, cost],
                               feed_dict={x: batch_x, y_: batch_y, l: learning_rate})

            # Report on every 10th batch (offset 0, 10, 20, ...).
            if offset % 10 == 0:
                sess.run(reset_metrics)
                # prec / rec are (value, update_op) pairs; running the update
                # op folds in the validation batch and returns the metric.
                val_prec, val_rec = sess.run([prec[1], rec[1]], feed_dict=val_feed)
                print('------------------------------------------------')
                print('Offset No: ', offset)
                print('Training loss: ', loss)
                print('Precision on Validation Set: ', val_prec)
                print('Recall on Validation Set: ', val_rec)
        print('Epoch ', epoch, ' took (s): ', time.time() - strt_tym)
    saver.save(sess, save_file)
    

In [21]:
# Scratch timing: record the current wall-clock time as the start mark.
t1= time.time()

In [22]:
# Scratch timing: record the current wall-clock time as the end mark.
t2= time.time()

In [23]:
# Seconds elapsed between the two timestamps; shown as the cell's output.
t2-t1

6.719599962234497