In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [2]:
import tensorflow as tf

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the three pre-split, pre-processed partitions of the labelled data.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        'microdata_processed/train_train.csv',
        'microdata_processed/train_test_set.csv',
        'microdata_processed/train_validation.csv',
    )
)

In [5]:
# Stack the three partitions row-wise into a single frame.
train_full = pd.concat([train, test, validation], axis=0)

In [6]:
# Sanity check: (rows, columns) of the combined frame.
train_full.shape

(51864, 195)

In [7]:
# Feature columns = every column of the training frame except the saved
# row index, the case identifier and the target itself.
x_col = train.columns.tolist()
for non_feature in ('Unnamed: 0', 'CASEID', 'morethan60kyr'):
    x_col.remove(non_feature)

# Split each labelled partition into its feature matrix and target vector.
train_x, train_y = train[x_col], train['morethan60kyr']
test_x, test_y = test[x_col], test['morethan60kyr']

# Number of training rows; drives the mini-batch loop further down.
n_train = len(train_x)

In [8]:
# Rank the features with a random forest. `imp_arg` holds positions into
# `x_col`, most important first, and fixes the column order used for every
# frame that is later fed to the network.
#
# random_state is pinned so the ranking (and therefore the network's input
# layout) is the same on every run. The forest is fitted on frames sliced
# straight from `train`, so re-running this cell gives the same result even
# after train_x / train_y have been reordered or shuffled by later cells.
rf = RandomForestClassifier(n_estimators=30, n_jobs=-1, min_samples_split=5,
                            max_features=50, random_state=0)
rf.fit(train[x_col], train['morethan60kyr'])

# Accuracy on the data the forest was fitted on; kept in a variable instead of
# being computed and thrown away.
rf_train_accuracy = rf.score(train[x_col], train['morethan60kyr'])
#rf.score(test_x,test_y)
importance = rf.feature_importances_

# argsort is ascending, so reverse it to get most-important-first.
imp_arg = np.argsort(importance)[::-1]

# Display the positions of the top_k features. Note that the full ranking,
# not just these top_k entries, is what the following cells apply.
top_k = 100
imp_arg[:top_k]

array([  1,   3,   0, 161,   2, 158,  44,  45,  42, 179,  40,  99,  79,
        27, 127, 149,  85, 125,  35,  46,  22, 175, 124, 148, 169, 140,
        53,  69, 147, 131, 162, 151, 121, 114,  54,  58,  57, 115,  47,
        61, 119, 141,  17, 142,  48, 145, 123, 177,  64,  28, 156, 118,
        72, 116,  29,  71,  60, 187,  62, 120,  63, 181,  83, 122,  37,
        39,  15, 184,  51,  68, 154,  23,  86, 165,  50, 143,  65,   5,
        91,  55, 182,  74, 135, 112, 174, 163,  78, 146,   9,   6,  75,
       153, 191,  30,  59,  18, 129, 138,  76,  34])

In [9]:
# Reorder the training columns by importance. `imp_arg` holds positions into
# `x_col` (the column order the forest was fitted on), so translate them to
# column names first: selecting by name does not depend on pandas' legacy
# "integer key on a string index means position" fallback, and re-running
# this cell leaves the frame unchanged instead of permuting it again.
train_x = train_x[np.asarray(x_col)[imp_arg].tolist()]

In [None]:
#test_x = test_x[imp_arg]

In [None]:
# normalize
# x_min =  np.min(train_x,axis=0)
# x_range = np.max(train_x,axis=0)-x_min
# x_range = np.max(test_x,axis=0)-np.min(test_x,axis=0)
# train_x = (train_x)/x_range
# test_x = (test_x)/x_range

In [10]:
class FFNN(object):
    """Feed-forward softmax classifier graph with ReLU hidden layers and dropout.

    Parameters
    ----------
    layer_size : sequence of int
        Width of each hidden layer, in order from input to output. Any number
        of layers is supported; an empty sequence connects the inputs directly
        to the output layer.
    n_feat : int
        Number of input features (second dimension of the ``x`` placeholder).
    n_class : int
        Number of classes (second dimension of the ``y`` placeholder).

    Attributes
    ----------
    x, y : float32 placeholders of shape (None, n_feat) and (None, n_class).
    keep_prob : float32 placeholder passed to ``tf.nn.dropout`` in every
        hidden layer.
    prediction : argmax over the output scores.
    probability : softmax of the output scores.
    loss : mean softmax cross-entropy between the scores and ``y``.
    accuracy : mean of (prediction == argmax(y)).
    """

    def __init__(self, layer_size, n_feat, n_class):
        self.x = tf.placeholder(tf.float32, shape=(None, n_feat), name='x')
        self.y = tf.placeholder(tf.float32, shape=(None, n_class), name='y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        def layer(input_size, output_size, input_x, layer_num):
            """Build one ReLU + dropout layer; return (W, b, dropped-out activations)."""
            with tf.name_scope('hidden-layer_{}'.format(layer_num)):
                W = tf.get_variable(
                    "W_{}".format(layer_num),
                    shape=[input_size, output_size],
                    initializer=tf.contrib.layers.xavier_initializer())
                b = tf.Variable(tf.zeros([output_size]), name='b_{}'.format(layer_num))
                h = tf.nn.relu(tf.nn.xw_plus_b(input_x, W, b, name='h_{}'.format(layer_num)))
                h_drop = tf.nn.dropout(h, self.keep_prob)
                return W, b, h_drop

        # Chain the hidden layers: each layer's input width is the previous
        # layer's output width (n_feat for the first one). Numbering starts at
        # 1, so a single-layer network keeps the variable names W_1 / b_1.
        h = self.x
        prev_size = n_feat
        for layer_num, width in enumerate(layer_size, 1):
            W, b, h = layer(prev_size, width, h, layer_num)
            prev_size = width

        with tf.name_scope('output'):
            W = tf.get_variable(
                "output/W",
                shape=[prev_size, n_class],
                initializer=tf.contrib.layers.xavier_initializer())
            # The name= argument is attached to the constant initializer, not
            # to the Variable; left untouched so the variable keeps the name
            # it has always had.
            b = tf.Variable(tf.constant(0.1, shape=[n_class], name="output/b"))
            scores = tf.nn.xw_plus_b(h, W, b, name="scores")
            self.prediction = tf.argmax(scores, 1, name="predictions")
            self.probability = tf.nn.softmax(scores)

        with tf.name_scope('loss'):
            # Keyword arguments make the logits/labels roles explicit and do
            # not depend on the positional order of the signature.
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=scores, labels=self.y)
            self.loss = tf.reduce_mean(losses)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")


In [11]:
# Session used for all graph execution below; later cells call `.eval()` on
# tensors without passing a session explicitly.
sess= tf.InteractiveSession()

In [12]:
# One hidden layer of 50 units; input width follows the training frame's
# column count; 2 output classes.
nn = FFNN([50], train_x.shape[1],2)

In [13]:
# Step counter incremented by the optimizer on every update; not trainable.
global_step = tf.Variable(0, trainable=False, name="global_step")

# Learning rate starts at 0.001 and is multiplied by 0.96 every 7000 steps
# (staircase=True: applied in discrete drops).
learning_rate = tf.train.exponential_decay(
    learning_rate=0.001,
    global_step=global_step,
    decay_steps=7000,
    decay_rate=0.96,
    staircase=True)

# Adam on the network loss; gradients are computed and applied as two
# separate ops so `grads_and_vars` stays available for inspection.
optimizer = tf.train.AdamOptimizer(learning_rate)
grads_and_vars = optimizer.compute_gradients(nn.loss)
train_step = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

In [14]:
# Initialise all graph variables before the first training step.
sess.run(tf.initialize_all_variables())

In [15]:
# Rows per mini-batch.
batch_size = 100
# Training budget: 50 times the number of full mini-batches in the training set.
train_steps = 50 * np.floor(n_train / batch_size)

In [16]:
# Mini-batch counter read and incremented by the training loop below;
# re-run this cell to restart the count from zero.
step = 0

In [17]:
def to_one_hot(y, n_class=2):
    """Encode integer class labels as a one-hot float matrix.

    Parameters
    ----------
    y : array-like
        1-D collection of class labels (list, NumPy array, pandas Series, ...).
        Boolean labels are accepted and treated as 0/1.
    n_class : int, optional
        Number of classes, i.e. the number of columns of the result.
        Defaults to 2, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(y), n_class) with a single 1.0 per row, placed in
        the column given by that row's label.

    Raises
    ------
    IndexError
        If any label lies outside ``[0, n_class)``.
    """
    # Coerce to a plain integer vector: this accepts lists, drops any pandas
    # index, and turns booleans into 0/1 so they act as column positions
    # rather than as a mask.
    labels = np.asarray(y).astype(int).ravel()
    n_rows = labels.shape[0]

    # Negative labels would otherwise wrap around to the last columns silently.
    if n_rows and (labels.min() < 0 or labels.max() >= n_class):
        raise IndexError(
            "labels must lie in [0, {}); got values in [{}, {}]".format(
                n_class, labels.min(), labels.max()))

    yy = np.zeros((n_rows, n_class))
    yy[np.arange(n_rows), labels] = 1
    return yy

In [18]:
def test_acc_all():
    """Return the network's accuracy on the whole labelled hold-out split.

    Reads the notebook globals ``nn``, ``train_x``, ``test_x`` and ``test_y``
    and evaluates in a single feed with dropout disabled (keep_prob = 1).

    The hold-out columns are selected by name in the training frame's column
    order. ``train_x`` is reordered by feature importance before the network
    is built, so feeding ``test_x`` in its original order would pair each
    input unit with the wrong feature. Selecting by name gives the right
    layout whether or not ``test_x`` has already been reordered.
    """
    aligned_x = test_x[list(train_x.columns)]
    test_yy = to_one_hot(test_y)
    feed_dict = {nn.x: aligned_x, nn.y: test_yy, nn.keep_prob: 1}
    return nn.accuracy.eval(feed_dict=feed_dict)

In [19]:
# Each pass of the outer loop is one sweep over the training rows in
# mini-batches, followed by a joint reshuffle of features and labels.
# The step budget is only checked between sweeps.
while step < train_steps:
    for i in range(0, n_train, batch_size):
        batch_x = train_x[i:i + batch_size]
        batch_y = to_one_hot(train_y[i:i + batch_size])

        # One optimizer update with dropout active (keep half the units).
        sess.run(train_step,
                 feed_dict={nn.x: batch_x, nn.y: batch_y, nn.keep_prob: 0.5})

        # Every 1000 steps (except step 0), report loss and accuracy on the
        # batch just trained on, with dropout switched off.
        if step != 0 and step % 1000 == 0:
            feed_dict = {nn.x: batch_x, nn.y: batch_y, nn.keep_prob: 1}
            train_loss, train_acc = sess.run([nn.loss, nn.accuracy],
                                             feed_dict=feed_dict)
            # Append  + ' test acc ' + str(test_acc_all())  to also report
            # hold-out accuracy.
            print('step:' + str(step)
                  + ", train loss: " + str(train_loss)
                  + ", train accuracy: " + str(train_acc))
        step += 1
    train_x, train_y = shuffle(train_x, train_y)

step:1000, train loss: 0.619397, train accuracy: 0.69
step:2000, train loss: 0.560679, train accuracy: 0.74
step:3000, train loss: 0.419442, train accuracy: 0.85
step:4000, train loss: 0.493257, train accuracy: 0.78
step:5000, train loss: 0.590729, train accuracy: 0.75
step:6000, train loss: 0.470538, train accuracy: 0.77
step:7000, train loss: 0.528445, train accuracy: 0.81
step:8000, train loss: 0.526179, train accuracy: 0.77
step:9000, train loss: 0.477775, train accuracy: 0.76
step:10000, train loss: 0.519046, train accuracy: 0.77
step:11000, train loss: 0.455281, train accuracy: 0.8
step:12000, train loss: 0.452798, train accuracy: 0.79
step:13000, train loss: 0.429788, train accuracy: 0.76
step:14000, train loss: 0.443374, train accuracy: 0.77
step:15000, train loss: 0.585689, train accuracy: 0.73
step:16000, train loss: 0.445786, train accuracy: 0.81
step:17000, train loss: 0.473191, train accuracy: 0.78
step:18000, train loss: 0.47985, train accuracy: 0.82
step:19000, train los

In [None]:
# Saver object used by the next cell to checkpoint the session's variables.
saver = tf.train.Saver()

In [None]:
# Write a checkpoint under the prefix 'nn_h_50', tagged with the number of
# training steps run so far.
saver.save(sess, 'nn_h_50', global_step=step)

In [None]:
# Check the validation set.
# NOTE(review): this re-reads the same file that was already loaded into
# `validation` in the data-loading cell at the top of the notebook.
validation = pd.read_csv('microdata_processed/train_validation.csv')

In [None]:
# Build the target and feature frames from the validation split. These were
# previously sliced from `train`, which made the "validation" accuracy below
# a second measurement of training accuracy.
validation_y = validation['morethan60kyr']
validation_x = validation[x_col]

In [None]:
# Put the validation columns in the importance-ranked order. `imp_arg` holds
# positions into `x_col`, so translate them to column names and select by
# name: this avoids integer keys being treated as labels, and re-running
# the cell does not permute the frame a second time.
validation_x = validation_x[np.asarray(x_col)[imp_arg].tolist()]
validation_yy = to_one_hot(validation_y)

# Accuracy over the whole frame with dropout disabled.
feed_dict = {nn.x: validation_x, nn.y: validation_yy, nn.keep_prob: 1}
nn.accuracy.eval(feed_dict=feed_dict)

In [None]:
# Accuracy on the labelled hold-out split, evaluated in one feed with dropout off.
test_acc_all()

In [22]:
# Load the file to be scored.
# NOTE(review): this rebinds `test`; from here on it is the frame read from
# test.csv, not the hold-out split loaded at the top of the notebook.
test=pd.read_csv('microdata_processed/test.csv')

In [29]:
# Index the rows by CASEID. Guarded so that re-running this cell is a no-op
# rather than a KeyError once CASEID has already been moved into the index.
if 'CASEID' in test.columns:
    test = test.set_index('CASEID')

In [47]:
# Column names of the test frame at this point.
# NOTE(review): `test_col` is not referenced again in the visible cells.
test_col=list(test.columns)

In [50]:
# Add an all-zero 'cma_Hamilton' column (presumably a training feature that
# test.csv lacks -- confirm). The row count is taken from the frame itself
# so this cell does not depend on a variable defined in a later cell.
test['cma_Hamilton'] = np.zeros(test.shape[0])

In [51]:
# Add an all-zero 'cma_Halifax' column (presumably a training feature that
# test.csv lacks -- confirm). The row count is taken from the frame itself
# so this cell does not depend on a variable defined in a later cell.
test['cma_Halifax'] = np.zeros(test.shape[0])

In [39]:
# Number of rows in the test frame.
len_test=test.shape[0]

In [58]:
# Predict a class index for every row of the test file, with dropout off.
# `imp_arg` holds positions into `x_col`; translating them to column names
# and selecting by name feeds the columns in the order the network was
# trained on, without integer keys being interpreted as column labels.
test_y = nn.prediction.eval(
    feed_dict={nn.x: test[np.asarray(x_col)[imp_arg].tolist()], nn.keep_prob: 1})

In [64]:
# Store the predicted class indices as a column of the test frame.
test['morethan60kyr']=test_y

In [68]:
# Keep only the prediction column (a Series sharing the test frame's index).
preds =test['morethan60kyr']

In [71]:
# Wrap the Series in a one-column DataFrame.
pred = preds.to_frame()

In [72]:
# Write the predictions as stored (class indices).
# NOTE(review): the next cell writes to the same path and replaces this file.
pred.to_csv('test_prediction.csv')

In [76]:
# Overwrite the file with the predictions cast to booleans.
# NOTE(review): this writes a Series rather than a DataFrame; whether a header
# row is emitted may depend on the pandas version -- confirm the expected format.
pred['morethan60kyr'].astype('bool').to_csv('test_prediction.csv')