In [100]:
import pandas as pd
import os
from time import time
from IPython.display import display # Allows the use of display() for DataFrames
import numpy as np
# Import supplementary visualization code visuals.py
#import visuals as vs
import tensorflow as tf
# TODO: Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score
# Pretty display for notebooks
%matplotlib inline

 


In [5]:
path_to_data= "/home/ubuntu/udacity/CodeGladiator/invesco/data"

train_file = "training_transaction_data.csv"
test_file = "test_transaction.csv"

In [6]:
train_df = pd.read_csv(os.path.join(path_to_data, train_file))
test_df = pd.read_csv(os.path.join(path_to_data, test_file))

In [10]:
column_list= ['AUM_investor_log','Counts_investor','Shares_investor_log','AUM_advisor_log','Shares_advisor_log','Rating','1 Yr % Rank','3 Yr % Rank','1 Yr Return','3 Yr Return','Net Flows','Morningstar_Category_Rating','Transaction_Type']


In [11]:
required_train_df = train_df.filter(column_list)
required_test_df = test_df.filter(column_list)

In [21]:
required_test_df['Rating'] = required_test_df['Rating'].astype(float)

In [22]:
required_test_df=required_test_df.fillna(required_test_df.median())

In [24]:
transaction_type = required_train_df['Transaction_Type']
features_raw = required_train_df.drop('Transaction_Type', axis = 1)
test_raw = required_test_df

In [51]:
transaction_type.head()

0    P
1    P
2    P
3    P
4    P
Name: Transaction_Type, dtype: object

In [25]:
from sklearn.preprocessing import LabelEncoder
var_mod = column_list
var_mod.remove('Transaction_Type')
le = LabelEncoder()
for i in var_mod:
    print(i)
    features_raw[i] = le.fit_transform(features_raw[i])
    test_raw[i] = le.fit_transform(test_raw[i])

AUM_investor_log
Counts_investor
Shares_investor_log
AUM_advisor_log
Shares_advisor_log
Rating
1 Yr % Rank
3 Yr % Rank
1 Yr Return
3 Yr Return
Net Flows
Morningstar_Category_Rating


In [52]:
transaction_type = transaction_type.replace(['P','R'],[0,1])

In [104]:



train_x= features_raw.as_matrix()
test_x = test_raw.as_matrix()

In [105]:
split_size = int(train_x.shape[0]*0.8)

train_x, val_x = train_x[:split_size], train_x[split_size:]
train_y, val_y = transaction_type.values[:split_size], transaction_type.values[split_size:]

### Training on neural net

In [106]:
#helper functions
def dense_to_one_hot(labels_dense, num_classes=2):
    """Convert class labels from scalars to one-hot vectors"""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    
    return labels_one_hot

def preproc(unclean_batch_x):
    """Convert values to range 0-1"""
    temp_batch = unclean_batch_x / unclean_batch_x.max()
    
    return temp_batch

def batch_creator(batch_size, dataset_length, dataset_name):
    """Create batch with random samples and return appropriate format"""
    batch_mask = rng.choice(dataset_length, batch_size)
    
    batch_x = eval(dataset_name + '_x')[[batch_mask]] #.reshape(-1, input_num_units)
    batch_x = preproc(batch_x)
    
    #if dataset_name == 'transaction_type':
    batch_y = eval('transaction_type').ix[batch_mask].values
    batch_y = dense_to_one_hot(batch_y)
        
    return batch_x, batch_y

In [116]:
# To stop potential randomness
seed = 128
rng = np.random.RandomState(seed)
### set all variables

# number of neurons in each layer
input_num_units = 12
hidden_num_units = 25
output_num_units = 2

# define placeholders
x = tf.placeholder(tf.float32, [None, input_num_units])
y = tf.placeholder(tf.float32, [None, output_num_units])

# set remaining variables
epochs = 6
batch_size = 300
learning_rate = 0.1

### define weights and biases of the neural network

weights = {
    'hidden': tf.Variable(tf.truncated_normal([input_num_units, hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.zeros([hidden_num_units, output_num_units]))
}

biases = {
    'hidden': tf.Variable(tf.truncated_normal([hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.zeros([output_num_units]))
}

hidden_layer = tf.add(tf.matmul(x, weights['hidden']), biases['hidden'])
hidden_layer = tf.nn.relu(hidden_layer)

output_layer = tf.matmul(hidden_layer, weights['output']) + biases['output']

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output_layer, y))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

init = tf.global_variables_initializer()



In [131]:
with tf.Session() as sess:
    # create initialized variables
    sess.run(init)
    
    ### for each epoch, do:
    ###   for each batch, do:
    ###     create pre-processed batch
    ###     run optimizer by feeding batch
    ###     find cost and reiterate to minimize
    
    for epoch in range(epochs):
        avg_cost = 0
        total_batch = int(features_raw.shape[0]/batch_size)
        for i in range(total_batch):
            batch_x, batch_y = batch_creator(batch_size, train_x.shape[0], 'train')
            _, c = sess.run([optimizer, cost], feed_dict = {x: batch_x, y: batch_y})
            
            avg_cost += c / total_batch
            
        print("Epoch:", (epoch+1), "cost =", "{:.5f}".format(avg_cost))
    
    print ("\nTraining complete!")
    
    
    # find predictions on val set
    pred_temp = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(pred_temp, "float"))
    #print ("Validation Accuracy:", accuracy.eval({x: val_x.reshape(-1, input_num_units), y: dense_to_one_hot(val_y)}))
    print ("Validation Accuracy:", accuracy.eval({x: val_x.reshape(-1, input_num_units), y: dense_to_one_hot(val_y)}))
    
    predict = tf.argmax(output_layer, 1)
    #pred = predict.eval({x: test_x.reshape(-1, input_num_units)})
    pred = predict.eval({x: train_x})
    
    print ("F-score on training data: {:.4f}".format(fbeta_score(train_y, pred, beta = 0.5)))
    pred = predict.eval({x: val_x})
    print ("F-score on testing data: {:.4f}".format(fbeta_score(val_y, pred, beta = 0.5)))

    
    

Epoch: 1 cost = 0.66109
Epoch: 2 cost = 0.64543
Epoch: 3 cost = 0.64164
Epoch: 4 cost = 0.64107
Epoch: 5 cost = 0.63679
Epoch: 6 cost = 0.64083
Epoch: 7 cost = 0.63403
Epoch: 8 cost = 0.63510

Training complete!
Validation Accuracy: 0.633002
F-score on training data: 0.6808
F-score on testing data: 0.6844


In [129]:
# set remaining variables
epochs = 8
batch_size = 300
learning_rate = 0.1

array([1, 1, 1, ..., 1, 1, 1])