
# Neural Network implementation

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import train_test_split
import pandas as pd
import time

In [13]:
start_read = time.time()
df = pd.read_csv("./feature_engineered_data.csv", delimiter='\t')
print(time.time() - start_read)
df

9.672500133514404


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ip,app,device,os,channel,click_time,attributed_time,is_attributed,...,num_clicks_per_app,attribution_ratio_ip,attribution_ratio_ip_os_device,attribution_ratio_app,attribution_ratio_ip_app,attribution_ratio_app_channel,num_clicks_in_hour_for_ip,num_clicks_in_hour_for_ip_os_device,num_clicks_in_hour_for_ip_app_os_device,num_clicks_in_hour_for_ip_app
0,6227,75117456,115621,14,1,13,379,2017-11-08 01:52:06,,0,...,27241,0.363636,0.125000,0.091957,0.000000,0.081726,3,1,0,0
1,769481,127039611,161439,3,1,13,280,2017-11-08 19:33:44,2017-11-09 07:37:29,1,...,94283,0.212121,0.250000,0.108832,0.300000,0.084639,3,1,1,2
2,456257,179969293,50113,9,1,20,107,2017-11-09 14:25:53,,0,...,59518,0.375000,0.000000,0.316257,0.333333,0.315878,2,0,0,1
3,187574,160317072,359830,3,1,19,130,2017-11-09 08:24:28,,0,...,94283,0.400000,0.333333,0.108832,0.000000,0.098657,2,2,1,1
4,300849,17146659,64394,2,1,14,258,2017-11-07 02:17:32,,0,...,58783,0.562500,0.000000,0.096303,0.000000,0.084325,4,0,0,2
5,433011,77198428,173212,3,1,19,115,2017-11-08 02:29:47,,0,...,94283,0.166667,0.000000,0.108832,0.000000,0.048675,2,1,2,2
6,642794,75704051,231891,10,1,30,113,2017-11-08 02:03:57,2017-11-08 02:04:30,1,...,36245,1.000000,1.000000,0.954007,1.000000,0.987323,0,0,0,0
7,417764,39430779,49602,15,1,19,245,2017-11-07 09:28:03,,0,...,42897,0.320233,0.379808,0.079679,0.055556,0.044109,15,7,1,2
8,756726,119478198,62379,19,0,24,347,2017-11-08 15:11:11,2017-11-08 15:11:59,1,...,134316,0.370370,1.000000,0.985661,1.000000,0.965812,2,1,1,1
9,711357,99301340,31498,19,0,29,213,2017-11-08 09:24:12,2017-11-08 09:36:01,1,...,134316,0.588235,1.000000,0.985661,1.000000,0.989568,3,2,2,2


In [18]:

scaler = StandardScaler()  

#label is is_attributed
#fv's are ip, app, device, os, channel, click_time_dt (as a long), hour of day, and day of week
def get_fv_labels(df):
    labels = df['is_attributed']
    fv = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'attributed_time', 'time_to_attribution','is_attributed', 'click_time'], axis=1)
    return fv, labels

def get_data(df, k):
    # shuffle the data
    df = df.sample(frac=1)

    # split by day of week
    train = df[df["day_of_week"] != k]
    test = df[df["day_of_week"] == k]
    
    print("training/test split:" + str(len(train)*1.0/len(df)))
    
    train_fv, train_labels = get_fv_labels(train)
    test_fv, test_labels = get_fv_labels(test)

    # scale feature vectors: fit only on training data
    scaler.fit(train_fv)
    train_fv_sc = scaler.transform(train_fv)
    test_fv_sc = scaler.transform(test_fv)

    return train_fv_sc, train_labels, test_fv_sc, test_labels

In [21]:
results_acc = {}
results_f1 = {}

def sorted_dict_order(results):
    print("\nsorted results: parameters from worst to best")
    for config in sorted(results):
        print(config)
        print(results.get(config))

for first_layer in [120]:
    # [40, 80, 120]
    for second_layer in [10]:
        # [10,20, 40]
        for a in [1e-1, 1e-3, 1e-5, 1e-7, 1e-9]:
            #[1e-1, 1e-3, 1e-5, 1e-7, 1e-9]
            accuracies = []
            for k in [1,2,3]:
                print("\n\n\nNeural network")
                print("first layer: " + str(first_layer))
                print("second layer: " + str(second_layer))
                print("alpha: " + str(a))
                train_x, train_y, test_x, test_y = get_data(df, k)
                start = time.time()
                clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(40,10), random_state=1)
                clf.fit(train_x, train_y)
                print(time.time() - start)

                train_accuracy = clf.score(train_x, train_y)
                print(train_accuracy)

                test_accuracy = clf.score(test_x, test_y)
                print(test_accuracy)
                accuracies.append(test_accuracy)

                predict = clf.predict(test_x)

                #print(confusion_matrix(test_y, predict))
                prfs_result = precision_recall_fscore_support(test_y, predict, average='binary')
                print(prfs_result)

            validation_acc = sum(accuracies)/len(accuracies)
            results_acc[validation_acc] = { "train_accuracy" : train_accuracy, 
                                      "validation_accuracy": validation_acc, 
                                      "layer 1" : first_layer, 
                                      "layer 2": second_layer, 
                                      "alpha" : a, 
                                      "precision" : prfs_result[0],
                                      "recall" : prfs_result[1],
                                      "f1score" : prfs_result[2]
                                     }

            sorted_dict_order(results_acc)





Neural network
first layer: 120
second layer: 10
alpha: 0.1
training/test split:0.672125836715217
375.7314112186432
0.9794061708211478
0.9796912970331402
(0.9865570625383037, 0.9732788307352593, 0.979872965462485, None)



Neural network
first layer: 120
second layer: 10
alpha: 0.1
training/test split:0.6621126156297746
312.87235403060913
0.9792864734770657
0.9801862498987772
(0.982927405385544, 0.9770742429282355, 0.9799920844931295, None)



Neural network
first layer: 120
second layer: 10
alpha: 0.1
training/test split:0.709948210118946
350.4238085746765
0.9806358818142857
0.9765072561109057
(0.9867844431403333, 0.9664693543932461, 0.9765212538087912, None)

sorted results: parameters from worst to best
0.9787949343476078
{'train_accuracy': 0.9806358818142857, 'validation_accuracy': 0.9787949343476078, 'layer 1': 120, 'layer 2': 10, 'alpha': 0.1, 'precision': 0.9867844431403333, 'recall': 0.9664693543932461, 'f1score': 0.9765212538087912}



Neural network
first layer: 120
second

In [16]:
print("sorted results: parameters from worst to best")
for config in sorted(results_acc):
    print(config)
    print(results_acc.get(config))

sorted results: parameters from worst to best
0.9795204510929285


NameError: name 'd' is not defined