### Dummy Classifier

In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import dateutil
import sklearn
from sklearn import svm
from sklearn import preprocessing
import timeit

In [67]:
# get data in basic table form
df = pd.read_csv("./train_sample.csv")

In [68]:
df['time_to_attribution'] = df.apply(lambda row: datetime.strptime(row['attributed_time'], '%Y-%m-%d %H:%M:%S') - datetime.strptime(row['click_time'], '%Y-%m-%d %H:%M:%S') if isinstance(row['attributed_time'], str) else '', axis=1)

In [69]:
df['click_time_dt'] = df.apply(lambda row: (dateutil.parser.parse(row['click_time'], dayfirst=True) - datetime.utcfromtimestamp(0)).total_seconds() , axis=1)

ip = df.groupby('ip')['ip']
min_time = df.groupby('ip')["click_time_dt"].min()
max_time = df.groupby('ip')["click_time_dt"].max()
num_clicks = df.groupby('ip')["click_time_dt"].count()

freq = num_clicks / (max_time - min_time)
import math
f_list = []
for f in freq:
    if not math.isinf(f):
        f_list.append(f)
freq2 = sum(f_list)/len(f_list)
period = 1/freq2
arr = np.array([f_list])
stdev = np.std(arr)
outliers = arr[(arr - np.mean(arr)) > 2 * np.std(arr)]
np.max(arr)

2.0

In [70]:
def get_fv_labels(df):
    labels = df['is_attributed']
    fv = df.drop(['click_time', 'attributed_time', 'time_to_attribution','is_attributed'], axis=1)
    return fv, labels

def scale_fv(mat):
    min_max_scaler = preprocessing.MinMaxScaler()
    return np.transpose(min_max_scaler.fit_transform(np.transpose(mat)))
df = df.sample(frac=1)
df_train = df.head(60000)
df_test = df.tail(40000)

train_fv, train_labels = get_fv_labels(df_train)
test_fv, test_labels = get_fv_labels(df_test)

train_fv_np = train_fv.as_matrix()
test_fv_np = test_fv.as_matrix()

train_labels_np = train_labels.as_matrix()
test_labels_np = test_labels.as_matrix()

train_fv_np_sc = scale_fv(train_fv_np)
test_fv_np_sc = scale_fv(test_fv_np)

In [71]:
#probability of finding a 1 from lables
prob_1 = np.count_nonzero(train_labels_np)/len(train_labels)
prob_1

0.00225

In [72]:
from sklearn.dummy import DummyClassifier

def dummy_classifier(fv, labels):
    clf = sklearn.dummy.DummyClassifier(strategy='stratified', random_state=None)
    clf.fit(fv, labels) 
    return clf

import time
start = time.time()

clf = dummy_classifier(train_fv_np_sc,train_labels_np) 

end = time.time()
print('time',end - start)

#print(timeit.timeit(svm(test_mat_scaled)))
predict = clf.predict(test_fv_np_sc)
labels = test_labels_np



print('score',clf.score(test_fv_np_sc,labels))

print('predict',predict)
print('labels',labels)

print('sum',sum(labels))


incorrect = np.sum(np.bitwise_xor(predict,labels))


print('incorrect',incorrect)

print('incorrect_ratio',(incorrect/40000))

print('dummy_prediction_result',(1-(incorrect/40000)))



time 0.0017650127410888672
score 0.995125
predict [0 0 0 ... 0 0 0]
labels [0 0 0 ... 0 0 0]
sum 92
incorrect 185
incorrect_ratio 0.004625
dummy_prediction_result 0.995375


### Logistic Regression

In [73]:
def lr(fv, labels):
    # change solver to sag or saga for large dataset, and include max_iter = 100
    clf = sklearn.linear_model.LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1, solver='liblinear')
    clf.fit(fv, labels) 
    return clf

import time
start = time.time()

clf = lr(train_fv_np_sc,train_labels_np) 

end = time.time()
print('time',end - start)

#print(timeit.timeit(svm(test_mat_scaled)))
predict = clf.predict(test_fv_np_sc)
labels = test_labels_np



print('score',clf.score(test_fv_np_sc,labels))

print('predict',predict)
print('labels',labels)

print('sum',sum(labels))


incorrect = np.sum(np.bitwise_xor(predict,labels))


print('incorrect',incorrect)

print('incorrect_ratio',(incorrect/40000))

print('lr_prediction_result',(1-(incorrect/40000)))


time 0.04658102989196777
score 0.9977
predict [0 0 0 ... 0 0 0]
labels [0 0 0 ... 0 0 0]
sum 92
incorrect 92
incorrect_ratio 0.0023
lr_prediction_result 0.9977


### Multiclass Decision Tree

In [74]:
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [75]:
#let's have a look at the data
#how many features do we have?
#how many categories within the features?
#can we get use this info to get a Gini score and start the CART tree?
print('There are', len(df["ip"].unique()), 'unique IPs.')
print('There are', len(df["app"].unique()), 'unique Apps.')
print('There are', len(df["device"].unique()), 'unique Devices.')
print('There are', len(df["os"].unique()), 'unique OSs.')
print('There are', len(df["channel"].unique()), 'unique Channels.')

There are 34857 unique IPs.
There are 161 unique Apps.
There are 100 unique Devices.
There are 130 unique OSs.
There are 161 unique Channels.


In [76]:
"""
def MultiVarDT(fv, labels):
    # change solver to sag or saga for large dataset, and include max_iter = 100
    clf = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5)
    clf.fit(fv, labels) 
    return clf

import time
start = time.time()

clf = MultiVarDT(train_fv_np_sc,train_labels_np) 

end = time.time()
print('time',end - start)

#print(timeit.timeit(svm(test_mat_scaled)))
predict = clf.predict(test_fv_np_sc)
labels = test_labels_np



print('score',clf.score(test_fv_np_sc,labels))

print('predict',predict)
print('labels',labels)

print('number of 1s',sum(labels))


incorrect = np.sum(np.bitwise_xor(predict,labels))


print('incorrect',incorrect)

print('incorrect_ratio',(incorrect/40000))

print('MultiClassDT_prediction_result',(1-(incorrect/40000)))
"""

'\ndef MultiVarDT(fv, labels):\n    # change solver to sag or saga for large dataset, and include max_iter = 100\n    clf = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=3, min_samples_leaf=5)\n    clf.fit(fv, labels) \n    return clf\n\nimport time\nstart = time.time()\n\nclf = MultiVarDT(train_fv_np_sc,train_labels_np) \n\nend = time.time()\nprint(\'time\',end - start)\n\n#print(timeit.timeit(svm(test_mat_scaled)))\npredict = clf.predict(test_fv_np_sc)\nlabels = test_labels_np\n\n\n\nprint(\'score\',clf.score(test_fv_np_sc,labels))\n\nprint(\'predict\',predict)\nprint(\'labels\',labels)\n\nprint(\'number of 1s\',sum(labels))\n\n\nincorrect = np.sum(np.bitwise_xor(predict,labels))\n\n\nprint(\'incorrect\',incorrect)\n\nprint(\'incorrect_ratio\',(incorrect/40000))\n\nprint(\'MultiClassDT_prediction_result\',(1-(incorrect/40000)))\n'

In [82]:
def MultiClassDecisionTree(fv,labels)
    for fv

train_fv_np_sc,train_labels_np

(60000, 60000)