# Outbrain advertisement click prediction
## 1. Extremely Randomized Trees Ensemble Method
## 2. KNN Method
by Ryan Goy, https://www.linkedin.com/in/goyryan

useful links:
 - [10 minutes to pandas](http://pandas.pydata.org/pandas-docs/version/0.15.2/10min.html)
 - [EDA graphs](https://www.kaggle.com/anokas/outbrain-click-prediction/outbrain-eda)
 - [sklearn page](http://scikit-learn.org/stable/modules/ensemble.html)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
import numpy as np
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import random
import time
import csv
%matplotlib inline

## Import and format datasets

In [2]:
clicks_train = pd.read_csv('../data/clicks_train.csv')
clicks_test = pd.read_csv('../data/clicks_test.csv')
print clicks_train.shape
print clicks_test.shape

(87141731, 3)
(32225162, 2)


In [3]:
events = pd.read_csv('../data/events.csv')
promoted_content = pd.read_csv('../data/promoted_content.csv')
print events.shape
print clicks_test.shape

  interactivity=interactivity, compiler=compiler, result=result)


(23120126, 6)
(32225162, 2)


In [4]:
def load_train_data():
    # join clicks_train, events, and promoted_content
    joiner = clicks_train.merge(events, on='display_id', how='inner')
    combined = joiner.merge(promoted_content, on='ad_id', how='inner', suffixes=('_from', '_to'))
    
    # load labels
    labels = combined['clicked'].as_matrix()
    one_hot = np.zeros((len(labels), 2))
    one_hot[np.arange(len(labels)), labels] = 1
    
    # drop by unnecessary columns for now
    combined = combined.drop('display_id', 1)
    combined = combined.drop('uuid', 1)
    combined = combined.drop('clicked', 1)
    
    # hash the geo location since it isnt an int
    combined['geo_location'] = combined['geo_location'].apply(lambda x: hash(x) % 10000)
    combined['platform'] = combined['platform'].apply(lambda x: 0. if x == '\\N' else int(x))
    
    normed = (combined - combined.mean()) / (combined.max() - combined.min())
    
    # turn dataframe into matrix
    ret_mat = normed.as_matrix()
    
    return combined.astype("float"), one_hot.astype("float")

def load_test_data():
    # same as load_train_data except theres no clicked row
    # join clicks_train, events, and promoted_content
    joiner = clicks_test.merge(events, on='display_id', how='inner')
    combined = joiner.merge(promoted_content, on='ad_id', how='inner', suffixes=('_from', '_to'))
    
    combined.sort_values(by="display_id", axis = 0, inplace = True)
    
    disp_ids = combined['display_id'].as_matrix()
    ad_ids = combined['ad_id'].as_matrix()
    # drop by unnecessary columns for now
    combined = combined.drop('display_id', 1)
    combined = combined.drop('uuid', 1)
    
    # hash the geo location since it isnt an int
    combined['geo_location'] = combined['geo_location'].apply(lambda x: hash(x) % 10000)
    combined['platform'] = combined['platform'].apply(lambda x: 0 if x == '\\N' else int(x))
    
    normed = (combined - combined.mean()) / (combined.max() - combined.min())
    
    # turn dataframe into matrix
    ret_mat = normed.as_matrix()
    
    return combined.astype("float"), disp_ids, ad_ids
    
def load_next_train_data(data, labels, batch_size):
    index = random.randint(1, len(clicks_train.index)-batch_size)
    return data[index:index+batch_size], labels[index:index+batch_size]

In [5]:
start = time.time()
data, labels = load_train_data()
end = time.time()
print 'seconds to load train data: ' + str(end-start)

seconds to load train data: 156.115435839


## 1. Processing using an extremely randomized tree

In [77]:
classifier = RandomForestClassifier(n_estimators=5, min_samples_split=1, n_jobs=6)

In [78]:
start = time.time()
classifier.fit(data[:10000], labels[:10000])
end = time.time()
print 'seconds to train: ' + str(end-start)

seconds to train: 0.152595996857


In [85]:
entry = data[50000000:50000001].as_matrix()
print entry.shape
print classifier.score(entry*10000000, labels[50000000:60000000])

(1, 8)


ValueError: Found input variables with inconsistent numbers of samples: [10000000, 1]

In [24]:

#print classifier.predict(data[50000000:50000100])

In [25]:
#print labels[:100]

## 2. KNN Classifier

In [64]:
knn = KNeighborsClassifier(n_neighbors = 10)

In [65]:
size = 100000
knn.fit(data[:size], labels[:size])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [66]:
begin = 500000
k= data[begin:begin+20].as_matrix()
print labels[begin:begin+20]
print knn.predict(k)


[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]
[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]
