# Mini-Data Set Preparation

After the Kaggle Script "Making a mini-data set" is run (FYI, it takes about 2 minutes to run) to reduce the size of the data to 40,000 instances, run this script to organize data into a single dataframe. 

Run this with the 8 csv files produced by the Kaggle Script in the same directory. 

Note: This is a Python3 script because that is what Kaggle uses. 

Things we've discovered:
* Multiple ad_id per display_id
* Multiple display_id per document_id
* Ad_id can be in multiple display_id and multiple document_id
* Only one ad_id per display_id is clicked

In [1]:
import pandas as pd
import numpy as np
import copy
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [69]:
clicks_train_og = pd.read_csv("../input/clicks_train.csv")
promoted_content_og = pd.read_csv("../input/promoted_content.csv")
doc_cats_og = pd.read_csv("../input/documents_categories.csv")
doc_ents_og = pd.read_csv("../input/documents_entities.csv")
doc_meta_og = pd.read_csv("../input/documents_meta.csv")
doc_topics_og = pd.read_csv("../input/documents_topics.csv")
events_og = pd.read_csv("../input/events.csv")
page_views_og = pd.read_csv("../input/page_views_sample.csv")
clicks_test_og = pd.read_csv("../input/clicks_test.csv")

In [70]:
# FOR TESTING WITH MINI DATASET

doc_ids = set(page_views_og['document_id']) & set(promoted_content_og['document_id'])
# pull in the content that is in both page_views and promoted_content

events = events_og[events_og['document_id'].isin(doc_ids)]
clicks_train = clicks_train_og[clicks_train_og['display_id'].isin(events['display_id'])]
clicks_test = clicks_test_og[clicks_test_og['display_id'].isin(events['display_id'])]

events = events[events['display_id'].isin(clicks_train['display_id'])]

promoted_content = promoted_content_og[promoted_content_og['ad_id'].isin(clicks_train['ad_id'])]
doc_cats = doc_cats_og[doc_cats_og['document_id'].isin(promoted_content['document_id'])]
doc_ents = doc_ents_og[doc_ents_og['document_id'].isin(promoted_content['document_id'])]
doc_meta = doc_meta_og[doc_meta_og['document_id'].isin(promoted_content['document_id'])]
doc_topics = doc_topics_og[doc_topics_og['document_id'].isin(promoted_content['document_id'])]
page_views = page_views_og[page_views_og['document_id'].isin(events['document_id'])]

In [None]:
# # FOR FULL DATASET ON AWS

# display_sample = np.random.choice(clicks_train_og["display_id"].unique(), 10000) # change this if too many rows
# clicks_train = clicks_train_og[clicks_train_og["display_id"].isin(display_sample)]
# # select 4000 random display id's and grab all rows in click_train with that display
# # every display has multiple ads and only 1 ad in every display is clicked
# promoted_content = promoted_content_og[promoted_content_og["ad_id"].isin(clicks_train["ad_id"])]
# # same ad can show up in multiple displays, so length of unique ads < length of unique displays
# doc_cats = doc_cats_og[doc_cats_og["document_id"].isin(promoted_content["document_id"])]
# doc_ents = doc_ents_og[doc_ents_og["document_id"].isin(promoted_content["document_id"])]
# doc_meta = doc_meta_og[doc_meta_og["document_id"].isin(promoted_content["document_id"])]
# doc_topics = doc_topics_og[doc_topics_og["document_id"].isin(promoted_content["document_id"])]
# events = events_og[events_og["display_id"].isin(clicks_train_og["display_id"])]
# page_views = page_views_og[page_views_og["document_id"].isin(promoted_content["document_id"])]
# # platform & traffic source need to be either all integers or all strings (right now its mixed)


### Below here is my scratch work

In [6]:
# Cleanup cells
events['platform'] = map(str, events['platform'])
page_views['platform'] = map(str, page_views['platform'])
page_views['traffic_source'] = map(str, page_views['traffic_source'])

# events['country'], events['state'] = zip(*map(lambda x: str(x).split('>'), list(events['geo_location'])))

# #temp = map(lambda x: str(x).split('>'), list(page_views['geo_location']))
# print temp[:5]
# zip(*temp[:5]) # removes DMA

In [20]:
#events2 = pd.DataFrame(events['geo_location'].str.split(',').tolist(), columns = ['country', 'state', 'dma'])


In [None]:
events['geo_location'].str.split('>', expand= True)

In [124]:
geo = map(lambda x: str(x).split('>'), events['geo_location'])
zip(*geo)
country = [x[0] for x in geo]
#state = [x[1] for x in geo if x[1]]
if None:
    print('a')

## Make master data merging all features to clicks_train

### Merge information about the displays to master dataset
Events are only if the user CLICKED. This dataset will bring in information about the display_id's from events

In [6]:
data = clicks_train.merge(events, on='display_id', how='left')
data.head()

# joins information about the display that the user saw
# each display has a unique user id, doc id, and timestamp
# events has the information about the display (who the user is, which site (document_id) it was on, when it was seen, from where, etc.)

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location
0,37,70153,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819
1,37,149047,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819
2,37,169564,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819
3,37,234713,1,d4f62cdcb39ad8,1779285,2687,2,US>WA>819
4,37,235443,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819


Now we are merging information on what documents the ads referred to (from source: promoted_content).  
In every display, there are multiple ads (within one document = document_id_x). Every ad refers to a different document, which is the site the ad is promoting (document_id_y). All the columns after document_id_y are information about that document (to which the ad is referring).

In [7]:
data = data.merge(promoted_content, on='ad_id', how='left')
data.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,advertiser_id
0,37,70153,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,933716,7516,1438
1,37,149047,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1169985,16636,380
2,37,169564,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1394819,20109,640
3,37,234713,1,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1586431,245,232
4,37,235443,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1377696,11654,555


### Merge information about the documents the ads refer to
All the doc files have information about the documents (websites) to which the ads refer to
including confidence levels of which topics the ads referred to, which categories they're apart of, etc.

We wanted to duplicate the idea of the CountVectorizer for the 'bag of words' model we used for spam detection, but since we're not counting words in a text, it's a little bit different. Since we have a 'dictionary' of categories and topics, we use that as our 'vocabulary.' Every document has a confidence level for one or more items in the vocabulary, so we create a sparse matrix with every topic and category as columns, and every document has a confidence level value in the respective columns. If they are not given a confidence level, we put 0 because the document most likely does not have anything to do with that category or topic (given the data provided by Outbrain).

This data on the documents will help us separate ads from one another based on topic/category.  
ie) why did ad A get clicked instead of ad B? We know ad A referred to document 1 whereas ad B referred to document 2, and now we have general information about the documents the ads referred to. We will merge this information in later steps.

In [8]:
sparsetop = doc_topics.pivot(index='document_id', columns='topic_id', values='confidence_level')
sparsetop.columns = ['top_' + str(col) for col in sparsetop.columns]

sparsecat = doc_cats.pivot(index='document_id', columns='category_id', values='confidence_level')
sparsecat.columns = ['cat_' + str(col) for col in sparsecat.columns]

sparse = sparsetop.join(sparsecat, how='outer')
sparse.fillna(0, inplace=True)

sparse.reset_index(level=0, inplace=True)
sparse.head()

Unnamed: 0,document_id,top_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,...,cat_1913,cat_1914,cat_1915,cat_2000,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100
0,5720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
print(len(sparse['document_id'].unique()), len(data['document_id_y'].unique()))
data = data.merge(sparse, left_on='document_id_y', right_on='document_id', how='left')
data.head()

(15545, 15562)


Unnamed: 0,display_id,ad_id,clicked,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,...,cat_1913,cat_1914,cat_1915,cat_2000,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100
0,37,70153,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,933716,7516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37,149047,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1169985,16636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37,169564,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1394819,20109,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,37,234713,1,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1586431,245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37,235443,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1377696,11654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


creating dictionaries for % of ads clicked for every advertiser and campaign.  
purpose: merge to master dataset as a feature for every ad, how often the advertiser and campaign are successful on average.

In [10]:
advr_success = dict(zip(data.advertiser_id.unique(), [sum(data[data['advertiser_id']==x]['clicked'])/len(data[data['advertiser_id']==x]) for x in data['advertiser_id'].unique()]))
camp_success = dict(zip(data.campaign_id.unique(), [sum(data[data['campaign_id']==x]['clicked'])/len(data[data['campaign_id']==x]) for x in data['campaign_id'].unique()]))

data['campaign_perc'] = data['campaign_id'].map(camp_success)
data['advertiser_perc'] = data['advertiser_id'].map(advr_success)

data.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,...,cat_1915,cat_2000,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100,campaign_perc,advertiser_perc
0,37,70153,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,933716,7516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,37,149047,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1169985,16636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,37,169564,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1394819,20109,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,37,234713,1,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1586431,245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,37,235443,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1377696,11654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


Add count of page views to every document that an ad appears in (document_id_x) as a feature, could tell us something about likelihood of ads being clicked

In [12]:
doc_view_freq = dict(zip(page_views.document_id.unique(), [len(page_views[page_views.document_id==x]) for x in page_views.document_id.unique()]))
data['docx_view_freq'] = data['document_id_x'].map(doc_view_freq)
data.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,...,cat_2000,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100,campaign_perc,advertiser_perc,docx_view_freq
0,37,70153,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,933716,7516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,7701
1,37,149047,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1169985,16636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,7701
2,37,169564,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1394819,20109,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,7701
3,37,234713,1,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1586431,245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,7701
4,37,235443,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1377696,11654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,7701


# Make training and test sets

In [27]:
# labels = data['clicked']
# labels = labels.values.reshape(-1,1) # check this please! my python is 3.5 and told me to use values.reshape
# del data['clicked']
print 'Labels length:', len(labels)
print 'data length:', data.shape

Labels length: 282205
data length: (282205, 409)


In [39]:
train_data = data[:int(.7*len(data))]
test_data = data[int(.7*len(data)):]

train_labels = labels[:int(.7*len(data))]
test_labels = labels[int(.7*len(data)):]

print 'training label shape:', train_labels.shape
print 'training data shape:', train_data.shape
print 'test label shape:', test_labels.shape
print 'test data shape:', test_data.shape

training label shape: (197543L, 1L)
training data shape: (197543, 409)
test label shape: (84662L, 1L)
test data shape: (84662, 409)


# Following Homework2

In [None]:
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [68]:
def lr_prediction():
    '''Returns the array of ad_id and probability it will be clicked'''
    lr = LogisticRegression()
    lr.fit(train_data[train_data.columns[11:]].fillna(0), train_labels)
    lr_click_prob = lr.predict_proba(test_data[test_data.columns[11:]].fillna(0))[:,1]
    lr_score = lr.score(test_data[test_data.columns[11:]].fillna(0), test_labels)

84662