# Mini-Data Set Preparation

After the Kaggle Script "Making a mini-data set" is run (FYI, it takes about 2 minutes to run) to reduce the size of the data to 40,000 instances, run this script to organize data into a single dataframe. 

Run this with the 8 csv files produced by the Kaggle Script in the same directory. 

Note: This is a Python3 script because that is what Kaggle uses. 

In [1]:
import pandas as pd
import numpy as np
import copy
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [13]:
page_views_sample = pd.read_csv("../input/page_views_sample.csv")
page_views_sample = page_views_sample.sample(500000, random_state = 0)
# 1 million sample page views
print(len(page_views_sample['document_id'].unique()))

promoted = pd.read_csv("../input/promoted_content.csv")
promoted = promoted[promoted["document_id"].isin(page_views_sample["document_id"])]
# find advertisements in the documents in the sample page views

print(len(promoted['document_id'].unique()))
print(len(promoted['ad_id'].unique()))
# there's only 580 documents that are in the smaller sample of page views
# theres 3,804 unique ads from promoted_content that are in the page views sample

clicks_train = pd.read_csv("../input/clicks_train.csv")
clicks_train = clicks_train[clicks_train['ad_id'].isin(promoted['ad_id'])]
print(len(clicks_train['ad_id'].unique()))
# filter only the clicks associated with the ads/documents in our page_views_sample

doc_cats = pd.read_csv("../input/documents_categories.csv")
doc_cats = doc_cats[doc_cats["document_id"].isin(promoted["document_id"])]
print(len(doc_cats['document_id'].unique()))

doc_ents = pd.read_csv("../input/documents_entities.csv")
doc_ents = doc_ents[doc_ents["document_id"].isin(promoted["document_id"])]
print(len(doc_ents['document_id'].unique()))

doc_meta = pd.read_csv("../input/documents_meta.csv")
doc_meta = doc_meta[doc_meta["document_id"].isin(promoted["document_id"])]
print(len(doc_meta['document_id'].unique()))

doc_topics = pd.read_csv("../input/documents_topics.csv")
doc_topics = doc_topics[doc_topics["document_id"].isin(promoted["document_id"])]
print(len(doc_topics['document_id'].unique()))

events = pd.read_csv("../input/events.csv")
events = events[events["display_id"].isin(clicks_train["display_id"])]
print(len(events['display_id'].unique()))



3595
3239


  interactivity=interactivity, compiler=compiler, result=result)


298197


## Join clicks_train and events on display_id

In [40]:
data = clicks_train.merge(events, on='display_id')

print(len(data))
print(data.head())


300788
   display_id   ad_id  clicked            uuid  document_id  timestamp  \
0          16  273567        1  30c0ad12b36375      1727882       1033   
1          44  269739        1  7804739ae4f351      1747897       3157   
2         132  125384        0  69f8e7151d7204      1789548      10079   
3         136  150813        0  2028b0a2ad323e       225790      10443   
4         170  125375        0  b2778a037a571e      1773530      12840   

  platform geo_location  
0        1    US>FL>561  
1        2    US>CA>825  
2        2    US>FL>534  
3        1    US>CO>751  
4        1    US>TN>640  


## Promoted

In [41]:
len(promoted)

3595

In [42]:
#there is not a one-to-one relationship between document_id in promoted and the master data
#This is because the same ad is being shown in different documents I think
print(len(promoted["document_id"].unique()))
print(len(data["document_id"].unique()))

440
69033


In [44]:
#there is a one-to-one relationship between ad_id in promoted and the master data
print(len(promoted["ad_id"].unique())) #each add can appear more than once
print(len(data["ad_id"].unique()))
print(promoted.head())

3595
3239
      ad_id  document_id  campaign_id  advertiser_id
554     556        89577           37             63
560     562        90859           37             63
618     621       129575           37             63
935     938       154717           37             63
2131   2149       269511           37             63


In [45]:
promoted.head()
del promoted['document_id'] #I think all we want from here is the link between ad_id and campaign id
promoted.head()

Unnamed: 0,ad_id,campaign_id,advertiser_id
554,556,37,63
560,562,37,63
618,621,37,63
935,938,37,63
2131,2149,37,63


## Joining Info about each ad

I make a dictionary of the advertiser and campaign id for each ad_id, map that dictionary to the ad id to make the advertizer and campain columns

In [46]:
data.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location
0,16,273567,1,30c0ad12b36375,1727882,1033,1,US>FL>561
1,44,269739,1,7804739ae4f351,1747897,3157,2,US>CA>825
2,132,125384,0,69f8e7151d7204,1789548,10079,2,US>FL>534
3,136,150813,0,2028b0a2ad323e,225790,10443,1,US>CO>751
4,170,125375,0,b2778a037a571e,1773530,12840,1,US>TN>640


In [47]:
print(len(data))
print(len(data["ad_id"].unique())) #adds appear on average slightly more than twice in our minidata set

300788
3239


In [48]:
#make dictionaries to look up advertizer id and campaign id for each ad_id
advertiser_dict = dict(zip(promoted.ad_id, promoted.advertiser_id))
campaign_dict = dict(zip(promoted.ad_id, promoted.campaign_id))


In [53]:
data["campaign_id"] = data["ad_id"].map(campaign_dict)
data["advertiser_id"] = data["ad_id"].map(advertiser_dict)
data.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id
0,16,273567,1,30c0ad12b36375,1727882,1033,1,US>FL>561,25885,112
1,44,269739,1,7804739ae4f351,1747897,3157,2,US>CA>825,6395,177
2,132,125384,0,69f8e7151d7204,1789548,10079,2,US>FL>534,16062,3001
3,136,150813,0,2028b0a2ad323e,225790,10443,1,US>CO>751,19188,2407
4,170,125375,0,b2778a037a571e,1773530,12840,1,US>TN>640,16061,3001


In [54]:
print(len(data))
print(len(data["ad_id"].unique())) #adds appear on average slightly more than twice in our minidata set

300788
3239


## Importing Document Information

I'm super stuck on why all the document ids that appear in our data arent in the files with more information about each documents.

In [55]:
#Why aren't there the same number of unique documents in each of these
print(len(data["document_id"].unique()))
print(len(doc_cats["document_id"].unique()))
print(len(doc_ents["document_id"].unique()))
print(len(doc_meta["document_id"].unique()))
print(len(doc_topics["document_id"].unique()))

69033
440
360
440
440


In [56]:
#each document has multiple possible entities, categories, topics with different confidence level. 
#maybe we should just for now keep the most likely entity, topic and category? 
doc_ents.head()

Unnamed: 0,document_id,entity_id,confidence_level
96193,1028468,31ac3cc32333d4e2d683582cc9a0f178,0.938636
96194,1028468,255e3f1965267728e2d1881dc48a492f,0.846911
96195,1028468,20eab4b758f91327bd00bcbe1a94399e,0.282334
96367,996578,a6c3be9e0b97cac5eaffee76b84c6904,0.807709
96368,996578,255e3f1965267728e2d1881dc48a492f,0.683944


In [57]:
doc_cats.head()

Unnamed: 0,document_id,category_id,confidence_level
69729,1028468,2004,0.92
69730,1028468,1904,0.07
69815,996578,2004,0.92
69816,996578,1703,0.07
71593,23920,2004,0.92


In [58]:
data.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id
0,16,273567,1,30c0ad12b36375,1727882,1033,1,US>FL>561,25885,112
1,44,269739,1,7804739ae4f351,1747897,3157,2,US>CA>825,6395,177
2,132,125384,0,69f8e7151d7204,1789548,10079,2,US>FL>534,16062,3001
3,136,150813,0,2028b0a2ad323e,225790,10443,1,US>CO>751,19188,2407
4,170,125375,0,b2778a037a571e,1773530,12840,1,US>TN>640,16061,3001


In [59]:
data["geo_location"] = data["geo_location"].apply(str)

In [66]:
print(data.head())


   display_id   ad_id  clicked            uuid  document_id  timestamp  \
0          16  273567        1  30c0ad12b36375      1727882       1033   
1          44  269739        1  7804739ae4f351      1747897       3157   
2         132  125384        0  69f8e7151d7204      1789548      10079   
3         136  150813        0  2028b0a2ad323e       225790      10443   
4         170  125375        0  b2778a037a571e      1773530      12840   

  platform geo_location  campaign_id  advertiser_id    ...    state_Y9  \
0        1    US>FL>561        25885            112    ...         0.0   
1        2    US>CA>825         6395            177    ...         0.0   
2        2    US>FL>534        16062           3001    ...         0.0   
3        1    US>CO>751        19188           2407    ...         0.0   
4        1    US>TN>640        16061           3001    ...         0.0   

  state_YT state_Z1  state_Z2  state_Z3  state_Z4  state_Z5  state_Z6  \
0      0.0      0.0       0.0       0

In [61]:
data["country"] = [x[:2] for x in data["geo_location"]]
data["state"] = [x[3:5] for x in data["geo_location"]]
data["loc_num"]= [x[6:] for x in data["geo_location"]]
data["loc_num"] = data["loc_num"].map(str)
print(data.head())

   display_id   ad_id  clicked            uuid  document_id  timestamp  \
0          16  273567        1  30c0ad12b36375      1727882       1033   
1          44  269739        1  7804739ae4f351      1747897       3157   
2         132  125384        0  69f8e7151d7204      1789548      10079   
3         136  150813        0  2028b0a2ad323e       225790      10443   
4         170  125375        0  b2778a037a571e      1773530      12840   

  platform geo_location  campaign_id  advertiser_id country state loc_num  
0        1    US>FL>561        25885            112      US    FL     561  
1        2    US>CA>825         6395            177      US    CA     825  
2        2    US>FL>534        16062           3001      US    FL     534  
3        1    US>CO>751        19188           2407      US    CO     751  
4        1    US>TN>640        16061           3001      US    TN     640  


In [64]:
list_to_binarize = ["advertiser_id", "campaign_id", "platform", "country","state"]

for i in list_to_binarize:
    temp = pd.get_dummies(data[i], prefix = i)
    data = data.join(temp)
data.head()

data.shape

(300788, 789)

In [67]:
clean = copy.deepcopy(data)
clean.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id,...,state_Y9,state_YT,state_Z1,state_Z2,state_Z3,state_Z4,state_Z5,state_Z6,state_Z7,state_Z8
0,16,273567,1,30c0ad12b36375,1727882,1033,1,US>FL>561,25885,112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,44,269739,1,7804739ae4f351,1747897,3157,2,US>CA>825,6395,177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,132,125384,0,69f8e7151d7204,1789548,10079,2,US>FL>534,16062,3001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,136,150813,0,2028b0a2ad323e,225790,10443,1,US>CO>751,19188,2407,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,170,125375,0,b2778a037a571e,1773530,12840,1,US>TN>640,16061,3001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
len(data["ad_id"].unique())

3239

In [82]:
len(clean['document_id'].unique())

69033

In [83]:
#drop the non-cleaned up data for now
del clean['country'], clean['state'], clean["ad_id"], clean["uuid"], clean["document_id"],clean["timestamp"], clean["platform"], clean["geo_location"], clean['campaign_id'], clean["advertiser_id"]

In [84]:
len(clean.columns)

779

## Make training and test sets

In [87]:
labels = clean['clicked']
labels = labels.reshape(-1,1)
del clean['clicked']


210551.59999999998

In [88]:
train_data = clean[:21000]
train_labels = labels[:21000]
test_data = clean[21000:]
test_labels = labels[21000:]

In [89]:
lr = LogisticRegression()
lr.fit(train_data, train_labels)
lr.predict(test_data, test_labels)

ValueError: could not convert string to float: 