# Mini-Data Set Preparation

First run the Kaggle script "Making a mini-data set" (FYI, it takes about 2 minutes) to reduce the size of the data to 40,000 instances, then run this script to organize the data into a single dataframe.

Run this with the 8 csv files produced by the Kaggle Script in the same directory. 

Note: This is a Python3 script because that is what Kaggle uses. 

In [2]:
import pandas as pd
import numpy as np
import copy
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the mini tables written by the Kaggle script; every file is read from
# the current working directory.

# Click / display level tables.
clicks_train, events, promoted = (
    pd.read_csv("mini_clicks_train.csv"),
    pd.read_csv("mini_events.csv"),
    pd.read_csv("mini_promoted.csv"),
)

# Per-document tables.
doc_cats, doc_ents, doc_meta, doc_topics = (
    pd.read_csv("mini_doc_cats.csv"),
    pd.read_csv("mini_doc_ents.csv"),
    pd.read_csv("mini_doc_meta.csv"),
    pd.read_csv("mini_doc_topics.csv"),
)

# TODO: page_views = pd.read_csv("mini_page_views.csv") once that file can be imported


## Join clicks_train and events on display_id

In [4]:
# Print how many distinct display_id values events and clicks_train contain.
# Equal counts are a quick sanity check before joining the tables on display_id.
for frame in (events, clicks_train):
    print(len(frame["display_id"].unique()))

39950
39950


In [5]:
# "Unnamed: 0" seems to be the index saved by an earlier export; it is not
# needed, so drop it and key the rows by display_id instead.
clicks_train = clicks_train.drop("Unnamed: 0", axis=1).set_index("display_id")
clicks_train.head()

Unnamed: 0_level_0,ad_id,clicked
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1
7194604,189575,1
10660899,281556,0
4510419,422667,0
2203285,11189,0
12632400,470488,0


In [5]:
# Same cleanup as for clicks_train: discard the leftover "Unnamed: 0" column
# and index events by display_id so the two tables can be joined on it.
events = events.drop("Unnamed: 0", axis=1).set_index("display_id")
events.head()

Unnamed: 0_level_0,uuid,document_id,timestamp,platform,geo_location
display_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
62,65861840b3ab9f,1781623,4651,2,US>AZ>789
76,23c6c76aeab262,1727629,5723,3,US>IL>609
795,8cdc5a01e783a8,1768840,56758,3,US>PA>504
1394,f4953632cdeaf9,1762699,96740,2,US>MA>506
1523,cde539b7dfb671,1631112,105840,3,US>GA>524


In [6]:
# Both frames are indexed by display_id, so join() matches rows on that index.
# how="left" (the default) keeps every clicks_train row.
data = clicks_train.join(events, how="left")
data.head()

Unnamed: 0.1,ad_id,clicked,Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location
62,132780,0,27826,27827,40c34e0c5b0826,1143040,2019895,2,US>SC>567
76,228959,0,31944,31945,2c866dc7af09d5,1259196,2352663,3,US>OH>558
795,249397,0,346882,346883,35b8f04d587848,1666933,31200225,2,US>ND>724
1394,310868,1,601272,601273,119077b6f3b659,394689,43563149,1,US>VA>573
1523,247283,0,656064,656065,6e4be33b7bfc18,1835877,46074088,2,US>CA>803


## Promoted

In [7]:
# Number of rows in the promoted table.
len(promoted)

14966

In [8]:
# Compare the number of distinct document_id values in promoted and in the
# joined data. The counts differ, so document_id is not a one-to-one key
# between the two tables (presumably because the same ad is shown on several
# documents).
for frame in (promoted, data):
    print(len(frame["document_id"].unique()))

9269
87


In [9]:
# Compare the number of distinct ad_id values in promoted and in the joined
# data. Matching counts suggest every ad in the data has a row in promoted;
# within data, each ad can appear on more than one row.
for frame in (promoted, data):
    print(len(frame["ad_id"].unique()))

14966
14966


In [10]:
# Keep only the columns that link an ad_id to its campaign_id and
# advertiser_id: drop the leftover "Unnamed: 0" export index and document_id
# (I think the ad -> campaign/advertiser link is all we want from here).
promoted = promoted.drop(["Unnamed: 0", "document_id"], axis=1)
promoted.head()

Unnamed: 0,ad_id,campaign_id,advertiser_id
0,7,1,7
1,55,26,39
2,446,43,76
3,487,81,116
4,489,81,116


## Joining Info about each ad

I make a dictionary of the advertiser id and of the campaign id for each ad_id, then map those dictionaries onto the ad_id column to create the advertiser and campaign columns.

In [11]:
# Preview the joined clicks/events frame before adding the per-ad columns.
data.head()

Unnamed: 0.1,ad_id,clicked,Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location
62,132780,0,27826,27827,40c34e0c5b0826,1143040,2019895,2,US>SC>567
76,228959,0,31944,31945,2c866dc7af09d5,1259196,2352663,3,US>OH>558
795,249397,0,346882,346883,35b8f04d587848,1666933,31200225,2,US>ND>724
1394,310868,1,601272,601273,119077b6f3b659,394689,43563149,1,US>VA>573
1523,247283,0,656064,656065,6e4be33b7bfc18,1835877,46074088,2,US>CA>803


In [12]:
# Total rows versus distinct ads: with more rows than distinct ad_ids, each ad
# appears more than once on average in the mini data set.
row_count = len(data)
distinct_ads = len(data["ad_id"].unique())
print(row_count)
print(distinct_ads)

40000
14966


In [13]:
# Build ad_id -> advertiser_id and ad_id -> campaign_id lookup tables from the
# promoted frame.
ad_ids = promoted["ad_id"]
advertiser_dict = dict(zip(ad_ids, promoted["advertiser_id"]))
campaign_dict = dict(zip(ad_ids, promoted["campaign_id"]))


In [14]:
# Translate each row's ad_id through the lookup tables to add the campaign_id
# and advertiser_id columns (in that order).
for column, lookup in (("campaign_id", campaign_dict), ("advertiser_id", advertiser_dict)):
    data[column] = data["ad_id"].map(lookup)
data.head()

Unnamed: 0.1,ad_id,clicked,Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id
62,132780,0,27826,27827,40c34e0c5b0826,1143040,2019895,2,US>SC>567,16528,1006
76,228959,0,31944,31945,2c866dc7af09d5,1259196,2352663,3,US>OH>558,25174,2712
795,249397,0,346882,346883,35b8f04d587848,1666933,31200225,2,US>ND>724,26150,3829
1394,310868,1,601272,601273,119077b6f3b659,394689,43563149,1,US>VA>573,28807,1634
1523,247283,0,656064,656065,6e4be33b7bfc18,1835877,46074088,2,US>CA>803,26079,2874


In [15]:
# Re-check after adding the ad columns: the row count and the number of
# distinct ads should be unchanged by the mapping step.
row_count = len(data)
distinct_ads = len(data["ad_id"].unique())
print(row_count)
print(distinct_ads)

40000
14966


## Working with Page Views

Can't get the pageviews file to import, will work on this later

## Importing Document Information

I'm super stuck on why not all of the document ids that appear in our data are in the files with more information about each document.

In [15]:
# Print the number of distinct document_id values in the joined data and in
# each per-document table, in this order: data, doc_cats, doc_ents, doc_meta,
# doc_topics. Open question: why do these counts not agree?
for frame in (data, doc_cats, doc_ents, doc_meta, doc_topics):
    print(len(frame["document_id"].unique()))

87
9253
7490
9269
9000


In [17]:
# Each document can have several candidate entities, categories and topics,
# each with its own confidence level.
# Maybe, for now, we should keep only the most likely entity, topic and
# category per document?
doc_ents.head()

Unnamed: 0.1,Unnamed: 0,document_id,entity_id,confidence_level
0,42749,1097501,0bb6322f38719d3a928eb9ca6f6a4c96,0.704652
1,42750,1097501,e0e6f88bb45d82f216b7c794e8bc0a61,0.338807
2,42751,1097501,f513b9ee061a066d9595316fe50ba8e3,0.314879
3,72655,1807706,11d5279de0b36011773ee50f75f9b43c,0.495961
4,72656,1807706,358c9e8a2ff80f858f6deb063e0bcf8f,0.337521


In [16]:
# Preview the per-document category table.
doc_cats.head()

Unnamed: 0.1,Unnamed: 0,document_id,category_id,confidence_level
0,25674,1097501,1100,0.92
1,25675,1097501,2005,0.07
2,51976,1807706,1209,0.92
3,51977,1807706,1205,0.07
4,51990,1316839,2003,0.92


In [17]:
# Preview the working frame before splitting geo_location into parts.
data.head()

Unnamed: 0.1,ad_id,clicked,Unnamed: 0,display_id,uuid,document_id,timestamp,platform,geo_location,campaign_id,advertiser_id
62,132780,0,27826,27827,40c34e0c5b0826,1143040,2019895,2,US>SC>567,16528,1006
76,228959,0,31944,31945,2c866dc7af09d5,1259196,2352663,3,US>OH>558,25174,2712
795,249397,0,346882,346883,35b8f04d587848,1666933,31200225,2,US>ND>724,26150,3829
1394,310868,1,601272,601273,119077b6f3b659,394689,43563149,1,US>VA>573,28807,1634
1523,247283,0,656064,656065,6e4be33b7bfc18,1835877,46074088,2,US>CA>803,26079,2874


In [20]:
# Force geo_location to strings so the string slicing in the next cell works
# on every row (missing values become the text "nan").
data["geo_location"] = data["geo_location"].astype(str)

In [None]:
# geo_location is a ">"-delimited string such as "US>SC>567": country, then
# state, then a location number. Split on the delimiter rather than on fixed
# character offsets so that values with missing trailing parts, or parts that
# are not exactly two characters long, are still parsed correctly.
geo_parts = (
    data["geo_location"]
    .astype(str)
    .str.split(">", expand=True)
    .reindex(columns=range(3))  # guarantee three columns even if no row has all parts
    .fillna("")                 # absent parts become empty strings
)
# Assign positionally (.values): data's display_id index may contain duplicate
# labels, so label-based alignment is deliberately avoided.
data["country"] = geo_parts[0].values
data["state"] = geo_parts[1].values
data["loc_num"] = geo_parts[2].values

# One-hot encode the categorical columns. The indicator frames share data's
# index row for row, so they are attached side by side with concat. Joining on
# the index would multiply rows whenever a display_id occurs more than once.
list_to_binarize = ["advertiser_id", "campaign_id", "platform", "country", "state"]
dummy_frames = [
    pd.get_dummies(data[column], prefix=column) for column in list_to_binarize
]
data = pd.concat([data] + dummy_frames, axis=1)
data.head()

In [None]:
# Work on an independent deep copy so that trimming columns from clean leaves
# data untouched.
clean = data.copy(deep=True)
clean.head()

In [None]:
# Number of distinct ads in the working frame.
len(data["ad_id"].unique())

In [None]:
# Number of distinct documents in the copy that will become the feature frame.
len(clean['document_id'].unique())

In [None]:
# Drop the raw (non-encoded) columns for now, so that only the label and the
# one-hot indicator columns remain. loc_num is dropped as well: it is a raw
# string column that was never encoded and cannot be fed to the model as is.
raw_columns = [
    "country",
    "state",
    "loc_num",
    "ad_id",
    "uuid",
    "document_id",
    "timestamp",
    "platform",
    "geo_location",
    "campaign_id",
    "advertiser_id",
]
clean = clean.drop(raw_columns, axis=1)

In [None]:
# Number of columns left in the feature frame after dropping the raw columns.
len(clean.columns)

## Make training and test sets

In [None]:
# Separate the target from the features. pop() removes 'clicked' from clean
# and returns it as a Series; .values yields the underlying NumPy array.
# Series.reshape is not available in current pandas, and scikit-learn expects
# a 1-D target of shape (n_samples,), so the labels are kept one-dimensional.
labels = clean.pop('clicked').values

In [None]:
# Positional hold-out split: rows before TRAIN_ROWS train the model and the
# remaining rows are used for testing. No shuffling is done here.
TRAIN_ROWS = 35000
train_data, test_data = clean.iloc[:TRAIN_ROWS], clean.iloc[TRAIN_ROWS:]
train_labels, test_labels = labels[:TRAIN_ROWS], labels[TRAIN_ROWS:]

In [None]:
# Fit a logistic regression on the training rows.
lr = LogisticRegression()
lr.fit(train_data, train_labels)

# predict() takes only the feature matrix; passing the labels as a second
# argument raises a TypeError. Keep the predictions, then compare them with
# the true labels via score(), which returns the mean accuracy on the test set.
predictions = lr.predict(test_data)
lr.score(test_data, test_labels)