#Outbrain Click Prediction 
The purpose of the Outbrain competition is to predict which recommended content each user will click. Outbrain has provided various relational datasets that need to be explored before any modeling can begin. 

###Import Libraries

###Raw Data 

In [1]:
import pandas as pd
import numpy as np
import os
import gc
import sys
import sqlite3 as sql
import matplotlib.pyplot as plt
from subprocess import check_output

%matplotlib inline

print('Files To Explore:')
for f in os.listdir('inputs/'):
    if 'csv' in f:
        print(f.ljust(30) + str(round(os.path.getsize('inputs/' + f)/1000000.0, 2)) + " MB")

Files To Explore:
clicks_test.csv               506.95 MB
clicks_train.csv              1486.73 MB
documents_categories.csv      118.02 MB
documents_entities.csv        324.1 MB
documents_meta.csv            89.38 MB
documents_topics.csv          339.47 MB
events.csv                    1208.55 MB
page_views_sample.csv         454.35 MB
promoted_content.csv          13.89 MB
sample_submission.csv         273.14 MB


####What are the dimensions of each dataset?

In [35]:
def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE, 
                                              stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])

print('File Dimensions:')
for f in os.listdir('inputs/'):
    if 'csv' in f:
        rows=file_len('inputs/'+f)
        doc=pd.read_csv('inputs/'+f, nrows=1)
        print(f.ljust(30) + '('+str(rows)+','+str(len(doc.columns))+')')
        


File Dimensions:
clicks_test.csv               (32225163,2)
clicks_train.csv              (87141732,3)
documents_categories.csv      (5481476,3)
documents_entities.csv        (5537553,3)
documents_meta.csv            (2999335,4)
documents_topics.csv          (11325961,3)
events.csv                    (23120127,6)
page_views_sample.csv         (10000000,6)
promoted_content.csv          (559584,4)
sample_submission.csv         (6245534,2)


 ####Are there any common variables?

In [42]:
print('File Column Names:')
for f in os.listdir('inputs/'):
    if 'csv' in f:
        doc=pd.read_csv('inputs/'+f, nrows=1)
        print(f.ljust(30) + str(list(doc.columns.values)))
        

File Columns:
clicks_test.csv               ['display_id', 'ad_id']
clicks_train.csv              ['display_id', 'ad_id', 'clicked']
documents_categories.csv      ['document_id', 'category_id', 'confidence_level']
documents_entities.csv        ['document_id', 'entity_id', 'confidence_level']
documents_meta.csv            ['document_id', 'source_id', 'publisher_id', 'publish_time']
documents_topics.csv          ['document_id', 'topic_id', 'confidence_level']
events.csv                    ['display_id', 'uuid', 'document_id', 'timestamp', 'platform', 'geo_location']
page_views_sample.csv         ['uuid', 'document_id', 'timestamp', 'platform', 'geo_location', 'traffic_source']
promoted_content.csv          ['ad_id', 'document_id', 'campaign_id', 'advertiser_id']
sample_submission.csv         ['display_id', 'ad_id']


Looks like the disparate datasets can be combined through **display_id** and **document_id**.

###Review *Page View* Sample

In [48]:
pg_views=pd.read_csv('inputs/page_views_sample.csv')

In [49]:
pg_views.head()

Unnamed: 0,uuid,document_id,timestamp,platform,geo_location,traffic_source
0,1fd5f051fba643,120,31905835,1,RS,2
1,8557aa9004be3b,120,32053104,1,VN>44,2
2,c351b277a358f0,120,54013023,1,KR>12,1
3,8205775c5387f9,120,44196592,1,IN>16,2
4,9cb0ccd8458371,120,65817371,1,US>CA>807,2


In [52]:
pg_views.traffic_source.value_counts()

1    6668961
2    1667170
3    1663868
Name: traffic_source, dtype: int64

###Review *Clicks* Data

In [46]:
train=pd.read_csv('inputs/clicks_train.csv')
print('Train Shape:' + str(train.shape))
train.head()

Train Shape:(87141731, 3)


Unnamed: 0,display_id,ad_id,clicked
0,1,42337,0
1,1,139684,0
2,1,144739,1
3,1,156824,0
4,1,279295,0


In [47]:
train.display_id.value_counts()


10065389    12
6965076     12
13117200    12
4311099     12
14520899    12
7640681     12
12658842    12
1250191     12
4459940     12
5586843     12
9675262     12
14505968    12
633823      12
6718753     12
2358644     12
9445807     12
15043253    12
6695755     12
5493042     12
1790314     12
5220374     12
1850411     12
5999145     12
4706302     12
8889208     12
6310379     12
2084076     12
16099677    12
16373398    12
6390495     12
            ..
15505717     2
3113302      2
5589266      2
11112852     2
5541431      2
4886007      2
12254363     2
755158       2
2143626      2
6412839      2
12123355     2
6339979      2
7406902      2
7660888      2
6347271      2
10303650     2
10724840     2
14966577     2
10238082     2
8247719      2
16292789     2
5775361      2
14440496     2
7375543      2
15529562     2
3010169      2
16582998     2
1654931      2
6629280      2
9623577      2
Name: display_id, dtype: int64

In [78]:
print('Test Shape:'+str(test.shape))
test.head()

Test Shape:(32225162, 2)


Unnamed: 0,display_id,ad_id
0,16874594,66758
1,16874594,150083
2,16874594,162754
3,16874594,170392
4,16874594,172888


In [41]:
doc_cat=pd.read_csv('inputs/documents_categories.csv')
doc_ent=pd.read_csv('inputs/documents_entities.csv')
doc_meta=pd.read_csv('inputs/documents_meta.csv')
doc_topics=pd.read_csv('inputs/documents_topics.csv')

In [49]:
doc_cat.head()

Unnamed: 0,document_id,category_id,confidence_level
0,1595802,1611,0.92
1,1595802,1610,0.07
2,1524246,1807,0.92
3,1524246,1608,0.07
4,1617787,1807,0.92


In [50]:
doc_meta.head()

Unnamed: 0,document_id,source_id,publisher_id,publish_time
0,1595802,1,603,2016-06-05 00:00:00
1,1524246,1,603,2016-05-26 11:00:00
2,1617787,1,603,2016-05-27 00:00:00
3,1615583,1,603,2016-06-07 00:00:00
4,1615460,1,603,2016-06-20 00:00:00


In [93]:
try:del promoted_content
except:pass;gc.collect()
