# Mini-Data Set Preparation

At the end of running the data prep notebook here, you will have training, development and test data sets

dev_data -> the development split of the training data
dev_labels -> labels for that

train_data -> the training split of the training_data
train_lables -> the labels for that

test_data -> THE ACTUAL CLICKS TEST TEST data!

Things we've discovered:
* Multiple ad_id per display_id
* Multiple display_id per document_id
* Ad_id can be in multiple display_id and multiple document_id
* Only one ad_id per display_id is clicked

In [7]:
import pandas as pd
import numpy as np
import copy
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [8]:
clicks_train_og = pd.read_csv("clicks_train.csv")
promoted_content_og = pd.read_csv("promoted_content.csv")
doc_cats_og = pd.read_csv("documents_categories.csv")
doc_ents_og = pd.read_csv("documents_entities.csv")
doc_meta_og = pd.read_csv("documents_meta.csv")
doc_topics_og = pd.read_csv("documents_topics.csv")
events_og = pd.read_csv("events.csv")
page_views_og = pd.read_csv("page_views_sample.csv")
clicks_test_og = pd.read_csv("clicks_test.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
doc_ids = set(page_views_og['document_id']) & set(promoted_content_og['document_id'])
# pull in the content that is in both page_views and promoted_content
events = events_og[events_og['document_id'].isin(doc_ids)]
clicks_train = clicks_train_og[clicks_train_og['display_id'].isin(events['display_id'])]
events = events[events['display_id'].isin(clicks_train['display_id'])]
promoted_content = promoted_content_og[promoted_content_og['ad_id'].isin(clicks_train['ad_id'])]
doc_cats = doc_cats_og[doc_cats_og['document_id'].isin(promoted_content['document_id'])]
doc_ents = doc_ents_og[doc_ents_og['document_id'].isin(promoted_content['document_id'])]
doc_meta = doc_meta_og[doc_meta_og['document_id'].isin(promoted_content['document_id'])]
doc_topics = doc_topics_og[doc_topics_og['document_id'].isin(promoted_content['document_id'])]
page_views = page_views_og[page_views_og['document_id'].isin(events['document_id'])]

## Make master data merging all features to clicks_train

### Merge information about the displays to master dataset
Events are only if the user CLICKED. This dataset will bring in information about the display_id's from events

In [35]:
def click_percent(dataset, ad_id, default_result, reg):
    '''Returns the posterior probability of ad being clicked.
    If ad has not been encountered before, assume mean click
    
    dataset is the TRAINING data'''
    
    # count number of times ad has been seen
    ad_total = len(dataset[dataset['ad_id'] == ad_id])
    
    # if ad has not been seen, returned the default_results
    if ad_total == 0:
        return default_result
    # otherwise return percentage of times ad has been clicked, adjusted by a regularization term
    else:
        click_sum = np.sum(dataset[dataset['ad_id'] == ad_id].clicked) + 1.0
        return click_sum / (ad_total + reg)
    
def format_data(dataset):

    # Merging information aout the displays to master dataset
    data = dataset.merge(events, on='display_id', how='left')
    # joins information about the display that the user saw
    # each display has a unique user id, doc id, and timestamp
    # events has the information about the display (who the user is, which site (document_id) it was on, when it was seen, from where, etc.)

    # Identifying which documents the ads refer to (aka destination documents)

    data = data.merge(promoted_content, on='ad_id', how='left')

    # Gather/bin data about the documents the ads refer to

    sparsetop = doc_topics.pivot(index='document_id', 
                                 columns='topic_id', 
                                 values='confidence_level')
    sparsetop.columns = ['top_' + str(col) for col in sparsetop.columns]

    sparsecat = doc_cats.pivot(index='document_id', 
                               columns='category_id', 
                               values='confidence_level')
    sparsecat.columns = ['cat_' + str(col) for col in sparsecat.columns]

    sparse = sparsetop.join(sparsecat, how='outer')
    sparse.fillna(0, inplace=True)

    sparse.reset_index(level=0, inplace=True)

    data = data.merge(sparse, 
                      left_on='document_id_y', 
                      right_on='document_id', 
                      how='left')
    
    # Adding meta data about the advertiser and campaign successes
    if 'clicked' in clicks_train.columns:
        advr_success = dict(zip(data.advertiser_id.unique(), 
                                [sum(data[data['advertiser_id']==x]['clicked'])/len(data[data['advertiser_id']==x]) for x in data['advertiser_id'].unique()]))
        camp_success = dict(zip(data.campaign_id.unique(), 
                                [sum(data[data['campaign_id']==x]['clicked'])/len(data[data['campaign_id']==x]) for x in data['campaign_id'].unique()]))

        data['campaign_perc'] = data['campaign_id'].map(camp_success)
        data['advertiser_perc'] = data['advertiser_id'].map(advr_success)

        doc_view_freq = dict(zip(page_views.document_id.unique(), [len(page_views[page_views.document_id==x]) for x in page_views.document_id.unique()]))
        data['docx_view_freq'] = data['document_id_x'].map(doc_view_freq)

        # Adding meta data about prior click percentage
        mean_click = np.mean(dataset["clicked"])
        click_success = dict(zip(data.ad_id.unique(), [click_percent(dataset, x, mean_click, 10.0) for x in dataset["ad_id"].unique()] ))
        data['click_perc'] = data['ad_id'].map(click_success)
    
    return data
    
    
click_train_data = format_data(clicks_train)
#data_test = format_data(clicks_test) # doesn't work with click_test yet


In [36]:
click_train_data.head()

Unnamed: 0,display_id,ad_id,clicked,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,...,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100,campaign_perc,advertiser_perc,docx_view_freq,click_perc
0,37,70153,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,933716,7516,...,0,0,0,0,0,0,0.045455,0.046069,7701,0.052632
1,37,149047,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1169985,16636,...,0,0,0,0,0,0,0.137931,0.137931,7701,0.076923
2,37,169564,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1394819,20109,...,0,0,0,0,0,0,0.0,0.0,7701,0.071429
3,37,234713,1,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1586431,245,...,0,0,0,0,0,0,0.265217,0.271255,7701,0.28125
4,37,235443,0,d4f62cdcb39ad8,1779285,2687,2,US>WA>819,1377696,11654,...,0,0,0,0,0,0,0.043478,0.082631,7701,0.043478


Now we are merging information on what documents the ads referred to (from source: promoted_content).  
In every display, there are multiple ads (within one document = document_id_x). Every ad refers to a different document, which is the site the ad is promoting (document_id_y). All the columns after document_id_y are information about that document (to which the ad is referring).

### Merge information about the documents the ads refer to
All the doc files have information about the documents (websites) to which the ads refer to
including confidence levels of which topics the ads referred to, which categories they're apart of, etc.

We wanted to duplicate the idea of the CountVectorizer for the 'bag of words' model we used for spam detection, but since we're not counting words in a text, it's a little bit different. Since we have a 'dictionary' of categories and topics, we use that as our 'vocabulary.' Every document has a confidence level for one or more items in the vocabulary, so we create a sparse matrix with every topic and category as columns, and every document has a confidence level value in the respective columns. If they are not given a confidence level, we put 0 because the document most likely does not have anything to do with that category or topic (given the data provided by Outbrain).

This data on the documents will help us separate ads from one another based on topic/category.  
ie) why did ad A get clicked instead of ad B? We know ad A referred to document 1 whereas ad B referred to document 2, and now we have general information about the documents the ads referred to. We will merge this information in later steps.

creating dictionaries for % of ads clicked for every advertiser and campaign.  
purpose: merge to master dataset as a feature for every ad, how often the advertiser and campaign are successful on average.

Add count of page views to every document that an ad appears in (document_id_x) as a feature, could tell us something about likelihood of ads being clicked

# Click_train Predictions

In [12]:
doc_ids = set(page_views_og['document_id']) & set(promoted_content_og['document_id'])
# pull in the content that is in both page_views and promoted_content
events_test = events_og[events_og['document_id'].isin(doc_ids)]
clicks_test = clicks_test_og[clicks_test_og['display_id'].isin(events_test['display_id'])]
events_test = events_test[events_test['display_id'].isin(clicks_test['display_id'])]
promoted_content_test = promoted_content_og[promoted_content_og['ad_id'].isin(clicks_test['ad_id'])]
doc_cats_test = doc_cats_og[doc_cats_og['document_id'].isin(promoted_content_test['document_id'])]
doc_ents_test = doc_ents_og[doc_ents_og['document_id'].isin(promoted_content_test['document_id'])]
doc_meta_test = doc_meta_og[doc_meta_og['document_id'].isin(promoted_content_test['document_id'])]
doc_topics_test = doc_topics_og[doc_topics_og['document_id'].isin(promoted_content_test['document_id'])]
page_views_test = page_views_og[page_views_og['document_id'].isin(events_test['document_id'])]

In [13]:
def click_percent(dataset, ad_id, default_result, reg):
    '''Returns the posterior probability of ad being clicked.
    If ad has not been encountered before, assume mean click
    
    dataset is the TRAINING data'''
    
    # count number of times ad has been seen
    ad_total = len(dataset[dataset['ad_id'] == ad_id])
    
    # if ad has not been seen, returned the default_results
    if ad_total == 0:
        return default_result
    # otherwise return percentage of times ad has been clicked, adjusted by a regularization term
    else:
        click_sum = np.sum(dataset[dataset['ad_id'] == ad_id].clicked) + 1.0
        return click_sum / (ad_total + reg)
    
def format_test_data(dataset, train):

    # Merging information aout the displays to master dataset
    data = dataset.merge(events_test, on='display_id', how='left')
    # joins information about the display that the user saw
    # each display has a unique user id, doc id, and timestamp
    # events has the information about the display (who the user is, which site (document_id) it was on, when it was seen, from where, etc.)

    # Identifying which documents the ads refer to (aka destination documents)

    data = data.merge(promoted_content_test, on='ad_id', how='left')

    # Gather/bin data about the documents the ads refer to

    sparsetop = doc_topics.pivot(index='document_id', 
                                 columns='topic_id', 
                                 values='confidence_level')
    sparsetop.columns = ['top_' + str(col) for col in sparsetop.columns]

    sparsecat = doc_cats.pivot(index='document_id', 
                               columns='category_id', 
                               values='confidence_level')
    sparsecat.columns = ['cat_' + str(col) for col in sparsecat.columns]

    sparse = sparsetop.join(sparsecat, how='outer')
    sparse.fillna(0, inplace=True)

    sparse.reset_index(level=0, inplace=True)

    data = data.merge(sparse, 
                      left_on='document_id_y', 
                      right_on='document_id', 
                      how='left')
    
    # Adding meta data about the advertiser and campaign successes
    advr_success = dict(zip(data.advertiser_id.unique(), 
                            [(sum(train[train['advertiser_id']==x]['clicked'])/len(train[train['advertiser_id']==x])) if (len(train[train['advertiser_id']==x]) != 0) else np.mean(train['clicked']) for x in data['advertiser_id'].unique()]))
    camp_success = dict(zip(data.campaign_id.unique(), 
                            [sum(train[train['campaign_id']==x]['clicked'])/len(train[train['campaign_id']==x]) if (len(train[train['campaign_id']==x]) != 0) else np.mean(train['clicked']) for x in data['campaign_id'].unique()]))

    data['campaign_perc'] = data['campaign_id'].map(camp_success)
    data['advertiser_perc'] = data['advertiser_id'].map(advr_success)

    doc_view_freq = dict(zip(page_views_test.document_id.unique(), [len(page_views_test[page_views_test.document_id==x]) for x in page_views_test.document_id.unique()]))
    data['docx_view_freq'] = data['document_id_x'].map(doc_view_freq)

    # Adding meta data about prior click percentage
    mean_click = np.mean(train["clicked"])
    click_success = dict(zip(data.ad_id.unique(), [click_percent(train, x, mean_click, 10.0) for x in dataset["ad_id"].unique()] ))
    data['click_perc'] = data['ad_id'].map(click_success)
    
    return data
    
    
test_data = format_test_data(clicks_test, click_train_data)

In [14]:
test_data.head()

Unnamed: 0,display_id,ad_id,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,advertiser_id,...,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100,campaign_perc,advertiser_perc,docx_view_freq,click_perc
0,16875069,57252,bc19371a0fb69b,1649400,182688,3,US>CA>803,758162,7659,285,...,0,0,0,0,0,0,0.0,0.114253,9943,0.052632
1,16875069,61159,bc19371a0fb69b,1649400,182688,3,US>CA>803,992370,7283,1919,...,0,0,0,0,0,0,0.011401,0.013349,9943,0.014599
2,16875069,132857,bc19371a0fb69b,1649400,182688,3,US>CA>803,1160307,17030,285,...,0,0,0,0,0,0,0.144444,0.114253,9943,0.140449
3,16875069,153427,bc19371a0fb69b,1649400,182688,3,US>CA>803,987161,19525,1913,...,0,0,0,0,0,0,0.071542,0.044244,9943,0.075163
4,16875069,228959,bc19371a0fb69b,1649400,182688,3,US>CA>803,1141938,25174,2712,...,0,0,0,0,0,0,0.230007,0.221762,9943,0.229416


# Make training and test sets

In [37]:
def dev_train_split(data, train_percent = False):
    # Splitting dataset into data and labels

    labels = data['clicked']
    labels = labels.values.reshape(-1,1)
    del data['clicked']

    print('Labels length:', len(labels))
    print('data length:', data.shape)

    # Making training and test set splits
    # if not split defined, assume no split desired.
    # split_percent = 1 means test results will be empty
    if not train_percent:
        train_percent = 1
        
    train_data = data[:int(train_percent*len(data))]
    dev_data = data[int(train_percent*len(data)):]

    train_labels = labels[:int(train_percent*len(data))]
    dev_labels = labels[int(train_percent*len(data)):]

    print('training label shape: ', train_labels.shape)
    print('training data shape: ' , train_data.shape)
    print('dev label shape:', dev_labels.shape)
    print ('dev data shape:', dev_data.shape)
    
    return train_data, train_labels, dev_data, dev_labels

train_data, train_labels, dev_data, dev_labels = dev_train_split(click_train_data, train_percent = 0.7)

Labels length: 282205
data length: (282205, 410)
training label shape:  (197543, 1)
training data shape:  (197543, 410)
dev label shape: (84662, 1)
dev data shape: (84662, 410)


## What the files are now called:
dev_data -> the development split of the training data
dev_labels -> labels for that

train_data -> the training split of the training_data
train_lables -> the labels for that

test_data -> THE ACTUAL CLICKS TEST TEST data!


In [39]:
test_data.head()

Unnamed: 0,display_id,ad_id,uuid,document_id_x,timestamp,platform,geo_location,document_id_y,campaign_id,advertiser_id,...,cat_2002,cat_2003,cat_2004,cat_2005,cat_2006,cat_2100,campaign_perc,advertiser_perc,docx_view_freq,click_perc
197543,12392612,332065,f7bc18373c9d55,1854517,824018058,2,GB>J9,1710793,29309,210,...,0,0,0,0,0,0,0.044444,0.0664,222,0.037736
197544,12392612,377111,f7bc18373c9d55,1854517,824018058,2,GB>J9,1913629,104,162,...,0,0,0,0,0,0,0.061311,0.061324,222,0.05
197545,12392612,466261,f7bc18373c9d55,1854517,824018058,2,GB>J9,2288852,246,232,...,0,0,0,0,0,0,0.4,0.271255,222,0.090909
197546,12392612,489002,f7bc18373c9d55,1854517,824018058,2,GB>J9,2392456,104,162,...,0,0,0,0,0,0,0.061311,0.061324,222,0.090909
197547,12392620,86365,52990e5aa41541,1649400,824018294,3,GB>H3,1105913,11172,2222,...,0,0,0,0,0,0,0.202265,0.252889,9943,0.201867


# Following Homework2

In [40]:
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [41]:
def lr_prediction(dev_data, dev_labels, train_data, train_labels):
    '''Returns the array of display_id, ad_id and probability it will be clicked'''
    lr = LogisticRegression()
    lr.fit(train_data[train_data.columns[11:]].fillna(0), train_labels)
    lr_click_prob = lr.predict_proba(dev_data[dev_data.columns[11:]].fillna(0))[:,1]
    lr_score = lr.score(dev_data[dev_data.columns[11:]].fillna(0), dev_labels)
    return pd.DataFrame({'display_id': dev_data['display_id'],
                         'ad_id': dev_data['ad_id'],
                         'prediction': lr_click_prob
                        })

In [44]:
def lr_test_prediction(test_data, train_data, train_labels):
    '''As above but works on the unlabeled test data
    Returns the array of display_id, ad_id and probability it will be clicked'''
    lr = LogisticRegression()
    lr.fit(train_data[train_data.columns[11:]].fillna(0), train_labels)
    lr_click_prob = lr.predict_proba(dev_data[dev_data.columns[11:]].fillna(0))[:,1]
    return pd.DataFrame({'display_id': dev_data['display_id'],
                         'ad_id': dev_data['ad_id'],
                         'prediction': lr_click_prob
                        })

In [49]:
lr_output = lr_test_prediction(test_data, train_data, train_labels)
lr_output.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,ad_id,display_id,prediction
197543,332065,12392612,0.074205
197544,377111,12392612,0.071809
197545,466261,12392612,0.245085
197546,489002,12392612,0.087257
197547,86365,12392620,0.197833


In [61]:
# Format the data the way the submission requires
lr_output.sort_values(['display_id','prediction'],inplace=True,ascending=False)
output=lr_output.groupby(['display_id'])['ad_id'].apply(lambda x:' '.join(map(str,x))).reset_index()
output.head()
# That's it for the simple solution (prior expectation)!
output.to_csv('simple_test_solution.cvs',index=False)