In [1]:
import pickle
from datetime import datetime
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import matplotlib
# Global plot styling: 8 pt sans-serif text with Helvetica as the preferred
# face, and font type 42 for fonts written into PDF output.
matplotlib.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Helvetica'],
    'font.size': 8,
    'pdf.fonttype': 42,
})


In [14]:
# load data
# NOTE: pickle.load can execute arbitrary code stored in the file, so this
# must only ever be pointed at a pickle from a trusted source.
with open("all_ads_labeled.pickle", "rb") as file_in:
    # the context manager closes the handle even if unpickling raises
    dat = pickle.load(file_in)

print("number of ads", dat.shape)

number of ads (602544, 17)


In [2]:
# Read ad_data.csv into the `data` DataFrame used by the cells below.
data = pd.read_csv(filepath_or_buffer="ad_data.csv")

In [3]:
# Report the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = data.shape
print("number of ads", (n_rows, n_cols))

number of ads (602544, 17)


In [33]:
# Put the pandas display limits (rows, columns, column width) back to their defaults.
for option_name in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.reset_option(option_name)

In [41]:
# Preview the last five rows of the frame.
data.iloc[-5:]

Unnamed: 0,ad_creation_time,ad_creative_body,spend,impressions,delivery_by_region,demographic_distribution,page_id,page_name,bylines,id,spend_lo,spend_hi,impressions_lo,impressions_hi,label,party,state
602539,2018-08-30,It’s Mike’s Birthday and he wants a clean plan...,49.5,499.5,"[{'percentage': '1', 'region': 'Pennsylvania'}]","[{'percentage': '0.002494', 'age': '18-24', 'g...",174750213023790,"Mike Doyle for PA Representative, 170th District",Friends To Elect Mike Doyle,228443758015782,0,99,0,999,non-climate,Democrat,PA
602540,2018-08-30,"Send Mike a ""birthday"" gift for his 41st birth...",49.5,1499.5,"[{'percentage': '1', 'region': 'Pennsylvania'}]","[{'percentage': '0.001106', 'age': '18-24', 'g...",174750213023790,"Mike Doyle for PA Representative, 170th District",Friends To Elect Mike Doyle,470785400088043,0,99,1000,1999,non-climate,Democrat,PA
602541,2018-08-24,Mike's turning 41! Help him celebrate his birt...,49.5,499.5,"[{'percentage': '1', 'region': 'Pennsylvania'}]","[{'percentage': '0.003205', 'age': '18-24', 'g...",174750213023790,"Mike Doyle for PA Representative, 170th District",Friends To Elect Mike Doyle,382470918955364,0,99,0,999,non-climate,Democrat,PA
602542,2018-08-22,Hope to see everyone tonight!,49.5,499.5,"[{'percentage': '1', 'region': 'Pennsylvania'}]","[{'percentage': '0.066806', 'age': '25-34', 'g...",174750213023790,"Mike Doyle for PA Representative, 170th District",Friends To Elect Mike Doyle,457482704660129,0,99,0,999,non-climate,Democrat,PA
602543,2021-10-08,I am committed to helping middle-class familie...,449.5,12499.5,"[{'percentage': '1', 'region': 'California'}]","[{'percentage': '0.001772', 'age': '65+', 'gen...",182567716746,Congressman John Garamendi,Office of Congressman John Garamendi,970233850221789,400,499,10000,14999,non-climate,Democrat,CA


In [35]:
# Print a per-column summary of `data`: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602544 entries, 0 to 602543
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ad_creation_time          602544 non-null  object 
 1   ad_creative_body          598057 non-null  object 
 2   spend                     602544 non-null  float64
 3   impressions               602544 non-null  float64
 4   delivery_by_region        463401 non-null  object 
 5   demographic_distribution  463401 non-null  object 
 6   page_id                   602544 non-null  int64  
 7   page_name                 602544 non-null  object 
 8   bylines                   600813 non-null  object 
 9   id                        602544 non-null  int64  
 10  spend_lo                  602544 non-null  int64  
 11  spend_hi                  602544 non-null  int64  
 12  impressions_lo            602544 non-null  int64  
 13  impressions_hi            602544 non-null  i

In [36]:
# Count the missing values in each column.
data.isna().sum()

ad_creation_time                 0
ad_creative_body              4487
spend                            0
impressions                      0
delivery_by_region          139143
demographic_distribution    139143
page_id                          0
page_name                        0
bylines                       1731
id                               0
spend_lo                         0
spend_hi                         0
impressions_lo                   0
impressions_hi                   0
label                            0
party                            0
state                            0
dtype: int64

In [37]:
# Count the distinct non-null values in each column.
data.nunique(axis=0, dropna=True)

ad_creation_time              1238
ad_creative_body             60635
spend                           42
impressions                     39
delivery_by_region          337533
demographic_distribution    462909
page_id                        520
page_name                      555
bylines                        957
id                          602544
spend_lo                        42
spend_hi                        42
impressions_lo                  39
impressions_hi                  39
label                            2
party                            3
state                           54
dtype: int64

## Cleaning the data

In [12]:
# Removing nans and empty strings (ad_creative_body, demographic_distribution, delivery_by_region)

# filter out ads with no text in ad_creative_body (e.g. they are probably videos), removes 4487 ads (0.74%)
data = data[data['ad_creative_body'].notna()]
# filter out ads which just have empty space as text, e.g. '  ' (no 'real' text content), removes 40 in total out of (602544)
data = data[data['ad_creative_body'].str.strip() != '']
# filter out ads that do not have any targeting info (removes 138289 ads - 22.95% of all ads) - see analysis below
#data = data[~((data['demographic_distribution'].isna()) | (data['delivery_by_region'].isna()))]

# remove ads where demographic_distribution = [{'percentage': '1'}], not sure what that means, removes 6 ads
# `data` is read from CSV, so this column holds strings rather than lists; comparing
# only against the list never matches and silently keeps those ads. Match the list
# form and its string form so the filter works for either representation.
# NOTE(review): assumes the CSV stores the value as Python's repr of the list — confirm.
bare_percentage = [{'percentage': '1'}]
is_bare_percentage = data['demographic_distribution'].map(
    lambda dist: dist == bare_percentage or dist == str(bare_percentage)
).astype(bool)
data = data[~is_bare_percentage]

# relabel Independents as Democrats - as the two independent politicians caucus with the Democrats
# Assign the result back instead of calling replace(..., inplace=True) on a column
# selection: that form acts on an intermediate object and is not guaranteed to
# update `data` (it warns in recent pandas and is a no-op under Copy-on-Write).
data = data.assign(party=data['party'].replace('Independent', 'Democrat'))

In [13]:
# Report the (rows, columns) shape of the frame once the basic filters have run.
shape_after_cleaning = data.shape
print("Number of ads after basic cleaning", shape_after_cleaning)

Number of ads after basic cleaning (598017, 17)


In [45]:
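# remove all ads with low spend and low impressions
# NOTE(review): the filter below is commented out, but the saved output and the next
# cell's `dat.info()` come from a run where it was active; on a fresh kernel `dat` is
# only bound by the pickle-loading cell - confirm which frame is intended.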

#dat = data[(data["spend_lo"] != 0) & (data["impressions_lo"] != 0)]

#print("Number of ads part 2", dat.shape)

Number of ads part 2 (139944, 17)


In [46]:
# Print a per-column summary of `dat`: column names, non-null counts and dtypes.
# NOTE(review): in the visible cells `dat` is only assigned by the pickle-loading
# cell; the filtered assignment in the preceding cell is commented out, so a fresh
# Restart & Run All would not reproduce the saved output - confirm the intended frame.
dat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 139944 entries, 67 to 602543
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ad_creation_time          139944 non-null  object 
 1   ad_creative_body          139944 non-null  object 
 2   spend                     139944 non-null  float64
 3   impressions               139944 non-null  float64
 4   delivery_by_region        139944 non-null  object 
 5   demographic_distribution  139944 non-null  object 
 6   page_id                   139944 non-null  int64  
 7   page_name                 139944 non-null  object 
 8   bylines                   139663 non-null  object 
 9   id                        139944 non-null  int64  
 10  spend_lo                  139944 non-null  int64  
 11  spend_hi                  139944 non-null  int64  
 12  impressions_lo            139944 non-null  int64  
 13  impressions_hi            139944 non-null  int64