In [37]:
# for sentiment analysis using bag of words
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import sklearn
import string
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [38]:
import pandas as pd
import requests
import json
import pickle

pd.set_option('display.max_rows', None)

In [39]:
#Reads in file

f = open("Ads5000", "rb")
ads = pickle.load(f)
f.close() 

In [40]:
df = pd.DataFrame(ads)

In [41]:
df.head()

Unnamed: 0,impressions,demographic_distribution,ad_creation_time,ad_delivery_start_time,delivery_by_region,estimated_audience_size,page_name,publisher_platforms,spend,id,ad_creative_link_description,ad_delivery_stop_time
0,"{'lower_bound': '1000', 'upper_bound': '1999'}","[{'percentage': '0.004557', 'age': '18-24', 'g...",2022-01-25,2022-01-25,"[{'percentage': '0.016578', 'region': 'Alabama...",{'lower_bound': '1000001'},MomsRising.org,"[facebook, instagram]","{'lower_bound': '0', 'upper_bound': '99'}",510093387113722,,
1,"{'lower_bound': '2000', 'upper_bound': '2999'}","[{'percentage': '0.000519', 'age': '25-34', 'g...",2022-01-25,2022-01-25,"[{'percentage': '0.026604', 'region': 'Alabama...",{'lower_bound': '1000001'},CPAC 2022,[facebook],"{'lower_bound': '0', 'upper_bound': '99'}",1010286123213947,,
2,"{'lower_bound': '0', 'upper_bound': '999'}",,2022-01-25,2022-01-25,,"{'lower_bound': '10001', 'upper_bound': '50000'}",David Livingston Scott County Magistrate- 7th ...,[facebook],"{'lower_bound': '0', 'upper_bound': '99'}",726465978738772,,
3,"{'lower_bound': '0', 'upper_bound': '999'}","[{'percentage': '0.003436', 'age': '18-24', 'g...",2022-01-25,2022-01-25,"[{'percentage': '1', 'region': 'Florida'}]",{'lower_bound': '1000001'},Miami's Community Newspapers,"[facebook, instagram]","{'lower_bound': '0', 'upper_bound': '99'}",1182151668859503,,
4,"{'lower_bound': '0', 'upper_bound': '999'}","[{'percentage': '0.021875', 'age': '25-34', 'g...",2022-01-21,2022-01-25,"[{'percentage': '0.025316', 'region': 'Alabama...",{'lower_bound': '1000001'},Pew Research Center,"[facebook, instagram]","{'lower_bound': '0', 'upper_bound': '99'}",466037478231880,,


In [57]:
# drops all ads with no description and turns it back into json 

df_with_only_descriptions = df.dropna()
df_json = df_with_only_descriptions.to_json(orient='records')
data = json.loads(df_json)

In [44]:
# counts number of unique words 

wordCount = defaultdict(int)
for d in data:
    for w in d['ad_creative_link_description'].split():
        wordCount[w] += 1

len(wordCount)

6405

In [56]:
# Removes capitallization / punctuations
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['ad_creative_link_description'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

len(wordCount)

4779

In [55]:
# With stemming. Removes different inflections of words.

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
  r = ''.join([c for c in d['ad_creative_link_description'].lower() if not c in punctuation])
  for w in r.split():
    w = stemmer.stem(w)
    wordCount[w] += 1
    
len(wordCount)

3874

In [47]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
  r = ''.join([c for c in d['ad_creative_link_description'].lower() if not c in punctuation])
  for w in r.split():
    wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [54]:
# discards extremely rare words
words = [x[1] for x in counts[:1000]]

# Sentiment Analysis 

In [49]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [50]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['ad_creative_link_description'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [51]:
X = [feature(d) for d in data]
y = [int(d['impressions']['lower_bound']) for d in data]

In [52]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [53]:
predictions

array([  5770.92918977,  22374.81427926,  -5097.34772491, ...,
        58827.12173621,  58827.12173621, 467982.08019867])


# N Grams

In [70]:
# Extract n-grams up to length 5 (same dataset as example above)

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['ad_creative_link_description'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
    ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
    for w in ws + ws2 + ws3 + ws4 + ws5:
        wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [59]:
words = [x[1] for x in counts[:1000]]

In [69]:
# A few of our 1000 most popular n-grams. Note the combination of n-grams of different lengths
words[200:210]

['a gift to ajws',
 'a gift to',
 '1231 and your 100 taxdeductible',
 '1231 and your 100',
 '1231 and your',
 '1231 and',
 '1231',
 '100 taxdeductible gift will be',
 '100 taxdeductible gift will',
 '100 taxdeductible gift']

In [61]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [62]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['ad_creative_link_description'].lower() if not c in punctuation])
    ws = r.split()
    ws2 = [' '.join(x) for x in list(zip(ws[:-1],ws[1:]))]
    ws3 = [' '.join(x) for x in list(zip(ws[:-2],ws[1:-1],ws[2:]))]
    ws4 = [' '.join(x) for x in list(zip(ws[:-3],ws[1:-2],ws[2:-1],ws[3:]))]
    ws5 = [' '.join(x) for x in list(zip(ws[:-4],ws[1:-3],ws[2:-2],ws[3:-1],ws[4:]))]
    for w in ws + ws2 + ws3 + ws4 + ws5:
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [64]:
# Same as the model in the previous example above, except using n-grams rather than just unigrams

X = [feature(d) for d in data]
y = [int(d['impressions']['lower_bound']) for d in data]

In [65]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [66]:
wordSort = list(zip(theta[:-1], words))
wordSort.sort()

Some of the most negative and positive n-grams...

In [67]:
wordSort[:20]

[(-98043.30879170538, 'together we'),
 (-75438.66061442014, 'domestic'),
 (-69887.0719411429, 'vaccine mandates'),
 (-69726.24156172249, 'to provide'),
 (-63835.36882883463, 'both'),
 (-62757.826287633245, 'safe free'),
 (-62189.12375049295, 'a donation'),
 (-61603.05891771719, 'or make'),
 (-56227.95289931103, 'states'),
 (-55118.762467773944, 'join us'),
 (-53907.26427544211, 'whether'),
 (-53614.31298304524, 'make a donation'),
 (-53364.57613736314, 'stand up'),
 (-52600.66208324801, 'there is'),
 (-50968.03664273237, 'fox'),
 (-48939.88680654306, 'make a'),
 (-46780.74612438378, 'country'),
 (-46125.753116718835, 'during'),
 (-46016.03546730052, 'coronavirus'),
 (-45830.92502005108, 'parents to')]

In [71]:
wordSort[-20:]

[(48873.80262910532, 'in 2022'),
 (49633.61600079367, 'sweeping'),
 (52591.06487668196, 'council'),
 (52922.15283050836, 'hold'),
 (59324.96695603222, '👉'),
 (63615.68602856734, 'covid19 vaccines'),
 (63926.3249255196, 'public'),
 (67046.45052052898, 'we are'),
 (75935.97834156317, 'businesses'),
 (79627.22663241245, 'bidens'),
 (79743.82239967112, 'today'),
 (85408.73429876266, 'possible'),
 (93743.29100038276, 'questions'),
 (94871.01365132128, 'mandates'),
 (102765.88099914405, 'stand'),
 (115554.06938346254, 'future'),
 (133470.60798106572, 'provide'),
 (142086.1352081151, 'next'),
 (154875.9037090494, 'a new'),
 (154898.2553384335, 'supporting')]