In [37]:
# for sentiment analysis using bag of words
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import sklearn
import string
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [38]:
import pandas as pd
import requests
import json
import pickle

pd.set_option('display.max_rows', None)

In [39]:
#Reads in file

f = open("Ads5000", "rb")
ads = pickle.load(f)
f.close() 

In [40]:
df = pd.DataFrame(ads)

In [41]:
df.head()

Unnamed: 0,impressions,demographic_distribution,ad_creation_time,ad_delivery_start_time,delivery_by_region,estimated_audience_size,page_name,publisher_platforms,spend,id,ad_creative_link_description,ad_delivery_stop_time
0,"{'lower_bound': '1000', 'upper_bound': '1999'}","[{'percentage': '0.004557', 'age': '18-24', 'g...",2022-01-25,2022-01-25,"[{'percentage': '0.016578', 'region': 'Alabama...",{'lower_bound': '1000001'},MomsRising.org,"[facebook, instagram]","{'lower_bound': '0', 'upper_bound': '99'}",510093387113722,,
1,"{'lower_bound': '2000', 'upper_bound': '2999'}","[{'percentage': '0.000519', 'age': '25-34', 'g...",2022-01-25,2022-01-25,"[{'percentage': '0.026604', 'region': 'Alabama...",{'lower_bound': '1000001'},CPAC 2022,[facebook],"{'lower_bound': '0', 'upper_bound': '99'}",1010286123213947,,
2,"{'lower_bound': '0', 'upper_bound': '999'}",,2022-01-25,2022-01-25,,"{'lower_bound': '10001', 'upper_bound': '50000'}",David Livingston Scott County Magistrate- 7th ...,[facebook],"{'lower_bound': '0', 'upper_bound': '99'}",726465978738772,,
3,"{'lower_bound': '0', 'upper_bound': '999'}","[{'percentage': '0.003436', 'age': '18-24', 'g...",2022-01-25,2022-01-25,"[{'percentage': '1', 'region': 'Florida'}]",{'lower_bound': '1000001'},Miami's Community Newspapers,"[facebook, instagram]","{'lower_bound': '0', 'upper_bound': '99'}",1182151668859503,,
4,"{'lower_bound': '0', 'upper_bound': '999'}","[{'percentage': '0.021875', 'age': '25-34', 'g...",2022-01-21,2022-01-25,"[{'percentage': '0.025316', 'region': 'Alabama...",{'lower_bound': '1000001'},Pew Research Center,"[facebook, instagram]","{'lower_bound': '0', 'upper_bound': '99'}",466037478231880,,


In [42]:
# drops all ads with no description and turns it back into json 
df_with_only_descriptions = df.dropna()
df_json = df_with_only_descriptions.to_json(orient='records')
data = json.loads(df_json)

In [44]:
# counts number of unique words 

wordCount = defaultdict(int)
for d in data:
    for w in d['ad_creative_link_description'].split():
        wordCount[w] += 1

len(wordCount)

6405

In [45]:
# Ignore capitalization and remove punctuation
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['ad_creative_link_description'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

len(wordCount)

4779

In [None]:
# with stemming 

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in data:
  r = ''.join([c for c in d['ad_creative_link_description'].lower() if not c in punctuation])
  for w in r.split():
    w = stemmer.stem(w)
    wordCount[w] += 1
    
len(wordCount)

Just build our feature vector by taking the most popular words (lowercase, punctuation removed, but no stemming)

In [13]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
  r = ''.join([c for c in d['ad_creative_link_description'].lower() if not c in punctuation])
  for w in r.split():
    wordCount[w] += 1

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [14]:
words = [x[1] for x in counts[:1000]]

# Sentiment Analysis 

In [15]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [16]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['ad_creative_link_description'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [17]:
X = [feature(d) for d in data]
y = [int(d['impressions']['lower_bound']) for d in data]

In [18]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)