In [1]:
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [2]:
import pandas as pd
import requests
import json
import pickle

pd.set_option('display.max_rows', None)

In [3]:
#Reads in Ads10000_election file

f = open("Ads10000_election", "rb")
ads_election = pickle.load(f)
f.close() 

In [4]:
#Reads in Ads10000_vaccine file

g = open("Ads10000_vaccine", "rb")
ads_vaccine = pickle.load(g)
g.close() 

In [5]:
df_election = pd.DataFrame(ads_election)
df_election.head()

Unnamed: 0,ad_creation_time,ad_creative_bodies,ad_creative_link_captions,ad_creative_link_descriptions,ad_creative_link_titles,ad_delivery_start_time,ad_delivery_stop_time,estimated_audience_size,impressions,spend,id,delivery_by_region,demographic_distribution
0,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-04,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",711213506429873,,
1,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-01,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",127634835910715,,
2,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-04,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",3821823704574505,,
3,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-03,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",745901799329077,,
4,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-03,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",172031037686911,,


In [6]:
# drops all ads with no description and turns it back into json 

def df_to_json(df):
    # drops all NaN data
    dropped_NaN = df.dropna()
    
    # converts df to json
    json = dropped_NaN.to_json(orient='records')
    dataset = eval(json)
    return dataset

In [7]:
# json of df_election
election_dataset = df_to_json(df_election)
len(election_dataset)

4131

In [8]:
# lowering and removing punctuation 
# counts number of occurences of each word, stored in wordCount
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in election_dataset:
    r = ''.join([c for c in d['ad_creative_bodies'][0].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1
len(wordCount)

6543

In [9]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

In [10]:
words = [x[1] for x in counts[:]]

In [11]:
# gets frequency of each word in the entire dataset 
df = defaultdict(int)
for d in election_dataset:
    r = ''.join([c for c in d['ad_creative_bodies'][0].lower() if not c in punctuation])
    for w in set(r.split()):
        df[w] += 1

In [12]:
rev = election_dataset[0]

Getting TF - IDF for a single document

In [13]:
tf = defaultdict(int)
r = ''.join([c for c in rev['ad_creative_bodies'][0].lower() if not c in punctuation])
for w in r.split():
    # Note = rather than +=, different versions of tf could be used instead
    tf[w] = 1
    
tfidf = dict(zip(words,[tf[w] * math.log2(len(election_dataset) / df[w]) for w in words]))
tfidfQuery = [tf[w] * math.log2(len(election_dataset) / df[w]) for w in words]

In [14]:
maxTf = [(tf[w],w) for w in words]
maxTf.sort(reverse=True)
maxTfIdf = [(tfidf[w],w) for w in words]
maxTfIdf.sort(reverse=True)

In [15]:
# words with highest TF-IDF
maxTfIdf[:10]

[(12.01227534485612, 'tennessee’s'),
 (12.01227534485612, 'tennesseans'),
 (12.01227534485612, 'tdh'),
 (12.01227534485612, 'reminds'),
 (12.01227534485612, 'prioritizes'),
 (12.01227534485612, 'metropolitan'),
 (12.01227534485612, 'instructions'),
 (12.01227534485612, 'httpscovid19tngovcovid19vaccinesvaccinephases'),
 (12.01227534485612,
  'httpscovid19tngovcovid19vaccinescountyvaccineinformation'),
 (12.01227534485612, 'feb')]

In [16]:
maxTfIdf[:-10:-1]
range(10)

range(0, 10)

In [17]:
my_dict = {}
stopwords = ['http','html','www','\\']

for x in range(len(election_dataset)):
    
    rev = election_dataset[x]
    tf = defaultdict(int)
    r = ''.join([c for c in rev['ad_creative_bodies'][0].lower() if not c in punctuation])
    
    for w in r.split():
    # Note = rather than +=, different versions of tf could be used instead
        tf[w] = 1
    
    tfidf = dict(zip(words,[tf[w] * math.log2(len(election_dataset) / df[w]) for w in words]))
    tfidfQuery = [tf[w] * math.log2(len(election_dataset) / df[w]) for w in words]

    maxTf = [(tf[w],w) for w in words]
    maxTf.sort(reverse=True)
    maxTfIdf = [(tfidf[w],w) for w in words]
    maxTfIdf.sort(reverse=True)
    
    tf_value = maxTfIdf[0][0]
    tf_list = []
    
    for i in range(len(maxTfIdf)):
        
        if maxTfIdf[i][0] != tf_value:
            break
        tf_list.append(maxTfIdf[i][1])
    tf_list = [ele for ele in tf_list if all(ch not in ele for ch in stopwords)]
        
    my_dict[f'document {x}'] = {'top_TfIdf':tf_list}


In [18]:
len(my_dict)

4131

In [19]:
len(election_dataset)

4131

In [21]:
# find highest TF - IDF of each document 
# groupby each TF - IDF and get the number of times each words was the highest 
# sentiment analysis of a single word 
# compare sentiment analysis of these words to documents that have TF - IDF

# Cosine Similarity 

In [22]:
def Cosine(x1,x2):
    numer = 0
    norm1 = 0
    norm2 = 0
    for a1,a2 in zip(x1,x2):
        numer += a1*a2
        norm1 += a1**2
        norm2 += a2**2
    if norm1*norm2:
        return numer / math.sqrt(norm1*norm2)
    return 0


In [23]:
similarities = []
for rev2 in election_dataset:
    tf = defaultdict(int)
    r = ''.join([c for c in rev2['ad_creative_bodies'][0].lower() if not c in punctuation])
    for w in r.split():
        # Note = rather than +=
        tf[w] = 1
    tfidf2 = [tf[w] * math.log2(len(election_dataset) / df[w]) for w in words]
    similarities.append((Cosine(tfidfQuery, tfidf2), rev2['ad_creative_bodies']))


In [24]:
similarities.sort(reverse=True)

In [27]:
len(similarities)

4131

In [25]:
similarities[:10]

[(1.0,
  ['America, under Trump, is no longer the world leader it used to be.  Instead it\'s "America First" and the last and the only nation.  This approach isn\'t working very well with 80,000 deaths from Covid-19.  And if some other country gets the vaccine first, will they want to share with a country who thinks they\'re the First and the Last and the Only?  Maybe not.  https:\\/\\/www.theguardian.com\\/commentisfree\\/2020\\/may\\/12\\/trump-is-making-america-an-obstacle-in-the-global-fight-against-covid-19']),
 (1.0,
  ['America, under Trump, is no longer the world leader it used to be.  Instead it\'s "America First" and the last and the only nation.  This approach isn\'t working very well with 80,000 deaths from Covid-19.  And if some other country gets the vaccine first, will they want to share with a country who thinks they\'re the First and the Last and the Only?  Maybe not.  https:\\/\\/www.theguardian.com\\/commentisfree\\/2020\\/may\\/12\\/trump-is-making-america-an-obstac