In [1]:
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [2]:
import pandas as pd
import requests
import json
import pickle


In [5]:
# Reads in Ads10000_election file
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.

# The context manager closes the file even if unpickling raises.
with open("Ads10000_election", "rb") as f:
    ads_election = pickle.load(f)

In [6]:
# Reads in Ads10000_vaccine file
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.

# The context manager closes the file even if unpickling raises.
with open("Ads10000_vaccine", "rb") as g:
    ads_vaccine = pickle.load(g)

In [7]:
# Wrap the unpickled election ads in a DataFrame and preview the first rows.
# NOTE(review): df_election is rebound from a CSV file in a later cell (In [16]),
# so this frame is not the one the TF-IDF cells below operate on.
df_election = pd.DataFrame(ads_election)
df_election.head()

Unnamed: 0,ad_creation_time,ad_creative_bodies,ad_creative_link_captions,ad_creative_link_descriptions,ad_creative_link_titles,ad_delivery_start_time,ad_delivery_stop_time,estimated_audience_size,impressions,spend,id,delivery_by_region,demographic_distribution
0,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-04,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",711213506429873,,
1,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-01,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",127634835910715,,
2,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-04,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",3821823704574505,,
3,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-03,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",745901799329077,,
4,2021-02-01,[County closes downtown San Diego vaccine site...,[https://apps.apple.com/us/app/smartnews-local...,[‎SmartNews is the award-winning news app down...,[News for San Diego County],2021-02-01,2021-02-03,{'lower_bound': '1000001'},"{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",172031037686911,,


In [8]:
# Drops every row that has a missing value, then converts the rest to JSON records

def df_to_json(df):
    """Return the complete rows of ``df`` as a list of plain dicts.

    Rows with a missing value in ANY column are removed first
    (``DataFrame.dropna()`` defaults), so the positions in the returned list
    do not necessarily match the row positions of ``df``.

    The frame is serialized with ``to_json(orient='records')`` and parsed back
    with ``json.loads``. Parsing with ``eval`` (as was done previously) is
    incorrect for JSON text: it raises ``NameError`` on ``true``/``false``/
    ``null``, leaves the JSON escape ``\\/`` as a literal backslash-slash in
    every URL, splits non-BMP characters into lone surrogates, and executes
    the data as Python code.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list of dict
        One dict per remaining row, keyed by column name.
    """
    # drops all rows containing NaN data
    complete_rows = df.dropna()

    # converts df to a JSON string, then parses it safely
    records_json = complete_rows.to_json(orient='records')
    return json.loads(records_json)

In [9]:
# Corpus-wide word counts after lowercasing and stripping ASCII punctuation
def simplify(dataset,text_col):
    """Count how many times each word occurs across all records.

    Each record's ``text_col`` value is lowercased, every character in
    ``string.punctuation`` is deleted, and the result is split on whitespace.

    Returns a ``defaultdict(int)`` mapping word -> total number of occurrences.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    totals = defaultdict(int)
    for record in dataset:
        cleaned = record[text_col].lower().translate(strip_punct)
        for token in cleaned.split():
            totals[token] += 1
    return totals

In [10]:
# Document frequency: for each word, the number of records whose text contains it
def term_frequency(dataset,text_col):
    """Count, per word, how many records contain that word at least once.

    Despite the function name this is a *document* frequency: a word repeated
    inside one record is counted once for that record. Text is lowercased and
    stripped of ``string.punctuation`` before being split on whitespace.

    Returns a ``defaultdict(int)`` mapping word -> number of records.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    doc_counts = defaultdict(int)
    for record in dataset:
        cleaned = record[text_col].lower().translate(strip_punct)
        distinct_tokens = set(cleaned.split())
        for token in distinct_tokens:
            doc_counts[token] += 1
    return doc_counts

In [11]:
# returns, per document, the words tied for the highest TF-IDF score
def tf_idf(dataset,text_col,max_ties=20):
    """Find the highest-scoring TF-IDF word(s) of every document.

    Term frequency is binary (1 if the word occurs in the document, else 0),
    so a word's score within a document is simply its inverse document
    frequency ``log2(n_docs / doc_freq[word])``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Passed through ``df_to_json``, which drops rows containing missing
        values; document numbers refer to positions in that filtered list.
    text_col : str
        Name of the column holding the document text.
    max_ties : int, optional
        If more than this many words tie for the top score the document gets
        an empty list (default 20).

    Returns
    -------
    dict
        ``{'doc <i>': {'top_TfIdf': [word, ...]}}`` with the tied words in
        descending lexicographic order. Words containing any of the URL
        fragments ``http``/``html``/``www`` or a backslash, and words with no
        ASCII lowercase letter, are removed from the list.

    Notes
    -----
    Differences from the previous implementation:
    * only the document's own words are scored, instead of building and
      sorting a vector over the whole vocabulary for every document;
    * a document with no words, or whose best score is 0, yields an empty list
      (previously this raised ``IndexError`` on an empty vocabulary, or
      returned vocabulary words that do not occur in the document).
    """
    data = df_to_json(dataset)
    doc_freq = term_frequency(data,text_col)

    # Inverse document frequency, computed once for the whole vocabulary.
    n_docs = len(data)
    idf = {w: math.log2(n_docs / count) for w, count in doc_freq.items()}

    punctuation = set(string.punctuation)
    stopwords = ['http','html','www','\\']
    letters = set(string.ascii_lowercase)
    my_dict = {}

    for x, rev in enumerate(data):
        r = ''.join(c for c in rev[text_col].lower() if c not in punctuation)
        doc_words = set(r.split())

        # Binary tf: the document's top score is the largest idf among its words.
        top_value = max((idf[w] for w in doc_words), default=0)
        if not top_value:
            # Empty text, or every word occurs in all documents: nothing stands out.
            my_dict[f'doc {x}'] = {'top_TfIdf':[]}
            continue

        tied = sorted((w for w in doc_words if idf[w] == top_value), reverse=True)

        # sets doc number to empty list if the number of tied words exceeds max_ties
        if len(tied) > max_ties:
            my_dict[f'doc {x}'] = {'top_TfIdf':[]}
            continue

        final_list = [
            w for w in tied
            if not any(fragment in w for fragment in stopwords)
            and any(ch in letters for ch in w)
        ]
        my_dict[f'doc {x}'] = {'top_TfIdf':final_list}
    return my_dict

# Cosine Similarity 

In [None]:
def Cosine(x1,x2):
    """Cosine similarity of two numeric sequences.

    The sequences are paired element-wise with ``zip``, so any extra trailing
    elements of the longer one are ignored. Returns the integer ``0`` when the
    product of the squared norms is zero (e.g. an all-zero or empty vector).
    """
    pairs = list(zip(x1, x2))
    dot = sum(a * b for a, b in pairs)
    sq_norm1 = sum(a ** 2 for a, _ in pairs)
    sq_norm2 = sum(b ** 2 for _, b in pairs)

    sq_product = sq_norm1 * sq_norm2
    if not sq_product:
        return 0
    return dot / math.sqrt(sq_product)


In [None]:
# Cosine similarity (binary TF-IDF) between one query document and every document
def cosine_similarity(dataset,text_col,query_index=-1):
    """Score every document against a query document using TF-IDF cosine similarity.

    Term frequency is binary (1 if the word occurs in the document, else 0)
    and the weight of a word is ``log2(n_docs / doc_freq[word])``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Passed through ``df_to_json``, which drops rows containing missing values.
    text_col : str
        Name of the column holding the document text.
    query_index : int, optional
        Position (in the filtered document list) of the query document.
        Defaults to ``-1``, the last document, which is the document the
        previous implementation ended up using as its query.

    Returns
    -------
    list of (float, str)
        ``(similarity, text)`` for every document, in document order.
        An empty list when no documents remain after filtering.

    Raises
    ------
    IndexError
        If ``query_index`` is out of range for the filtered documents.

    Notes
    -----
    Fixes relative to the previous implementation:
    * each document's full text is tokenized; before, ``rev2[text_col][0]``
      used only the first character of the text;
    * the query vector is built once instead of being recomputed for every
      document and discarded;
    * empty input returns ``[]`` instead of raising ``NameError``.
    """
    data = df_to_json(dataset)
    if not data:
        return []

    # Vocabulary ordered by descending (count, word), as before.
    wordCount = simplify(data,text_col)
    ranked = sorted(((count, w) for w, count in wordCount.items()), reverse=True)
    words = [w for _, w in ranked]

    doc_freq = term_frequency(data,text_col)
    n_docs = len(data)
    idf = [math.log2(n_docs / doc_freq[w]) for w in words]
    punctuation = set(string.punctuation)

    def tfidf_vector(text):
        # Binary tf: a word contributes its idf if present, 0 otherwise.
        cleaned = ''.join(c for c in text.lower() if c not in punctuation)
        present = set(cleaned.split())
        return [weight if w in present else 0.0 for w, weight in zip(words, idf)]

    tfidfQuery = tfidf_vector(data[query_index][text_col])

    similarities = []
    for rev2 in data:
        tfidf2 = tfidf_vector(rev2[text_col])
        similarities.append((Cosine(tfidfQuery, tfidf2), rev2[text_col]))
    return similarities


In [16]:
# Load the election ad sample from CSV, using the first CSV column as the index.
# NOTE(review): this rebinds df_election (first built from the pickle in In [7])
# and relies on an absolute, machine-specific path.
df_election = pd.read_csv('/Users/jpuray/FB_Ads/Data/Ads_election_sample.csv', index_col=0)

In [17]:
# Load the vaccine ad sample from CSV, using the first CSV column as the index.
# NOTE(review): relies on an absolute, machine-specific path.
df_vaccine = pd.read_csv('/Users/jpuray/FB_Ads/Data/Ads_vaccine_sample.csv', index_col=0)

In [18]:
# Top TF-IDF word(s) per election ad, computed from the 'ad_creative_bodies' column.
election_tf_idf = tf_idf(df_election,'ad_creative_bodies')

In [19]:
# Top TF-IDF word(s) per vaccine ad, computed from the 'ad_creative_bodies' column.
vaccine_tf_idf = tf_idf(df_vaccine,'ad_creative_bodies')

In [23]:
# Display the full {'doc <i>': {'top_TfIdf': [...]}} mapping for the election sample.
# NOTE(review): execution counts are out of order (this cell is In [23], the next is In [21]).
election_tf_idf

{'doc 0': {'top_TfIdf': ['“monroeville',
   'ye',
   'triblive',
   'property”',
   'multiunit',
   'lots”',
   'family\xadoriented',
   'exploration',
   'condos”',
   'applies']},
 'doc 1': {'top_TfIdf': ['bluffton', 'beaufort']},
 'doc 2': {'top_TfIdf': ['schaer']},
 'doc 3': {'top_TfIdf': ['houghtaling', 'downey']},
 'doc 4': {'top_TfIdf': ['bill4bremerton']},
 'doc 5': {'top_TfIdf': ['sangiolo', 'mah']},
 'doc 6': {'top_TfIdf': ['vulnerability',
   'staton',
   'professing',
   'browns',
   'brene']},
 'doc 7': {'top_TfIdf': ['dawn', 'addiego']},
 'doc 8': {'top_TfIdf': ['storming', 'mold']},
 'doc 9': {'top_TfIdf': []},
 'doc 10': {'top_TfIdf': ['obligations']},
 'doc 11': {'top_TfIdf': ['out—vote', 'citizenled']},
 'doc 12': {'top_TfIdf': ['iwillvotecomlocate']},
 'doc 13': {'top_TfIdf': ['tues']},
 'doc 14': {'top_TfIdf': ['noise', 'misconceptions', 'clarify']},
 'doc 15': {'top_TfIdf': ['voteteamhelen',
   'teamhelen4rihboe',
   'teamhelen',
   'studentathletes',
   'rih',
   

In [21]:
# Display the full {'doc <i>': {'top_TfIdf': [...]}} mapping for the vaccine sample.
vaccine_tf_idf

{'doc 0': {'top_TfIdf': ['“vaccines”', 'lastly', 'defendingyourfreedoms']},
 'doc 1': {'top_TfIdf': ['sexualassault',
   'sexed',
   'saudiarabia',
   'manatee',
   'caseygoodson',
   'arctic']},
 'doc 2': {'top_TfIdf': ['friendlier']},
 'doc 3': {'top_TfIdf': ['plata']},
 'doc 4': {'top_TfIdf': ['kelly’s']},
 'doc 5': {'top_TfIdf': ['restraints', 'courtpacking']},
 'doc 6': {'top_TfIdf': []},
 'doc 7': {'top_TfIdf': ['withdrawn']},
 'doc 8': {'top_TfIdf': ['upbeat']},
 'doc 9': {'top_TfIdf': ['citizen’s']},
 'doc 10': {'top_TfIdf': ['katrina', 'foley']},
 'doc 11': {'top_TfIdf': ['weacorg', 'confidently']},
 'doc 12': {'top_TfIdf': ['infiltration']},
 'doc 13': {'top_TfIdf': ['matters…period', 'enough…freedom']},
 'doc 14': {'top_TfIdf': ['saisd', 'lecholop']},
 'doc 15': {'top_TfIdf': ['quick']},
 'doc 16': {'top_TfIdf': ['minexpo']},
 'doc 17': {'top_TfIdf': ['arcadia']},
 'doc 18': {'top_TfIdf': []},
 'doc 19': {'top_TfIdf': ['physics',
   'makingsenseofcovidcom',
   'kingsway',
  