In [15]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import spacy
from spacy.matcher import PhraseMatcher

In [16]:
def get_text_from_csv(fn): # get ad's text from csv file, return a dataframe with text, ad_id,ad_url, and report_url
    df = pd.read_csv(fn)
    df = df[df['ad_type'] == 'Text'].reset_index(drop = True) # only get text ads
    df_url = df[['ad_id','ad_url']].reset_index(drop = True) # get the urls
    urls = df_url['ad_url'].to_list() # create a list so we could get the report_urls
    report_urls = []
    for url in urls:
        entity_id = url.split('/')[-3]
        creative_id = url.split('/')[-1]
        report_url = 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={}&creative_id={}&hl=en'.format(entity_id,creative_id)
        report_urls.append(report_url)
    df_report = pd.DataFrame(report_urls).rename(columns = {0:'report_url'})
    ad_text = [] # get texts from reports
    for report_url in report_urls:
        response = requests.get(report_url)
        text = response.text.split('"]')[0].split('[')[-1]
        ad_text.append(text)  
    df_text = pd.DataFrame(ad_text).rename(columns = {0:'text'})
    df_new = df[df['ad_type']=='Text'][['ad_id','ad_url']].reset_index(drop = True) 
    df = pd.concat([df_new,df_report,df_text],axis=1)
    return df  

In [18]:
# read google ads
google_df = get_text_from_csv('GoogleAds/CA.csv') 
google_df.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR221162606398799872,https://transparencyreport.google.com/political-ads/library/advertiser/AR206807382586556416/creative/CR221162606398799872,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR206807382586556416&creative_id=CR221162606398799872&hl=en,"""Ad Paid for by Dave Myers for San Diego County Sheriff 2022 | FPPC ID # 1436219"",""Don't Settle for the Status Quo! Elect Dave Myers for Sheriff come June 7th, 2022!"
1,CR436752647349862400,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR436752647349862400,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR436752647349862400&hl=en,"""A Broad Coalition of CA Indian Tribes Opposes This Online Gambling Prop. Learn More."",""Stop The Deceptive Corp Prop: Would Turn Every Screen Into A Gambling Device. Learn More."
2,CR149929646081703936,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR149929646081703936,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR149929646081703936&hl=en,"""Stop The Deceptive Corp Prop: Would Turn Every Screen Into A Gambling Device. Learn More."",""Take Action: Corp Prop Would Legalize An Expansion of Online Sports Gambling in CA."
3,CR548953410917892096,https://transparencyreport.google.com/political-ads/library/advertiser/AR199061254449397760/creative/CR548953410917892096,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR199061254449397760&creative_id=CR548953410917892096&hl=en,"""Stand with Captain Sam Brown. Let's restore accountability and leadership to Washington."",""Captain Sam Brown fought for our Country. Now, he is ready to serve the people of Nevada."
4,CR370192611450814464,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR370192611450814464,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR370192611450814464&hl=en,"""A Broad Coalition of CA Indian Tribes Opposes This Online Gambling Prop. Learn More."",""Take Action: Corp Prop Would Legalize An Expansion of Online Sports Gambling in CA."


In [19]:
# read facebook ads
facebook_df = pd.read_csv('FacebookAds/california.csv')

In [20]:
# only keep text column
facebook_df.drop(columns = ['Unnamed: 0','byline'],inplace = True)
pd.set_option('display.max_colwidth', None)
facebook_df

Unnamed: 0,text
0,It is awesome to have the support of the California Nurses Association! What they have been through in this pandemic is unreal… If hospitals are going to continue not paying property taxes they must treat our nurses better!
1,"When the other candidate rides the coat tails of other establishment politicians, I ride in on my experience, dedication and support from the people…. Transparency and Credibility #ad5 #california"
2,"Seydi has worked with San Fernando Valley communities for 20 years. She understands the issues we are facing and she's prepared to advocate for us in Sacramento. \n\nVote for Seydi because she pledges toward fixing:\n\n📌 Homelessness\n📌 Housing\n📌 Education\n📌 Small Business\n📌 Access to healthcare\n\nMost importantly, she pledges to listen to her constituents. Vote for Seydi Alejandra Morales for State Senate District 20 starting May 9th!"
3,Learn more about our efforts to find solutions that will protect our planet while building a more equitable future for CA families.
4,Special interests like 𝗥𝗲𝗰𝗼𝗹𝗼𝗴𝘆 are exploiting environmental problems for their own profits. \n\nThe San Francisco-based garbage and recycling giant was fined $36 million in a public corruption scandal after it admitted to bribery allegations related to overcharging customers $100 million. Don’t let special interests make up for lost profits by raising taxes on California taxpayers.
...,...
10043,Help Protect Atkins Place!
10044,Help Protect Atkins Place!
10045,Help Protect Atkins Place!
10046,Help Protect Atkins Place!


In [22]:
# cncat google_df and facebook_df by 'text'
df_text = pd.concat([google_df,facebook_df])
df_text

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR221162606398799872,https://transparencyreport.google.com/political-ads/library/advertiser/AR206807382586556416/creative/CR221162606398799872,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR206807382586556416&creative_id=CR221162606398799872&hl=en,"""Ad Paid for by Dave Myers for San Diego County Sheriff 2022 | FPPC ID # 1436219"",""Don't Settle for the Status Quo! Elect Dave Myers for Sheriff come June 7th, 2022!"
1,CR436752647349862400,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR436752647349862400,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR436752647349862400&hl=en,"""A Broad Coalition of CA Indian Tribes Opposes This Online Gambling Prop. Learn More."",""Stop The Deceptive Corp Prop: Would Turn Every Screen Into A Gambling Device. Learn More."
2,CR149929646081703936,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR149929646081703936,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR149929646081703936&hl=en,"""Stop The Deceptive Corp Prop: Would Turn Every Screen Into A Gambling Device. Learn More."",""Take Action: Corp Prop Would Legalize An Expansion of Online Sports Gambling in CA."
3,CR548953410917892096,https://transparencyreport.google.com/political-ads/library/advertiser/AR199061254449397760/creative/CR548953410917892096,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR199061254449397760&creative_id=CR548953410917892096&hl=en,"""Stand with Captain Sam Brown. Let's restore accountability and leadership to Washington."",""Captain Sam Brown fought for our Country. Now, he is ready to serve the people of Nevada."
4,CR370192611450814464,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR370192611450814464,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR370192611450814464&hl=en,"""A Broad Coalition of CA Indian Tribes Opposes This Online Gambling Prop. Learn More."",""Take Action: Corp Prop Would Legalize An Expansion of Online Sports Gambling in CA."
...,...,...,...,...
10043,,,,Help Protect Atkins Place!
10044,,,,Help Protect Atkins Place!
10045,,,,Help Protect Atkins Place!
10046,,,,Help Protect Atkins Place!


In [23]:
# check the dataset 
if df_text['text'].isnull().sum():
    print (df_text['text'].isnull().sum())
    df_text = df_text[df_text['text'].notnull()]
    df_text.reset_index(drop=True,inplace=True)

2


In [24]:
# function that filter the urls and symbols in the text 
def filter_text(x):
    url = 'http[s]?://\S+'
    x = re.sub(url,'',x)
    x = re.sub("[^\w\s]",' ',x) # filter symbols
    x = re.sub("\s+",' ',x)
    
    ls=[w.lower() for w in x.split()] 
    
    return ' '.join(ls)

In [25]:
df_text['text'] = df_text['text'].astype(str).apply(lambda x: filter_text(x))
df_text.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text'] = df_text['text'].astype(str).apply(lambda x: filter_text(x))


Unnamed: 0,ad_id,ad_url,report_url,text
0,CR221162606398799872,https://transparencyreport.google.com/political-ads/library/advertiser/AR206807382586556416/creative/CR221162606398799872,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR206807382586556416&creative_id=CR221162606398799872&hl=en,ad paid for by dave myers for san diego county sheriff 2022 fppc id 1436219 don t settle for the status quo elect dave myers for sheriff come june 7th 2022
1,CR436752647349862400,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR436752647349862400,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR436752647349862400&hl=en,a broad coalition of ca indian tribes opposes this online gambling prop learn more stop the deceptive corp prop would turn every screen into a gambling device learn more
2,CR149929646081703936,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR149929646081703936,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR149929646081703936&hl=en,stop the deceptive corp prop would turn every screen into a gambling device learn more take action corp prop would legalize an expansion of online sports gambling in ca
3,CR548953410917892096,https://transparencyreport.google.com/political-ads/library/advertiser/AR199061254449397760/creative/CR548953410917892096,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR199061254449397760&creative_id=CR548953410917892096&hl=en,stand with captain sam brown let s restore accountability and leadership to washington captain sam brown fought for our country now he is ready to serve the people of nevada
4,CR370192611450814464,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR370192611450814464,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR370192611450814464&hl=en,a broad coalition of ca indian tribes opposes this online gambling prop learn more take action corp prop would legalize an expansion of online sports gambling in ca


In [26]:
# check if there are duplicated texts, if yes, print them out
df_text['text'].value_counts().head(20)

help protect atkins place                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [27]:
# remove duplicated rows
df_text.drop_duplicates(subset = 'text',keep = 'first',inplace = True)
df_text.reset_index(inplace = True,drop = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text.drop_duplicates(subset = 'text',keep = 'first',inplace = True)


In [28]:
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR221162606398799872,https://transparencyreport.google.com/political-ads/library/advertiser/AR206807382586556416/creative/CR221162606398799872,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR206807382586556416&creative_id=CR221162606398799872&hl=en,ad paid for by dave myers for san diego county sheriff 2022 fppc id 1436219 don t settle for the status quo elect dave myers for sheriff come june 7th 2022
1,CR436752647349862400,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR436752647349862400,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR436752647349862400&hl=en,a broad coalition of ca indian tribes opposes this online gambling prop learn more stop the deceptive corp prop would turn every screen into a gambling device learn more
2,CR149929646081703936,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR149929646081703936,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR149929646081703936&hl=en,stop the deceptive corp prop would turn every screen into a gambling device learn more take action corp prop would legalize an expansion of online sports gambling in ca
3,CR548953410917892096,https://transparencyreport.google.com/political-ads/library/advertiser/AR199061254449397760/creative/CR548953410917892096,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR199061254449397760&creative_id=CR548953410917892096&hl=en,stand with captain sam brown let s restore accountability and leadership to washington captain sam brown fought for our country now he is ready to serve the people of nevada
4,CR370192611450814464,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR370192611450814464,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR370192611450814464&hl=en,a broad coalition of ca indian tribes opposes this online gambling prop learn more take action corp prop would legalize an expansion of online sports gambling in ca


In [39]:
df_text.to_csv('ad_ca.csv')

In [29]:
# import the Lexicon
with open ('lexicon.json') as f1:
    dic1 = json.load(f1)
# check the keys
dic1.keys()

dict_keys(['covid', 'economic', 'education', 'environment', 'foreign policy', 'governance', 'health', 'immigration', 'military', 'safety', 'social and cultural', 'social programs', 'donation'])

In [30]:
# import en_core_web
#This only happens to Ying that she couldn't import 'en_core_web_sm' in Jupyter Notebook, so she imports it with the full path
nlp = spacy.load("/usr/local/Cellar/jupyterlab/3.2.9/libexec/lib/python3.9/site-packages/en_core_web_lg/en_core_web_lg-3.2.0/")

In [31]:
# the function that find the lexicon words in the text
def find_words(x,lexicon):
    topics= lexicon.keys()  
    doc = nlp(x) # nlp() is spaCy 2.2 English language model 
    words= []
    for t in topics:
        matcher = PhraseMatcher(nlp.vocab)
        terms= lexicon[t]
        patterns = [nlp.make_doc(text) for text in terms]
        matcher.add("TerminologyList", patterns) # spaCy2.2 phrase matcher
        matches = matcher(doc)
        for match_id, start,end in matches:
            span = doc[start:end]
            words.append(span.text)
    if words:
        words = list(set(words))
        return ','.join(words)
    else:
        return('no words')

In [32]:
# tagging the topic in each message
def find_topic(x,lexicon):
    topics= lexicon.keys()    
    if x=='no words':
        return ''    
    if x != 'no words': 
        words = x.split(',')
        labels = []        
        for t in topics:            
            terms = lexicon[t]
            if set(words)&set(terms):
                labels.append(t)                
                #l = sorted(labels)        
        return  ','.join(sorted(labels))
                
        #return ','.join(labels)
            

In [33]:
df_text['words'] = df_text['text'].astype(str).apply(lambda x: find_words(x,dic1))
df_text['m_label'] = df_text['words'].apply(lambda x: find_topic(x,dic1))
df_text['m_label'] = df_text['m_label'].apply(lambda x: 'no topic' if x=='' else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['words'] = df_text['text'].astype(str).apply(lambda x: find_words(x,dic1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['m_label'] = df_text['words'].apply(lambda x: find_topic(x,dic1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['m_label'] = df_text['m_label'].apply(la

In [113]:
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text,words,m_label
0,CR221162606398799872,https://transparencyreport.google.com/political-ads/library/advertiser/AR206807382586556416/creative/CR221162606398799872,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR206807382586556416&creative_id=CR221162606398799872&hl=en,ad paid for by dave myers for san diego county sheriff 2022 fppc id 1436219 don t settle for the status quo elect dave myers for sheriff come june 7th 2022,no words,no topic
1,CR436752647349862400,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR436752647349862400,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR436752647349862400&hl=en,a broad coalition of ca indian tribes opposes this online gambling prop learn more stop the deceptive corp prop would turn every screen into a gambling device learn more,no words,no topic
2,CR149929646081703936,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR149929646081703936,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR149929646081703936&hl=en,take action corp prop would legalize an expansion of online sports gambling in ca a broad coalition of ca indian tribes opposes the corp online gambling prop learn more,no words,no topic
3,CR548953410917892096,https://transparencyreport.google.com/political-ads/library/advertiser/AR199061254449397760/creative/CR548953410917892096,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR199061254449397760&creative_id=CR548953410917892096&hl=en,stand with captain sam brown let s restore accountability and leadership to washington captain sam brown fought for our country now he is ready to serve the people of nevada,no words,no topic
4,CR370192611450814464,https://transparencyreport.google.com/political-ads/library/advertiser/AR440853413404606464/creative/CR370192611450814464,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR440853413404606464&creative_id=CR370192611450814464&hl=en,a broad coalition of ca indian tribes opposes this online gambling prop learn more take action corp prop would legalize an expansion of online sports gambling in ca,no words,no topic
...,...,...,...,...,...,...
64,CR178896692232323072,https://transparencyreport.google.com/political-ads/library/advertiser/AR247685438200872960/creative/CR178896692232323072,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR247685438200872960&creative_id=CR178896692232323072&hl=en,thanks to governor newsom california is leading in vaccines and its economy is strong chip in today to re elect gavin newsom and keep california blue,"economy,vaccines,chip in","donation,economic,social programs"
65,CR26022519651696640,https://transparencyreport.google.com/political-ads/library/advertiser/AR122301255337902080/creative/CR26022519651696640,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR122301255337902080&creative_id=CR26022519651696640&hl=en,the january 6 insurrection kicked off a concerted wave of anti voting laws nationwide there s no greater priority than defending democracy you can help,no words,no topic
66,CR304499128153407488,https://transparencyreport.google.com/political-ads/library/advertiser/AR62007233126334464/creative/CR304499128153407488,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR62007233126334464&creative_id=CR304499128153407488&hl=en,save california chip in here take back the majority take back america,chip in,donation
67,CR14392847725756416,https://transparencyreport.google.com/political-ads/library/advertiser/AR512670214396051456/creative/CR14392847725756416,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR512670214396051456&creative_id=CR14392847725756416&hl=en,he has made economic and environmental justice a priority for all communities david has been representing district 8 in the san diego city council since 2010,economic,economic


In [35]:
def count_tag(df_text):
    df_tag = df_text['m_label'].value_counts().rename_axis('topics').reset_index(name='counts')
    df_tag = df_tag.assign(single_topic=df_tag['topics'].str.split(',')).explode('single_topic').reset_index(drop = True)
    df_tag = df_tag.groupby('single_topic').sum().reset_index().sort_values(by = 'counts', ascending = False)
    return df_tag

In [36]:
count_tag(df_text)

Unnamed: 0,single_topic,counts
10,no topic,138
2,economic,58
4,environment,36
1,donation,31
7,health,26
11,safety,19
13,social programs,18
12,social and cultural,17
3,education,15
0,covid,12


In [37]:
def count_word(df_text):
    df_words = df_text['words'].value_counts().rename_axis('words').reset_index(name='counts')
    df_words = df_words.assign(single_word=df_words['words'].str.split(',')).explode('single_word').reset_index(drop = True)
    df_words = df_words.groupby('single_word').sum().reset_index().sort_values(by = 'counts', ascending = False)
    df_words.reset_index(drop = True, inplace = True)
    return df_words

In [38]:
count_word(df_text)

Unnamed: 0,single_word,counts
0,no words,138
1,health,20
2,donate,18
3,tax,13
4,chip in,12
...,...,...
166,greenhouse gas,1
167,greenhouse,1
168,genocide,1
169,gas emissions,1
