In [5]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import spacy
from spacy.matcher import PhraseMatcher

# GOOGLE + FACEBOOK

In [6]:
def get_text_from_csv(fn): # get ad's text from csv file, return a dataframe with text, ad_id,ad_url, and report_url
    df = pd.read_csv(fn)
    df = df[df['ad_type'] == 'Text'].reset_index(drop = True) # only get text ads
    df_url = df[['ad_id','ad_url']].reset_index(drop = True) # get the urls
    urls = df_url['ad_url'].to_list() # create a list so we could get the report_urls
    report_urls = []
    for url in urls:
        entity_id = url.split('/')[-3]
        creative_id = url.split('/')[-1]
        report_url = 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={}&creative_id={}&hl=en'.format(entity_id,creative_id)
        report_urls.append(report_url)
    df_report = pd.DataFrame(report_urls).rename(columns = {0:'report_url'})
    ad_text = [] # get texts from reports
    for report_url in report_urls:
        response = requests.get(report_url)
        text = response.text.split('"]')[0].split('[')[-1]
        ad_text.append(text)  
    df_text = pd.DataFrame(ad_text).rename(columns = {0:'text'})
    df_new = df[df['ad_type']=='Text'][['ad_id','ad_url']].reset_index(drop = True) 
    df = pd.concat([df_new,df_report,df_text],axis=1)
    return df 

In [7]:
google_df = get_text_from_csv('GoogleAds/CT.csv')
google_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       54 non-null     object
 1   ad_url      54 non-null     object
 2   report_url  54 non-null     object
 3   text        54 non-null     object
dtypes: object(4)
memory usage: 1.8+ KB


In [8]:
# read facebook ads
facebook_df = pd.read_csv('FacebookAds/Connecticut.csv')
# only keep text column
facebook_df.drop(columns = ['Unnamed: 0','byline'],inplace = True)
pd.set_option('display.max_colwidth', None)
facebook_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1285 entries, 0 to 1284
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1280 non-null   object
dtypes: object(1)
memory usage: 10.2+ KB


In [9]:
# cncat google_df and facebook_df by 'text'
df_text = pd.concat([google_df,facebook_df])
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR442831434822975488,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR442831434822975488,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR442831434822975488&hl=en,"""Our Democrat opponent outraised us last quarter, so we need your help immediately."",""The Georgia polls are a dead heat, so please help us close the fundraising gap today!"
1,CR217284766326587392,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR217284766326587392,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR217284766326587392&hl=en,"""Our Democrat opponent outraised us last quarter, so we need your help immediately."",""Georgia will decide the Senate Majority in 2022, so help us close the fundraising gap!"
2,CR144710676701511680,https://transparencyreport.google.com/political-ads/library/advertiser/AR97392987641741312/creative/CR144710676701511680,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR97392987641741312&creative_id=CR144710676701511680&hl=en,"""It's time to focus on the hard-working families in Connecticut and end the D.C. power grab"",""George Logan is the Republican leadership Washington and Connecticut need. Donate today!"
3,CR448694580578091008,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR448694580578091008,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR448694580578091008&hl=en,"""Our Democrat opponent outraised us last quarter, so we need your help immediately."",""Georgia will decide the Senate Majority in 2022, so help us close the fundraising gap!"
4,CR165583530565304320,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR165583530565304320,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR165583530565304320&hl=en,"""Our Democrat opponent outraised us last quarter, so we need your help immediately."",""The Georgia polls are a dead heat, so please help us close the fundraising gap today!"


In [10]:
# check the dataset 
if df_text['text'].isnull().sum():
    print (df_text['text'].isnull().sum())
    df_text = df_text[df_text['text'].notnull()]
    df_text.reset_index(drop=True,inplace=True)

5


In [11]:
# function that filter the urls and symbols in the text, and remove duplicated ads
def clean_text(df_text):
    def filter_text(x):
        url = 'http[s]?://\S+'
        x = re.sub(url,'',x)
        x = re.sub("[^\w\s]",' ',x) # filter symbols
        x = re.sub("\s+",' ',x)
        ls=[w.lower() for w in x.split()] 
        return ' '.join(ls)
    df_text['text'] = df_text['text'].astype(str).apply(lambda x: filter_text(x))
    df_text['text'] = df_text['text'].replace('/u0026', ' ')
    df_text.drop_duplicates(subset = 'text',keep = 'first', inplace = True)
    df_text.reset_index(drop = True,inplace = True)
    return df_text
    return ' '.join(ls)

In [12]:
df_text = clean_text(df_text)
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       29 non-null     object
 1   ad_url      29 non-null     object
 2   report_url  29 non-null     object
 3   text        233 non-null    object
dtypes: object(4)
memory usage: 7.4+ KB


In [13]:
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR442831434822975488,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR442831434822975488,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR442831434822975488&hl=en,our democrat opponent outraised us last quarter so we need your help immediately the georgia polls are a dead heat so please help us close the fundraising gap today
1,CR217284766326587392,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR217284766326587392,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR217284766326587392&hl=en,our democrat opponent outraised us last quarter so we need your help immediately georgia will decide the senate majority in 2022 so help us close the fundraising gap
2,CR144710676701511680,https://transparencyreport.google.com/political-ads/library/advertiser/AR97392987641741312/creative/CR144710676701511680,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR97392987641741312&creative_id=CR144710676701511680&hl=en,it s time to focus on the hard working families in connecticut and end the d c power grab george logan is the republican leadership washington and connecticut need donate today
3,CR402139334112706560,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR402139334112706560,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR402139334112706560&hl=en,help herschel walker reach his critical end of quarter fundraising goal donate to the official website of herschel walker for united states senate today
4,CR22875717373001728,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR22875717373001728,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR22875717373001728&hl=en,georgia will decide the senate majority in 2022 so help us close the fundraising gap our democrat opponent outraised us last quarter so we need your help immediately


In [14]:
# import the Lexicon
with open ('lexicon.json') as f1:
    dic1 = json.load(f1)
# check the keys
dic1.keys()

dict_keys(['covid', 'economic', 'education', 'environment', 'foreign policy', 'governance', 'health', 'immigration', 'military', 'safety', 'social and cultural', 'social programs', 'donation'])

In [15]:
# import en_core_web
#This only happens to Ying that she couldn't import 'en_core_web_sm' in Jupyter Notebook, so she imports it with the full path
nlp = spacy.load("/usr/local/Cellar/jupyterlab/3.2.9/libexec/lib/python3.9/site-packages/en_core_web_lg/en_core_web_lg-3.2.0/")

In [16]:
# the function that find the lexicon words in the text
def find_words(x,lexicon):
    topics= lexicon.keys()  
    doc = nlp(x) # nlp() is spaCy 2.2 English language model 
    words= []
    for t in topics:
        matcher = PhraseMatcher(nlp.vocab)
        terms= lexicon[t]
        patterns = [nlp.make_doc(text) for text in terms]
        matcher.add("TerminologyList", patterns) # spaCy2.2 phrase matcher
        matches = matcher(doc)
        for match_id, start,end in matches:
            span = doc[start:end]
            words.append(span.text)
    if words:
        words = list(set(words))
        return ','.join(words)
    else:
        return('no words')

In [17]:
# tagging the topic in each message
def find_topic(x,lexicon):
    topics= lexicon.keys()    
    if x=='no words':
        return ''    
    if x != 'no words': 
        words = x.split(',')
        labels = []        
        for t in topics:            
            terms = lexicon[t]
            if set(words)&set(terms):
                labels.append(t)                
                #l = sorted(labels)        
        return  ','.join(sorted(labels))
                
        #return ','.join(labels)
            

In [18]:
def get_word_lable(df_text):
    df_text['words'] = df_text['text'].astype(str).apply(lambda x: find_words(x,dic1))
    df_text['m_label'] = df_text['words'].apply(lambda x: find_topic(x,dic1))
    df_text['m_label'] = df_text['m_label'].apply(lambda x: 'no topic' if x=='' else x)
    return df_text

In [19]:
get_word_lable(df_text)

Unnamed: 0,ad_id,ad_url,report_url,text,words,m_label
0,CR442831434822975488,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR442831434822975488,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR442831434822975488&hl=en,our democrat opponent outraised us last quarter so we need your help immediately the georgia polls are a dead heat so please help us close the fundraising gap today,no words,no topic
1,CR217284766326587392,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR217284766326587392,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR217284766326587392&hl=en,our democrat opponent outraised us last quarter so we need your help immediately georgia will decide the senate majority in 2022 so help us close the fundraising gap,no words,no topic
2,CR144710676701511680,https://transparencyreport.google.com/political-ads/library/advertiser/AR97392987641741312/creative/CR144710676701511680,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR97392987641741312&creative_id=CR144710676701511680&hl=en,it s time to focus on the hard working families in connecticut and end the d c power grab george logan is the republican leadership washington and connecticut need donate today,donate,donation
3,CR402139334112706560,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR402139334112706560,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR402139334112706560&hl=en,help herschel walker reach his critical end of quarter fundraising goal donate to the official website of herschel walker for united states senate today,donate,donation
4,CR22875717373001728,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR22875717373001728,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR22875717373001728&hl=en,georgia will decide the senate majority in 2022 so help us close the fundraising gap our democrat opponent outraised us last quarter so we need your help immediately,no words,no topic
...,...,...,...,...,...,...
228,,,,drugs flood through the border killing our families and neighbors sign to save our families,"drugs,border","immigration,safety"
229,,,,last tuesday i had the privilege of appearing on fox 94 9 lee elci the voice of freedom here s a video in case you missed it,no words,no topic
230,,,,this is it the biggest test of my career in public service tonight we wrap up our fundraising period democrats across the state will use this information when deciding who to support at the critical state convention will our fundraising finally convince the establishment that this campaign is for real you can help answer that question by contributing now hilda,no words,no topic
231,,,,ctteacher laura baker coronis got an unexpected surprise today when milken educator awards senior vice president jane foley called her name as this season s ct milken educator award recipient the ansonia high school community went wild to celebrate the math teacher ansonia public schools connecticut state department of education office of lt governor susan bysiewicz educatect ansonia ansoniact goodnews,"education,public schools,state department,school","education,foreign policy"


In [20]:
df_text.to_csv('ad_ny.csv')

In [21]:
def count_topic(df_text):
    df_tag = df_text['m_label'].value_counts().rename_axis('topics').reset_index(name='counts')
    df_tag = df_tag.assign(single_topic=df_tag['topics'].str.split(',')).explode('single_topic').reset_index(drop = True)
    df_tag = df_tag.groupby('single_topic').sum().reset_index().sort_values(by = 'counts', ascending = False)
    df_tag.reset_index(drop = True, inplace = True)
    return df_tag

In [22]:
text_topic = count_topic(df_text)
text_topic

Unnamed: 0,single_topic,counts
0,no topic,109
1,economic,40
2,education,30
3,health,25
4,social programs,17
5,governance,15
6,social and cultural,13
7,environment,11
8,safety,11
9,covid,8


In [23]:
def count_word(df_text):
    df_words = df_text['words'].value_counts().rename_axis('words').reset_index(name='counts')
    df_words = df_words.assign(single_word=df_words['words'].str.split(',')).explode('single_word').reset_index(drop = True)
    df_words = df_words.groupby('single_word').sum().reset_index().sort_values(by = 'counts', ascending = False)
    df_words.reset_index(drop = True, inplace = True)
    return df_words

In [24]:
text_word = count_word(df_text)
text_word

Unnamed: 0,single_word,counts
0,no words,109
1,education,17
2,health,15
3,tax,14
4,public schools,13
...,...,...
117,deductibles,1
118,natural gas,1
119,deficit,1
120,medicine,1


# GOOGLE

In [25]:
if google_df['text'].isnull().sum():
    print (google_df['text'].isnull().sum())
    google_df = google_df[google_df['text'].notnull()]
    google_df.reset_index(drop=True,inplace=True)

In [26]:
google_df = clean_text(google_df)
google_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       29 non-null     object
 1   ad_url      29 non-null     object
 2   report_url  29 non-null     object
 3   text        29 non-null     object
dtypes: object(4)
memory usage: 1.0+ KB


In [27]:
google_df = get_word_lable(google_df)
google_df.head()

Unnamed: 0,ad_id,ad_url,report_url,text,words,m_label
0,CR442831434822975488,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR442831434822975488,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR442831434822975488&hl=en,our democrat opponent outraised us last quarter so we need your help immediately the georgia polls are a dead heat so please help us close the fundraising gap today,no words,no topic
1,CR217284766326587392,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR217284766326587392,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR217284766326587392&hl=en,our democrat opponent outraised us last quarter so we need your help immediately georgia will decide the senate majority in 2022 so help us close the fundraising gap,no words,no topic
2,CR144710676701511680,https://transparencyreport.google.com/political-ads/library/advertiser/AR97392987641741312/creative/CR144710676701511680,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR97392987641741312&creative_id=CR144710676701511680&hl=en,it s time to focus on the hard working families in connecticut and end the d c power grab george logan is the republican leadership washington and connecticut need donate today,donate,donation
3,CR402139334112706560,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR402139334112706560,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR402139334112706560&hl=en,help herschel walker reach his critical end of quarter fundraising goal donate to the official website of herschel walker for united states senate today,donate,donation
4,CR22875717373001728,https://transparencyreport.google.com/political-ads/library/advertiser/AR409835984226615296/creative/CR22875717373001728,https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR409835984226615296&creative_id=CR22875717373001728&hl=en,georgia will decide the senate majority in 2022 so help us close the fundraising gap our democrat opponent outraised us last quarter so we need your help immediately,no words,no topic


In [28]:
google_topic = count_topic(google_df)
google_topic

Unnamed: 0,single_topic,counts
0,no topic,18
1,donation,3
2,governance,3
3,economic,2
4,social programs,2
5,covid,1
6,immigration,1


In [29]:
google_word = count_word(google_df)
google_word

Unnamed: 0,single_word,counts
0,no words,18
1,donate,3
2,filibuster,3
3,taxes,2
4,border,1
5,covid,1
6,jobs,1
7,medicine,1
8,out of pocket,1
9,vaccines,1


# FACEBOOK 

In [30]:
if facebook_df['text'].isnull().sum():
    print (facebook_df['text'].isnull().sum())
    facebook_df = facebook_df[facebook_df['text'].notnull()]
    facebook_df.reset_index(drop=True,inplace=True)

5


In [31]:
facebook_df = clean_text(facebook_df)
facebook_df.info

<bound method DataFrame.info of                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 text
0                                                                                                                                                                                                                                                                                                                                                                  

In [32]:
facebook_df = get_word_lable(facebook_df)
facebook_df.head()

Unnamed: 0,text,words,m_label
0,please join cea s red for ed day of action this wednesday come to hartford to meet with your legislators or if you can t make it take just a few minutes to call or email,no words,no topic
1,thinking about attending a connecticut state community college get all the information and support you need during our virtual showcases april 28 30th hear about community college and program options throughout the state transfer opportunities financial aid careers and more all from your couch live chat will be available register today by visiting the link in our bio,"community college,college",social programs
2,governor lamont is protecting abortion rights in connecticut the governor is set to sign a major piece of legislation protecting abortion ahead of an upcoming supreme court ruling sign up now to learn more about what the governor is doing for connecticut,"abortion,supreme court","governance,health,social and cultural"
3,hey connecticut have you heard governor lamont has signed a bill suspending the gas tax this means real relief for working families across the state add your name now to get more updates on what gov lamont is doing for connecticut,tax,economic
4,cost increases are hitting families in connecticut that s why governor lamont and the legislature have suspended the gas tax it ll cost you less to fill up at the pump that s real relief for connecticut families add your name and find out more about how the governor is working for you,tax,economic


In [33]:
facebook_topic = count_topic(facebook_df)
facebook_topic

Unnamed: 0,single_topic,counts
0,no topic,91
1,economic,38
2,education,30
3,health,25
4,social programs,15
5,social and cultural,13
6,governance,12
7,environment,11
8,safety,11
9,foreign policy,8


In [34]:
facebook_word = count_word(facebook_df)
facebook_word

Unnamed: 0,single_word,counts
0,no words,91
1,education,17
2,health,15
3,tax,14
4,public schools,13
...,...,...
113,commandments,1
114,energy independence,1
115,racist,1
116,reproductive rights,1
