In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import spacy
from spacy.matcher import PhraseMatcher

In [3]:
def get_text_from_csv(fn, timeout=30):
    """Fetch the creative text of every text ad listed in a CSV export.

    Parameters
    ----------
    fn : str or path-like
        CSV file readable by ``pd.read_csv``. It must contain the columns
        ``ad_type``, ``ad_id`` and ``ad_url``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` for every report request, so a stalled
        connection raises instead of blocking forever.

    Returns
    -------
    pandas.DataFrame
        One row per ad whose ``ad_type`` equals ``'Text'``, with the columns
        ``ad_id``, ``ad_url``, ``report_url`` and ``text``. All four columns
        are present even when the CSV contains no text ads.
    """
    report_url_template = (
        'https://transparencyreport.google.com/transparencyreport/api/v3/'
        'politicalads/creatives/details?entity_id={}&creative_id={}&hl=en'
    )

    ads = pd.read_csv(fn)
    # Only text ads are kept; filter once and reuse the result.
    text_ads = ads.loc[ads['ad_type'] == 'Text', ['ad_id', 'ad_url']].reset_index(drop=True)

    report_urls = []
    ad_texts = []
    for ad_url in text_ads['ad_url']:
        # The entity id is the third-from-last path segment of the ad URL and
        # the creative id is the last one.
        segments = ad_url.split('/')
        report_url = report_url_template.format(segments[-3], segments[-1])
        response = requests.get(report_url, timeout=timeout)
        report_urls.append(report_url)
        # Keep what lies between the last '[' that precedes the first '"]'
        # and that '"]' itself (the extracted text keeps its leading quote).
        ad_texts.append(response.text.split('"]')[0].split('[')[-1])

    # Attaching the lists as columns keeps the schema stable for empty input.
    return text_ads.assign(report_url=report_urls, text=ad_texts)

In [5]:
# Build the working frame from the ads listed in GoogleAds/CT.csv.
# get_text_from_csv issues one HTTP request per text ad, so this cell needs network access.
df_text = get_text_from_csv('GoogleAds/CT.csv')
# Summarise columns, dtypes and non-null counts of the scraped frame.
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       54 non-null     object
 1   ad_url      54 non-null     object
 2   report_url  54 non-null     object
 3   text        54 non-null     object
dtypes: object(4)
memory usage: 1.8+ KB


In [6]:
# Preview the first rows of the scraped ads.
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR442831434822975488,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Our Democrat opponent outraised us last quart..."
1,CR217284766326587392,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Our Democrat opponent outraised us last quart..."
2,CR144710676701511680,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""It's time to focus on the hard-working famili..."
3,CR448694580578091008,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Our Democrat opponent outraised us last quart..."
4,CR165583530565304320,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Georgia will decide the Senate Majority in 20..."


In [7]:
# Drop ads whose text is missing and report how many rows were affected.
missing_text_count = df_text['text'].isnull().sum()
if missing_text_count:
    print (missing_text_count)
    df_text = df_text[df_text['text'].notnull()].reset_index(drop=True)

In [8]:
def filter_text(x):
    """Normalise a piece of ad text.

    URLs (``http://`` or ``https://`` up to the next whitespace) are removed,
    every character that is neither a word character nor whitespace is replaced
    by a space, runs of whitespace collapse to one space and the result is
    lower-cased.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    # Raw strings keep the regex escapes (\S, \w, \s) from being parsed as
    # invalid Python string escapes.
    x = re.sub(r'http[s]?://\S+', '', x)
    x = re.sub(r'[^\w\s]', ' ', x)  # symbols become separators
    # split() treats any run of whitespace as one separator, so no separate
    # whitespace-collapsing pass is needed before re-joining.
    return ' '.join(word.lower() for word in x.split())

In [9]:
# Clean every ad text; values are cast to str first so filter_text always receives a string.
df_text['text'] = df_text['text'].astype(str).apply(filter_text)
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR442831434822975488,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,our democrat opponent outraised us last quarte...
1,CR217284766326587392,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,our democrat opponent outraised us last quarte...
2,CR144710676701511680,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,it s time to focus on the hard working familie...
3,CR448694580578091008,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,our democrat opponent outraised us last quarte...
4,CR165583530565304320,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,georgia will decide the senate majority in 202...


In [10]:
# Keep only the first ad for each distinct cleaned text and renumber the rows.
df_text = (
    df_text
    .drop_duplicates(subset='text', keep='first')
    .reset_index(drop=True)
)
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR442831434822975488,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,our democrat opponent outraised us last quarte...
1,CR217284766326587392,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,our democrat opponent outraised us last quarte...
2,CR144710676701511680,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,it s time to focus on the hard working familie...
3,CR165583530565304320,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,georgia will decide the senate majority in 202...
4,CR402139334112706560,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,help herschel walker reach his critical end of...


In [11]:
# Summarise the frame again to see how many rows remain.
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       27 non-null     object
 1   ad_url      27 non-null     object
 2   report_url  27 non-null     object
 3   text        27 non-null     object
dtypes: object(4)
memory usage: 992.0+ bytes


In [12]:
# Save the current frame to ct_ad.csv. With to_csv's defaults the row index is
# written too, as an unnamed first column.
df_text.to_csv('ct_ad.csv')

In [13]:
# Load the topic lexicon. The encoding is stated explicitly so the JSON file is
# decoded as UTF-8 regardless of the platform's default locale encoding.
with open('lexicon.json', encoding='utf-8') as f1:
    dic1 = json.load(f1)
# Show the lexicon's top-level keys (the topic names).
dic1.keys()

dict_keys(['covid', 'economic', 'education', 'environment', 'foreign policy', 'governance', 'health', 'immigration', 'military', 'safety', 'social and cultural', 'social programs', 'donation'])

In [14]:
# Load the large English spaCy model (en_core_web_lg).
# Try the installed package name first so the notebook is not tied to one machine.
# If spaCy cannot find the model by name (it raises OSError), fall back to the
# absolute path that one contributor (Ying) needed in her Jupyter install.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    nlp = spacy.load("/usr/local/Cellar/jupyterlab/3.2.9/libexec/lib/python3.9/site-packages/en_core_web_lg/en_core_web_lg-3.2.0/")

In [15]:
def find_words(x,lexicon):
    """Return the lexicon terms that occur in a text.

    Parameters
    ----------
    x : str
        Text to search; it is processed with the module-level spaCy pipeline ``nlp``.
    lexicon : dict
        Maps each topic name to an iterable of term strings.

    Returns
    -------
    str
        The distinct matched spans joined by ``','`` in sorted order, or the
        placeholder ``'no words'`` when nothing matches. Sorting makes the
        result reproducible; joining a bare set gave an arbitrary order.
    """
    doc = nlp(x)
    # Topics play no role here, so one matcher holding every topic's terms
    # replaces building and running a separate matcher per topic.
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(term) for terms in lexicon.values() for term in terms]
    if patterns:
        matcher.add("TerminologyList", patterns)
    # A set removes repeated occurrences and terms shared by several topics.
    found = {doc[start:end].text for _match_id, start, end in matcher(doc)}
    if not found:
        return 'no words'
    return ','.join(sorted(found))

In [16]:
def find_topic(x,lexicon):
    """Map a comma-separated list of matched words to topic labels.

    Parameters
    ----------
    x : str
        Words joined by ``','``, or the placeholder ``'no words'``.
    lexicon : dict
        Maps each topic name to an iterable of term strings.

    Returns
    -------
    str
        The alphabetically sorted topics that share at least one term with
        ``x``, joined by ``','``. An empty string is returned when ``x`` is
        ``'no words'`` or no topic matches.
    """
    if x == 'no words':
        return ''
    found = set(x.split(','))
    matched_topics = [topic for topic, terms in lexicon.items() if found & set(terms)]
    matched_topics.sort()
    return ','.join(matched_topics)
            

In [17]:
# Tag every ad with the lexicon terms found in its text, then with the topics
# those terms belong to; ads without any topic are labelled 'no topic'.
df_text['words'] = df_text['text'].astype(str).apply(find_words, args=(dic1,))
df_text['m_label'] = (
    df_text['words']
    .apply(find_topic, args=(dic1,))
    .replace('', 'no topic')
)
df_text

Unnamed: 0,ad_id,ad_url,report_url,text,words,m_label
0,CR442831434822975488,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,our democrat opponent outraised us last quarte...,no words,no topic
1,CR217284766326587392,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,our democrat opponent outraised us last quarte...,no words,no topic
2,CR144710676701511680,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,it s time to focus on the hard working familie...,donate,donation
3,CR165583530565304320,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,georgia will decide the senate majority in 202...,no words,no topic
4,CR402139334112706560,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,help herschel walker reach his critical end of...,donate,donation
5,CR75909561227149312,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,we must prioritize lowering barriers between m...,savings,economic
6,CR441597782776610816,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,gov lamont is touting a proposal that would th...,no words,no topic
7,CR26022519651696640,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,the january 6 insurrection kicked off a concer...,no words,no topic
8,CR512656470500704256,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,leora is running for senate to retire dick blu...,no words,no topic
9,CR523746144778452992,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,the january 6 insurrection kicked off a concer...,no words,no topic


In [21]:
def count_tag(df_text):
    """Count how many rows carry each individual topic label.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Frame with an ``m_label`` column of comma-separated topic labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``single_topic`` and ``counts``, sorted by ``counts`` in
        descending order. A row whose label lists several topics contributes
        to each of them.
    """
    label_counts = (
        df_text['m_label']
        .value_counts()
        .rename_axis('topics')
        .reset_index(name='counts')
    )
    per_topic = (
        label_counts
        .assign(single_topic=label_counts['topics'].str.split(','))
        .explode('single_topic')
        .reset_index(drop=True)
    )
    # Sum only `counts`. A bare groupby().sum() also aggregates the string
    # `topics` column, which newer pandas concatenates into an unwanted column.
    return (
        per_topic
        .groupby('single_topic', as_index=False)['counts']
        .sum()
        .sort_values(by='counts', ascending=False)
    )

In [22]:
# Topic frequencies across the tagged ads.
count_tag(df_text)

Unnamed: 0,single_topic,counts
5,no topic,17
1,donation,3
2,economic,3
3,governance,2
0,covid,1
4,immigration,1
6,social programs,1


In [26]:
def count_word(df_text):
    """Count how many rows contain each individual matched word.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Frame with a ``words`` column of comma-separated words.

    Returns
    -------
    pandas.DataFrame
        Columns ``single_word`` and ``counts``, sorted by ``counts`` in
        descending order with a fresh 0..n-1 index. Every distinct entry is
        counted, including placeholder values stored in the column.
    """
    combo_counts = (
        df_text['words']
        .value_counts()
        .rename_axis('words')
        .reset_index(name='counts')
    )
    per_word = (
        combo_counts
        .assign(single_word=combo_counts['words'].str.split(','))
        .explode('single_word')
        .reset_index(drop=True)
    )
    # Sum only `counts`. A bare groupby().sum() also aggregates the string
    # `words` column, which newer pandas concatenates into an unwanted column.
    return (
        per_word
        .groupby('single_word', as_index=False)['counts']
        .sum()
        .sort_values(by='counts', ascending=False)
        .reset_index(drop=True)
    )

In [27]:
# Word frequencies across the tagged ads.
count_word(df_text)

Unnamed: 0,single_word,counts
0,no words,17
1,donate,3
2,filibuster,2
3,taxes,2
4,border,1
5,covid,1
6,jobs,1
7,savings,1
8,vaccines,1
