In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import spacy
from spacy.matcher import PhraseMatcher

In [2]:
# URL template of the report API queried for each ad; the two placeholders are
# filled with the entity id and the creative id taken from the ad's page URL.
_REPORT_URL_TEMPLATE = 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={}&creative_id={}&hl=en'


def _build_report_url(ad_url):
    """Derive the report-API URL from an ad's page URL."""
    parts = ad_url.split('/')
    # entity id = third-from-last path segment, creative id = last segment
    return _REPORT_URL_TEMPLATE.format(parts[-3], parts[-1])


def _fetch_ad_text(report_url, timeout):
    """Download one report and return its raw ad text, or None if the request fails."""
    try:
        response = requests.get(report_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # One unreachable report must not discard every ad fetched so far;
        # the caller sees a missing value instead.
        return None
    # Take the part of the body before the first '"]' and, within it,
    # everything after the last '['.
    return response.text.split('"]')[0].split('[')[-1]


def get_text_from_csv(fn, timeout=30):
    """Collect the text of every text ad listed in a CSV export.

    Parameters
    ----------
    fn : str or path-like
        CSV file with at least the columns ``ad_type``, ``ad_id`` and ``ad_url``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).
        Without a timeout a single stalled connection would block forever.

    Returns
    -------
    pandas.DataFrame
        One row per ad whose ``ad_type`` is ``'Text'``, with the columns
        ``ad_id``, ``ad_url``, ``report_url`` and ``text``. ``text`` is ``None``
        for ads whose report could not be downloaded.
    """
    ads = pd.read_csv(fn)
    # only keep text ads, and only the identifying columns
    ads = ads.loc[ads['ad_type'] == 'Text', ['ad_id', 'ad_url']].reset_index(drop=True)

    report_urls = [_build_report_url(ad_url) for ad_url in ads['ad_url']]
    ad_texts = [_fetch_ad_text(report_url, timeout) for report_url in report_urls]

    return ads.assign(report_url=report_urls, text=ad_texts)

In [3]:
# Build the text-ad table for the Texas export; this issues one HTTP request per text ad.
df_text = get_text_from_csv('GoogleAds/texas.csv')
# Summarise the columns, dtypes and non-null counts of the fetched table.
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       148 non-null    object
 1   ad_url      148 non-null    object
 2   report_url  148 non-null    object
 3   text        148 non-null    object
dtypes: object(4)
memory usage: 4.8+ KB


In [4]:
# Preview the first rows of the fetched ads.
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR101029003875319808,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Election Day: MARCH 1"",""Vote for a Brighter F..."
1,CR548953410917892096,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Stand with Captain Sam Brown. Let's restore a..."
2,CR286730195614826496,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Let's make voting simpler, more convenient, a..."
3,CR212564562908545024,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Texas's 29th Congressional District"
4,CR493844926061084672,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Let's make voting simpler, more convenient, a..."


In [5]:
# check the dataset: report how many ads have no text, then drop those rows
missing_count = df_text['text'].isnull().sum()
if missing_count:
    print (missing_count)
    df_text = df_text[df_text['text'].notnull()].reset_index(drop=True)

In [6]:
def filter_text(x):
    """Normalise an ad text: strip URLs and symbols, lowercase, collapse whitespace.

    Parameters
    ----------
    x : str
        Raw ad text.

    Returns
    -------
    str
        The lowercased words of ``x`` joined by single spaces. URLs are removed
        and every character that is neither a word character nor whitespace is
        replaced by a space (so ``"let's"`` becomes ``"let s"``).
    """
    # Raw strings keep the regex escapes (\S, \w, \s) from being read as
    # invalid string escapes, which newer Python versions warn about.
    without_urls = re.sub(r'http[s]?://\S+', '', x)
    without_symbols = re.sub(r'[^\w\s]', ' ', without_urls)
    # split() already collapses any run of whitespace, so no extra substitution is needed
    return ' '.join(without_symbols.lower().split())

In [7]:
# normalise every ad text in place of the raw one
raw_text = df_text['text'].astype(str)
df_text['text'] = raw_text.apply(filter_text)
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR101029003875319808,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,election day march 1 vote for a brighter future
1,CR548953410917892096,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,stand with captain sam brown let s restore acc...
2,CR286730195614826496,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,let s make voting simpler more convenient and ...
3,CR212564562908545024,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,texas s 29th congressional district
4,CR493844926061084672,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,let s make voting simpler more convenient and ...


In [8]:
# remove duplicated rows, keeping the first ad seen for each distinct text
df_text = (
    df_text
    .drop_duplicates(subset='text', keep='first')
    .reset_index(drop=True)
)
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR101029003875319808,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,election day march 1 vote for a brighter future
1,CR548953410917892096,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,stand with captain sam brown let s restore acc...
2,CR286730195614826496,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,let s make voting simpler more convenient and ...
3,CR212564562908545024,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,texas s 29th congressional district
4,CR493844926061084672,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,let s make voting simpler more convenient and ...


In [9]:
# Re-check the table size now that duplicate ad texts have been dropped.
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       88 non-null     object
 1   ad_url      88 non-null     object
 2   report_url  88 non-null     object
 3   text        88 non-null     object
dtypes: object(4)
memory usage: 2.9+ KB


In [10]:
# Save the cleaned, de-duplicated ads.
# NOTE(review): index=False is not passed, so the row index is written as an
# extra unnamed first column — confirm that is intended.
df_text.to_csv('tx_ad.csv')

In [11]:
# import the lexicon (topic -> list of terms)
# An explicit UTF-8 encoding keeps the load independent of the platform's default encoding.
with open('lexicon.json', encoding='utf-8') as f1:
    dic1 = json.load(f1)
# check the keys
dic1.keys()

dict_keys(['covid', 'economic', 'education', 'environment', 'foreign policy', 'governance', 'health', 'immigration', 'military', 'safety', 'social and cultural', 'social programs', 'donation'])

In [12]:
# Load the large English spaCy pipeline (en_core_web_lg).
# Try the installed package name first so the notebook runs on any machine.
# On one contributor's (Ying's) Jupyter setup the model could not be loaded by
# name, so fall back to the full path of her local installation.
try:
    nlp = spacy.load("en_core_web_lg")
except (OSError, ImportError):
    nlp = spacy.load("/usr/local/Cellar/jupyterlab/3.2.9/libexec/lib/python3.9/site-packages/en_core_web_lg/en_core_web_lg-3.2.0/")

In [13]:
def find_words(x, lexicon):
    """Find the lexicon terms that occur in a text.

    Parameters
    ----------
    x : str
        Text to search (processed with the module-level spaCy pipeline ``nlp``).
    lexicon : dict
        Maps each topic to a list of terms (strings).

    Returns
    -------
    str
        The distinct matched terms joined by commas in alphabetical order, or
        ``'no words'`` when nothing matches.
    """
    doc = nlp(x)

    # Only the union of matched terms is returned, so a single matcher holding
    # the terms of every topic is enough; there is no need for one per topic.
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(term) for terms in lexicon.values() for term in terms]
    if patterns:
        # list-style add(key, patterns) is the spaCy v3 signature
        matcher.add("TerminologyList", patterns)

    found = {doc[start:end].text for _match_id, start, end in matcher(doc)}
    if not found:
        return 'no words'
    # Sort so the result is reproducible: set iteration order differs between runs.
    return ','.join(sorted(found))

In [14]:
def find_topic(x, lexicon):
    """Tag a message with the topics whose lexicon terms it contains.

    ``x`` is the comma-separated list of matched words (or ``'no words'``) and
    ``lexicon`` maps each topic to its list of terms. Returns the matching
    topics sorted alphabetically and joined by commas; the result is an empty
    string when ``x`` is ``'no words'`` or no topic matches.
    """
    if x == 'no words':
        return ''

    matched_words = set(x.split(','))
    # a topic applies as soon as it shares at least one term with the matched words
    hits = [topic for topic, terms in lexicon.items() if matched_words & set(terms)]
    hits.sort()
    return ','.join(hits)
            

In [15]:
# lexicon words found in each ad text
df_text['words'] = df_text['text'].astype(str).apply(find_words, args=(dic1,))
# topics of those words; ads without any topic are labelled 'no topic'
topic_labels = df_text['words'].apply(find_topic, args=(dic1,))
df_text['m_label'] = topic_labels.where(topic_labels != '', 'no topic')
df_text

Unnamed: 0,ad_id,ad_url,report_url,text,words,m_label
0,CR101029003875319808,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,election day march 1 vote for a brighter future,no words,no topic
1,CR548953410917892096,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,stand with captain sam brown let s restore acc...,no words,no topic
2,CR286730195614826496,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,let s make voting simpler more convenient and ...,no words,no topic
3,CR212564562908545024,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,texas s 29th congressional district,no words,no topic
4,CR493844926061084672,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,let s make voting simpler more convenient and ...,no words,no topic
...,...,...,...,...,...,...
83,CR195958089038430208,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,standing up for our rights,no words,no topic
84,CR79646801249959936,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,mike will defend voting rights protect the rig...,"right to choose,voting rights","governance,social and cultural"
85,CR231000074371465216,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,the austin chronicle officially backs jay kleb...,no words,no topic
86,CR526759906150187008,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,harris county commissioner rodney ellis offici...,"climate,climate change",environment


In [16]:
def count_tag(df_text):
    """Count how many ads carry each topic label.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Must have an ``m_label`` column of comma-separated topic labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``single_topic`` and ``counts``, sorted by ``counts`` in
        descending order. An ad with several labels counts once for each.
    """
    # number of ads per distinct label combination
    combo_counts = (
        df_text['m_label']
        .value_counts()
        .rename_axis('topics')
        .reset_index(name='counts')
    )
    # one row per (combination, single topic) pair
    per_topic = (
        combo_counts
        .assign(single_topic=combo_counts['topics'].str.split(','))
        .explode('single_topic')
        .reset_index(drop=True)
    )
    # Sum only 'counts': summing the whole frame would also aggregate the string
    # 'topics' column on pandas >= 2.0 and add a column of concatenated labels.
    return (
        per_topic
        .groupby('single_topic')['counts']
        .sum()
        .reset_index()
        .sort_values(by='counts', ascending=False)
    )

In [17]:
# Tally the ads per topic label.
count_tag(df_text)

Unnamed: 0,single_topic,counts
9,no topic,50
7,immigration,14
2,economic,11
6,governance,10
10,safety,4
1,donation,3
4,environment,3
11,social and cultural,3
8,military,2
0,covid,1


In [18]:
def count_word(df_text):
    """Count how many ads contain each matched lexicon word.

    Parameters
    ----------
    df_text : pandas.DataFrame
        Must have a ``words`` column of comma-separated matched words.

    Returns
    -------
    pandas.DataFrame
        Columns ``single_word`` and ``counts``, sorted by ``counts`` in
        descending order and re-indexed from 0.
    """
    # number of ads per distinct word combination
    combo_counts = (
        df_text['words']
        .value_counts()
        .rename_axis('words')
        .reset_index(name='counts')
    )
    # one row per (combination, single word) pair
    per_word = (
        combo_counts
        .assign(single_word=combo_counts['words'].str.split(','))
        .explode('single_word')
        .reset_index(drop=True)
    )
    # Sum only 'counts': summing the whole frame would also aggregate the string
    # 'words' column on pandas >= 2.0 and add a column of concatenated words.
    return (
        per_word
        .groupby('single_word')['counts']
        .sum()
        .reset_index()
        .sort_values(by='counts', ascending=False)
        .reset_index(drop=True)
    )

In [19]:
# Tally the ads per matched lexicon word.
count_word(df_text)

Unnamed: 0,single_word,counts
0,no words,50
1,border,9
2,tax,5
3,taxes,4
4,immigration,3
5,right to choose,3
6,voting rights,3
7,climate change,3
8,climate,3
9,police,2
