In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import spacy
from spacy.matcher import PhraseMatcher

In [2]:
def get_text_from_csv(fn): # get ad's text from csv file, return a dataframe with text, ad_id,ad_url, and report_url
    df = pd.read_csv(fn)
    df = df[df['ad_type'] == 'Text'].reset_index(drop = True) # only get text ads
    df_url = df[['ad_id','ad_url']].reset_index(drop = True) # get the urls
    urls = df_url['ad_url'].to_list() # create a list so we could get the report_urls
    report_urls = []
    for url in urls:
        entity_id = url.split('/')[-3]
        creative_id = url.split('/')[-1]
        report_url = 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={}&creative_id={}&hl=en'.format(entity_id,creative_id)
        report_urls.append(report_url)
    df_report = pd.DataFrame(report_urls).rename(columns = {0:'report_url'})
    ad_text = [] # get texts from reports
    for report_url in report_urls:
        response = requests.get(report_url)
        text = response.text.split('"]')[0].split('[')[-1]
        ad_text.append(text)  
    df_text = pd.DataFrame(ad_text).rename(columns = {0:'text'})
    df_new = df[df['ad_type']=='Text'][['ad_id','ad_url']].reset_index(drop = True) 
    df = pd.concat([df_new,df_report,df_text],axis=1)
    return df 

In [3]:
df_text = get_text_from_csv('GoogleAds/NY.csv')
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       117 non-null    object
 1   ad_url      117 non-null    object
 2   report_url  117 non-null    object
 3   text        117 non-null    object
dtypes: object(4)
memory usage: 3.8+ KB


In [4]:
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR145945153381597184,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""Representative for New York's First Congressi..."
1,CR66373359440494592,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""New Yorkers deserve a governor who works as h..."
2,CR103553482572693504,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""\""Help Khaled \"" to fight the three enemies o..."
3,CR474309078336667648,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""\""Help Khaled \"" to fight the three enemies o..."
4,CR277557794738012160,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,"""“Simply Put, your vote count to get Free educ..."


In [5]:
# check the dataset 
if df_text['text'].isnull().sum():
    print (df_text['text'].isnull().sum())
    df_text = df_text[df_text['text'].notnull()]
    df_text.reset_index(drop=True,inplace=True)

In [6]:
# function that filter the urls and symbols in the text 
def filter_text(x):
    url = 'http[s]?://\S+'
    x = re.sub(url,'',x)
    x = re.sub("[^\w\s]",' ',x) # filter symbols
    x = re.sub("\s+",' ',x)
    
    ls=[w.lower() for w in x.split()] 
    
    return ' '.join(ls)

In [7]:
df_text['text'] = df_text['text'].astype(str).apply(lambda x: filter_text(x))
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR145945153381597184,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,representative for new york s first congressio...
1,CR66373359440494592,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,new yorkers deserve a governor who works as ha...
2,CR103553482572693504,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,help khaled to fight the three enemies of huma...
3,CR474309078336667648,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,help khaled to fight the three enemies of huma...
4,CR277557794738012160,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,simply put your vote count to get free educati...


In [8]:
# remove duplicated rows
df_text.drop_duplicates(subset = 'text',keep = 'first',inplace = True)
df_text.reset_index(inplace = True,drop = True)
df_text.head()

Unnamed: 0,ad_id,ad_url,report_url,text
0,CR145945153381597184,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,representative for new york s first congressio...
1,CR66373359440494592,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,new yorkers deserve a governor who works as ha...
2,CR103553482572693504,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,help khaled to fight the three enemies of huma...
3,CR277557794738012160,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,simply put your vote count to get free educati...
4,CR268569836936757248,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,robert needs your help to stop corporate pacs ...


In [9]:
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ad_id       63 non-null     object
 1   ad_url      63 non-null     object
 2   report_url  63 non-null     object
 3   text        63 non-null     object
dtypes: object(4)
memory usage: 2.1+ KB


In [10]:
df_text.to_csv('ny_ad.csv')

In [11]:
# import the Lexicon
with open ('lexicon.json') as f1:
    dic1 = json.load(f1)
# check the keys
dic1.keys()

dict_keys(['covid', 'economic', 'education', 'environment', 'foreign policy', 'governance', 'health', 'immigration', 'military', 'safety', 'social and cultural', 'social programs', 'donation'])

In [12]:
# import en_core_web
#This only happens to Ying that she couldn't import 'en_core_web_sm' in Jupyter Notebook, so she imports it with the full path
nlp = spacy.load("/usr/local/Cellar/jupyterlab/3.2.9/libexec/lib/python3.9/site-packages/en_core_web_lg/en_core_web_lg-3.2.0/")

In [13]:
# the function that find the lexicon words in the text
def find_words(x,lexicon):
    topics= lexicon.keys()  
    doc = nlp(x) # nlp() is spaCy 2.2 English language model 
    words= []
    for t in topics:
        matcher = PhraseMatcher(nlp.vocab)
        terms= lexicon[t]
        patterns = [nlp.make_doc(text) for text in terms]
        matcher.add("TerminologyList", patterns) # spaCy2.2 phrase matcher
        matches = matcher(doc)
        for match_id, start,end in matches:
            span = doc[start:end]
            words.append(span.text)
    if words:
        words = list(set(words))
        return ','.join(words)
    else:
        return('no words')

In [14]:
# tagging the topic in each message
def find_topic(x,lexicon):
    topics= lexicon.keys()    
    if x=='no words':
        return ''    
    if x != 'no words': 
        words = x.split(',')
        labels = []        
        for t in topics:            
            terms = lexicon[t]
            if set(words)&set(terms):
                labels.append(t)                
                #l = sorted(labels)        
        return  ','.join(sorted(labels))
                
        #return ','.join(labels)
            

In [15]:
df_text['words'] = df_text['text'].astype(str).apply(lambda x: find_words(x,dic1))
df_text['m_label'] = df_text['words'].apply(lambda x: find_topic(x,dic1))
df_text['m_label'] = df_text['m_label'].apply(lambda x: 'no topic' if x=='' else x)
df_text

Unnamed: 0,ad_id,ad_url,report_url,text,words,m_label
0,CR145945153381597184,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,representative for new york s first congressio...,no words,no topic
1,CR66373359440494592,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,new yorkers deserve a governor who works as ha...,no words,no topic
2,CR103553482572693504,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,help khaled to fight the three enemies of huma...,"enemies,free education,american students,educa...","education,foreign policy,social programs"
3,CR277557794738012160,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,simply put your vote count to get free educati...,"enemies,free education,american students,educa...","education,foreign policy,social programs"
4,CR268569836936757248,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,robert needs your help to stop corporate pacs ...,chip in,donation
...,...,...,...,...,...,...
58,CR401070333932601344,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,new york s 3rd congressional district,no words,no topic
59,CR55439815813890048,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,gov kathy hochul is working hard and cutting t...,"taxes,middle class",economic
60,CR90712286271897600,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,governor kathy hochul has brought a fresh and ...,no words,no topic
61,CR379951464342093824,https://transparencyreport.google.com/politica...,https://transparencyreport.google.com/transpar...,we need a new generation of leadership in wash...,guns,safety


In [16]:
def count_tag(df_text):
    df_tag = df_text['m_label'].value_counts().rename_axis('topics').reset_index(name='counts')
    df_tag = df_tag.assign(single_topic=df_tag['topics'].str.split(',')).explode('single_topic').reset_index(drop = True)
    df_tag = df_tag.groupby('single_topic').sum().reset_index().sort_values(by = 'counts', ascending = False)
    return df_tag

In [17]:
count_tag(df_text)

Unnamed: 0,single_topic,counts
7,no topic,29
0,donation,15
1,economic,7
8,safety,6
2,education,2
3,environment,2
4,foreign policy,2
5,governance,2
9,social programs,2
6,health,1


In [18]:
def count_word(df_text):
    df_words = df_text['words'].value_counts().rename_axis('words').reset_index(name='counts')
    df_words = df_words.assign(single_word=df_words['words'].str.split(',')).explode('single_word').reset_index(drop = True)
    df_words = df_words.groupby('single_word').sum().reset_index().sort_values(by = 'counts', ascending = False)
    df_words.reset_index(drop = True, inplace = True)
    return df_words

In [19]:
count_word(df_text)

Unnamed: 0,single_word,counts
0,no words,29
1,chip in,7
2,donate,7
3,middle class,6
4,taxes,4
5,bail,4
6,gun violence,4
7,gun,4
8,free education,2
9,filibuster,2
