In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import spacy
from spacy.matcher import PhraseMatcher

# Scrape all ad texts from urls

In [3]:
# Load the ads export. read_csv already hands back a DataFrame, so it is
# used directly.
df = pd.read_csv('Texas2021.csv')
df.head()

Unnamed: 0,ad_id,ad_url,ad_type,regions,advertiser_id,advertiser_name,ad_campaigns_list,date_range_start,date_range_end,num_of_days,...,spend_range_min_pln,spend_range_max_pln,spend_range_min_ron,spend_range_max_ron,spend_range_min_sek,spend_range_max_sek,spend_range_min_gbp,spend_range_max_gbp,spend_range_min_nzd,spend_range_max_nzd
0,CR134968385204125696,https://transparencyreport.google.com/politica...,Text,US,AR465982957815857152,LYDIA BEAN FOR CONGRESS,,2021-03-11,2021-05-01,49,...,100,1000.0,750,37500.0,750,7500.0,50,500.0,200,1500.0
1,CR349206645328314368,https://transparencyreport.google.com/politica...,Image,US,AR221691299693068288,SAHAK NALBANDYAN,,2022-01-12,2022-02-02,22,...,100,1000.0,75,750.0,750,7500.0,50,500.0,0,200.0
2,CR531379229376380928,https://transparencyreport.google.com/politica...,Text,US,AR279687823638921216,Glenn A. Hegar,,2022-01-05,2022-03-01,47,...,100,1000.0,75,750.0,750,7500.0,50,500.0,0,200.0
3,CR133689996778405888,https://transparencyreport.google.com/politica...,Image,US,AR140325824329940992,Chad Crow,,2022-02-17,2022-03-01,13,...,100,1000.0,75,750.0,750,7500.0,50,500.0,0,200.0
4,CR118816009636282368,https://transparencyreport.google.com/politica...,Text,US,AR393003423278039040,"CAITLYN JENNER FOR GOVERNOR 2021, INC.",,2021-05-03,2021-05-26,17,...,100,1000.0,750,37500.0,750,7500.0,50,500.0,200,1500.0


In [4]:
# Collect the ad-page urls of the rows whose ad_type is 'Text'.
is_text_ad = df['ad_type'] == 'Text'
urls = df.loc[is_text_ad, 'ad_url'].tolist()
urls[1:10]

['https://transparencyreport.google.com/political-ads/library/advertiser/AR279687823638921216/creative/CR531379229376380928',
 'https://transparencyreport.google.com/political-ads/library/advertiser/AR393003423278039040/creative/CR118816009636282368',
 'https://transparencyreport.google.com/political-ads/library/advertiser/AR393003423278039040/creative/CR134265247518162944',
 'https://transparencyreport.google.com/political-ads/library/advertiser/AR67037258305241088/creative/CR502092019183648768',
 'https://transparencyreport.google.com/political-ads/library/advertiser/AR465982957815857152/creative/CR12089164707135488',
 'https://transparencyreport.google.com/political-ads/library/advertiser/AR122301255337902080/creative/CR528988616219688960',
 'https://transparencyreport.google.com/political-ads/library/advertiser/AR389693240443404288/creative/CR496800138438639616',
 'https://transparencyreport.google.com/political-ads/library/advertiser/AR19054570869096448/creative/CR8196477791974195

In [5]:
# check the number of urls
len(urls)

654

In [6]:
# The ad pages get their text from a separate report endpoint.
# Each report url is built from two path segments of the ad url:
# the third-from-last (entity id) and the last (creative id).
report_urls = []
for url in urls:
    segments = url.split('/')
    report_urls.append(
        'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={}&creative_id={}&hl=en'.format(segments[-3], segments[-1])
    )
report_urls[1:10]

['https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR279687823638921216&creative_id=CR531379229376380928&hl=en',
 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR393003423278039040&creative_id=CR118816009636282368&hl=en',
 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR393003423278039040&creative_id=CR134265247518162944&hl=en',
 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR67037258305241088&creative_id=CR502092019183648768&hl=en',
 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR465982957815857152&creative_id=CR12089164707135488&hl=en',
 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id=AR122301255337902080&creative_id=CR528988616219688

In [7]:
# check that the number of report urls matches the number of ad urls
len(report_urls)

654

In [8]:
# Put the report urls in a one-column DataFrame; no column name is given here.
df_report = pd.DataFrame(report_urls)

In [9]:
# now, let's scrape the ads text from each report
def get_text(report_url, timeout=30):
    """Download one report and return the ad text embedded in its body.

    The text is cut out of the raw response: everything before the first
    '"]' is kept, and of that only the part after the last '['.

    Parameters
    ----------
    report_url : str
        Address of the report to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    str or None
        The extracted text, or None when the request fails or the server
        answers with an HTTP error status. Returning None keeps error pages
        from being stored as if they were ad text.
    """
    try:
        response = requests.get(report_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as err:
        # Report the failure but let the caller continue with the other urls.
        print('could not fetch {}: {}'.format(report_url, err))
        return None
    return response.text.split('"]')[0].split('[')[-1]

In [10]:
# Fetch the text of every report, in the same order as report_urls.
ad_text = [get_text(report_url) for report_url in report_urls]

In [11]:
# one entry was appended per report url, so this should equal len(report_urls)
len(ad_text)

654

In [12]:
# Put the scraped texts in a one-column DataFrame (the column is labelled 0 until renamed).
df_texas = pd.DataFrame(ad_text)

In [13]:
# Give the single column (labelled 0) the name 'text'.
df_texas = df_texas.rename(columns={0: 'text'})

In [14]:
# create a new dataframe with ad_id, ad_url, report_url, and text

In [15]:
# ad_id and ad_url of the text ads, renumbered from 0.
text_ads = df.loc[df['ad_type'] == 'Text', ['ad_id', 'ad_url']]
df_new = text_ads.reset_index(drop=True)
df_new

Unnamed: 0,ad_id,ad_url
0,CR134968385204125696,https://transparencyreport.google.com/politica...
1,CR531379229376380928,https://transparencyreport.google.com/politica...
2,CR118816009636282368,https://transparencyreport.google.com/politica...
3,CR134265247518162944,https://transparencyreport.google.com/politica...
4,CR502092019183648768,https://transparencyreport.google.com/politica...
...,...,...
649,CR10729618579390464,https://transparencyreport.google.com/politica...
650,CR120543342403518464,https://transparencyreport.google.com/politica...
651,CR544406174422794240,https://transparencyreport.google.com/politica...
652,CR242540204819218432,https://transparencyreport.google.com/politica...


In [16]:
# Place ids/urls, report urls and scraped texts side by side (aligned on the index).
df_full = pd.concat((df_new, df_report, df_texas), axis='columns')

In [17]:
# save the combined table so the scrape does not have to be repeated
df_full.to_csv('full_url_text.csv')

# Clean the text ads

In [18]:
# look at the raw scraped texts before cleaning
df_texas

Unnamed: 0,text
0,"""Lydia has always stood for what's right, just..."
1,"""Glenn Hegar is a sixth generation Texan runni..."
2,"""Caitlyn Is The Change California Needs. Donat..."
3,"""Get Rid Of Gavin Once And For All. Caitlyn Ha..."
4,"""Will we join the left to take America backwar..."
...,...
649,"""California Deserves Better. California Deserv..."
650,"""Newsom Recall Is Official! Caitlyn Can Fix Ca..."
651,"""Newsom Recall Is Official! Caitlyn Can Fix Ca..."
652,"""Don't Let Chuck Schumer and Joe Biden Impleme..."


In [19]:
# If any text is missing, report how many and keep only the rows that have one.
n_missing = df_texas['text'].isnull().sum()
if n_missing:
    print (n_missing)
    df_texas = df_texas[df_texas['text'].notnull()].reset_index(drop=True)

In [20]:
# function that filters the urls and symbols out of the text
def filter_text(x):
    """Normalise one ad text for matching.

    Steps, in order:
      1. remove http/https urls;
      2. replace every character that is neither a word character nor
         whitespace with a space;
      3. lower-case each word and join the words with single spaces.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        Lower-cased text with urls and symbols removed and whitespace
        collapsed; an empty string when nothing is left.
    """
    # Raw strings keep \S, \w and \s as regex escapes instead of (invalid)
    # Python string escapes.
    without_urls = re.sub(r'http[s]?://\S+', '', x)
    without_symbols = re.sub(r'[^\w\s]', ' ', without_urls)
    # split() with no argument already treats any run of whitespace as one
    # separator, so no separate collapsing pass is needed.
    return ' '.join(word.lower() for word in without_symbols.split())

In [21]:
# Clean every text (values are cast to str first).
cleaned_text = df_texas['text'].astype(str).apply(filter_text)
df_texas['text'] = cleaned_text
df_texas.head()

Unnamed: 0,text
0,lydia has always stood for what s right just a...
1,glenn hegar is a sixth generation texan runnin...
2,caitlyn is the change california needs donate ...
3,get rid of gavin once and for all caitlyn has ...
4,will we join the left to take america backward...


In [22]:
# count how often each cleaned text occurs (20 most frequent); a count above 1 means duplicates
df_texas['text'].value_counts().head(20)

ca has had enough of newsom caitlyn can take back our state donate now help take back california newsom is a disaster donate to make ca red                                     19
newsom recall is official caitlyn can fix california donate now did you see the news gavin newsom will be recalled caitlyn can fix ca chip in 5                                 17
get rid of gavin once and for all caitlyn has a plan to save california donate today it s official newsom will be recalled caitlyn will put people over politics chip in        13
californians deserve leadership who is accountable to them caitlyn will be chip in now caitlyn will fight for you she is an outsider and a proven winner donate now             12
caitlyn is an outsider sacramento special interests have no power over her donate now drain the ca swamp newsom is controlled by special interests caitlyn is not donate now    12
californians deserve better from their governor caitlyn will fix it donate now gavin newsom orders us to 

In [23]:
# Keep the first occurrence of each row and renumber the index from 0.
df_texas = df_texas.drop_duplicates(keep='first').reset_index(drop=True)

In [24]:
# "&" shows up in the scraped text as an escape code (presumably "\u0026").
# filter_text has already turned the backslash into a space, so what is left
# in the text is the token "u0026". Series.replace() only swaps cells whose
# entire value equals the pattern, so the string accessor is used to remove
# the token inside each text; the spacing is then tidied again.
df_texas['text'] = (
    df_texas['text']
    .str.replace(r'\\?u0026', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [25]:
# look at the first ten cleaned texts
df_texas.head(10)

Unnamed: 0,text
0,lydia has always stood for what s right just a...
1,glenn hegar is a sixth generation texan runnin...
2,caitlyn is the change california needs donate ...
3,get rid of gavin once and for all caitlyn has ...
4,will we join the left to take america backward...
5,lydia stands for quality schools more jobs and...
6,states are passing laws to silence voters fede...
7,this campaign is about what we can achieve whe...
8,shawn lassiter is a former stem educator and a...
9,gov abbott does not stand with texas women he ...


# Tag the text with Spacy

In [28]:
# import the Lexicon (later cells use its keys as topics and its values as term lists)
# The file is opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
# NOTE: the recorded run of this cell failed with a JSONDecodeError pointing at
# line 374 of lexicon.json; that is a syntax problem in the file itself and has
# to be fixed there before this cell can succeed.
with open('lexicon.json', encoding='utf-8') as f1:
    dic1 = json.load(f1)
# check the keys
dic1.keys()

JSONDecodeError: Expecting ',' delimiter: line 374 column 9 (char 9053)

In [None]:
# Load the large English spaCy model (en_core_web_lg).
# Try the installed package name first so the notebook is not tied to one
# machine. If spaCy cannot resolve the name (it raises OSError), fall back to
# the full local path that the original author needed in her environment.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    nlp = spacy.load("/usr/local/Cellar/jupyterlab/3.2.9/libexec/lib/python3.9/site-packages/en_core_web_lg/en_core_web_lg-3.2.0/")

In [None]:
# the function that finds the lexicon words in the text
def find_words(x,lexicon):
    """Return the lexicon terms that occur in ``x``.

    Parameters
    ----------
    x : str
        Text to search.
    lexicon : dict
        Maps each topic to a list of terms (phrases) to look for.

    Returns
    -------
    str
        The distinct matched phrases joined with ',' in sorted order, or the
        sentinel 'no words' when nothing matches. Sorting makes the result
        stable: the same set of matches always yields the same string.
    """
    # Terms of every topic go into one matcher; the topic a term belongs to is
    # not needed here because only the matched text is returned.
    patterns = [nlp.make_doc(term) for terms in lexicon.values() for term in terms]
    if not patterns:
        return 'no words'

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TerminologyList", patterns)

    doc = nlp(x)
    words = {doc[start:end].text for _match_id, start, end in matcher(doc)}
    if not words:
        return 'no words'
    return ','.join(sorted(words))

In [None]:
# tagging the topic in each message
def find_topic(x,lexicon):
    """Translate a comma-separated word list into its topic labels.

    Parameters
    ----------
    x : str
        Either the sentinel 'no words' or words joined with ','.
    lexicon : dict
        Maps each topic to a list of terms.

    Returns
    -------
    str
        The topics that share at least one term with the words in ``x``,
        sorted and joined with ','. Empty string for 'no words' or when no
        topic matches.
    """
    entries = lexicon.items()
    if x == 'no words':
        return ''
    found = set(x.split(','))
    matched = [topic for topic, terms in entries if found & set(terms)]
    matched.sort()
    return ','.join(matched)
            

In [None]:
# Find the lexicon terms present in every text.
df_texas['words'] = df_texas['text'].astype(str).apply(find_words, args=(dic1,))

In [None]:
# Turn each word list into its topic labels.
df_texas['m_label'] = df_texas['words'].apply(find_topic, args=(dic1,))

In [None]:
# Rows with an empty label get the placeholder 'no topic'.
has_no_label = df_texas['m_label'] == ''
df_texas['m_label'] = df_texas['m_label'].mask(has_no_label, 'no topic')

In [None]:
# save the tagged ads (text, matched words, topic label)
df_texas.to_csv('texas_ads_tags.csv')

In [None]:
# save the ads whose label is exactly 'immigration'
# NOTE(review): m_label can hold several comma-joined topics; this exact match
# leaves out ads tagged with immigration plus another topic -- confirm that is intended.
df_texas[df_texas['m_label'] == 'immigration'].to_csv('immigration.csv')

In [None]:
# show full cell contents (no column-width truncation) when displaying frames
pd.set_option('display.max_colwidth', None)

In [None]:
# inspect the ads that received no topic
df_texas[df_texas['m_label'] == 'no topic']

In [None]:
# check how many ads carry each label
df_texas['m_label'].value_counts()

In [None]:
# check how often each matched-word combination occurs
df_texas['words'].value_counts()