In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import json
import spacy
from spacy.matcher import PhraseMatcher

In [3]:
# function that filter the urls and symbols in the text 
def clean_text(df_text):
    def filter_text(x):
        url = 'http[s]?://\S+'
        x = re.sub(url,'',x)
        x = re.sub("[^\w\s]",' ',x) # filter symbols
        x = re.sub("\s+",' ',x)
        ls=[w.lower() for w in x.split()] 
        return ' '.join(ls)
    df_text['text'] = df_text['text'].astype(str).apply(lambda x: filter_text(x))
    df_text['text'] = df_text['text'].replace('/u0026', ' ')
    df_text.drop_duplicates(subset = 'text',keep = 'first', inplace = True)
    df_text.reset_index(drop = True,inplace = True)
    return df_text

In [4]:
# the function that find the lexicon words in the text
def find_words(x,lexicon):
    topics= lexicon.keys()  
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(x) # nlp() is spaCy 2.2 English language model 
    words= []
    for t in topics:
        matcher = PhraseMatcher(nlp.vocab)
        terms= lexicon[t]
        patterns = [nlp.make_doc(text) for text in terms]
        matcher.add("TerminologyList", patterns) # spaCy2.2 phrase matcher
        matches = matcher(doc)
        for match_id, start,end in matches:
            span = doc[start:end]
            words.append(span.text)
    if words:
        words = list(set(words))
        return ','.join(words)
    else:
        return('no words')

In [5]:
# tagging the topic in each message
def find_topic(x,lexicon):
    topics= lexicon.keys()    
    if x=='no words':
        return ''    
    if x != 'no words': 
        words = x.split(',')
        labels = []        
        for t in topics:            
            terms = lexicon[t]
            if set(words)&set(terms):
                labels.append(t)                
                #l = sorted(labels)        
        return  ','.join(sorted(labels))
                
        #return ','.join(labels)

In [15]:
def get_word_lable(df_text,lexicon):
    with open (lexicon) as f:
        dic = json.load(f)
    df_text['words'] = df_text['text'].astype(str).apply(lambda x: find_words(x,dic))
    df_text['m_label'] = df_text['words'].apply(lambda x: find_topic(x,dic))
    df_text['m_label'] = df_text['m_label'].apply(lambda x: 'no topic' if x=='' else x)
    return df_text

In [6]:
def count_topic(df_text):
    df_tag = df_text['m_label'].value_counts().rename_axis('topics').reset_index(name='counts')
    df_tag = df_tag.assign(single_topic=df_tag['topics'].str.split(',')).explode('single_topic').reset_index(drop = True)
    df_tag = df_tag.groupby('single_topic').sum().reset_index().sort_values(by = 'counts', ascending = False)
    df_tag.reset_index(drop = True, inplace = True)
    return df_tag

In [7]:
def count_word(df_text):
    df_words = df_text['words'].value_counts().rename_axis('words').reset_index(name='counts')
    df_words = df_words.assign(single_word=df_words['words'].str.split(',')).explode('single_word').reset_index(drop = True)
    df_words = df_words.groupby('single_word').sum().reset_index().sort_values(by = 'counts', ascending = False)
    df_words.reset_index(drop = True, inplace = True)
    return df_words

In [8]:
# go through the csv file, get youtube video's ids from all video ads. Add new column 'youtube_id' 
# before running this function, make sure the csv file includes ad_url
def get_youtube_id(fn): 
    df = pd.read_csv(fn)
    df = df[df['ad_type'] == 'Video'].reset_index(drop = True) # only get video ads
    df_video = df[['ad_url','advertiser_name','impressions','spend_usd']].reset_index(drop = True) 
    urls = df_video['ad_url'].to_list() # 
    youtube_ids = [] # get youtube id
    for url in urls:
        entity_id = url.split('/')[-3]
        creative_id = url.split('/')[-1]
        report_url = 'https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={}&creative_id={}&hl=en'.format(entity_id,creative_id)
        response = requests.get(report_url)
        try:
            youtube_id = response.text.split('"')[3]
            if len(list(youtube_id)) > 11: # mark ads violating google polices
                youtube_id = 'youtube_id not available: this ad violated google ad policies.' 
            #print(youtube_id + ', ' + creative_id)
        except IndexError: # mark ads cannot be loaded
            youtube_id = 'youtube_id not available: cannot load the video with this ad_url.'
            #print("can't load this video, " + report_url)
            pass
        youtube_ids.append(youtube_id)
    df_video['youtube_id'] = youtube_ids
    return df_video

In [9]:
# check all videos, see which are available and drop duplicates. Add a new column video_available
def check_video(df_video): 
  for id in df_video['youtube_id']:
    if len(list(id)) == 11:
      df_video.drop_duplicates(subset = 'youtube_id',keep = 'first', inplace = True)
      df_video.reset_index(drop = True, inplace=True)
      yes_video = df_video['youtube_id'].str.len() == 11
      df_video['video_available'] = yes_video
      return df_video

In [10]:
# use youtube_id to get captions. Add a new column youtube_captions
def get_captions(df_video):
  youtube_captions = []
  for youtube_id in df_video['youtube_id']:
    try:
      subs = YouTubeTranscriptApi.get_transcript(youtube_id)
      alist = []
      for sub in subs:
        alist.append(" " + sub['text'])
      captions = ""
      for item in alist:
        captions += item
    except Exception as e:
      captions = e   
    youtube_captions.append(captions)
  #print(len(youtube_captions))
  df_video['text'] = youtube_captions
  return df_video

# CA

In [11]:
ca_video = get_youtube_id('GoogleAds/CA.csv')
ca_video_clean = check_video(ca_video)
ca_captions = get_captions(ca_video_clean)

In [12]:
# remove Lost Debate and Turning Point
remove_advertiser = ['Lost Debate Inc','TURNING POINT USA, NFP']
ca_captions = ca_captions[-ca_captions['advertiser_name'].isin(remove_advertiser)]
ca_captions

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders sum...
1,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,C0Bwn5e2Fqc,True,[Music] [Music] all right hello and welcome a...
2,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,O8v9Nk_PKpM,True,i am a big uh supporter of the highway 37 exp...
4,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,phLp03VtiSg,True,hello and welcome everyone to another episode...
5,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,IUQYgYSl0Pc,True,yeah kind of as as we're kind of coming to a ...
6,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,VsNmklD4Lkk,True,like i said you know we're still in the midst...
8,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,8Fj9aEnCZq0,True,attention homeowners if you have a meter like...
9,https://transparencyreport.google.com/politica...,NRSC,≤ 10k,≤ 100,Yg5eDw33DFk,True,\nCould not retrieve a transcript for the vide...
10,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,n8p9bcRbLxs,True,\nCould not retrieve a transcript for the vide...
11,https://transparencyreport.google.com/politica...,NEWSOM FOR CALIFORNIA – GOVERNOR – 2022,≤ 10k,≤ 100,gwuPy9FiaIA,True,hey everybody it's gavin and i can't believe ...


In [13]:
# remove unavialable videos
ca_captions = ca_captions[ca_captions['video_available'] == True]
ca_captions_clean = clean_text(ca_captions)
ca_captions_clean

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders summ...
1,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,C0Bwn5e2Fqc,True,music music all right hello and welcome again ...
2,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,O8v9Nk_PKpM,True,i am a big uh supporter of the highway 37 expa...
3,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,phLp03VtiSg,True,hello and welcome everyone to another episode ...
4,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,IUQYgYSl0Pc,True,yeah kind of as as we re kind of coming to a c...
5,https://transparencyreport.google.com/politica...,JARHETT BLONIEN,≤ 10k,≤ 100,VsNmklD4Lkk,True,like i said you know we re still in the midst ...
6,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,8Fj9aEnCZq0,True,attention homeowners if you have a meter like ...
7,https://transparencyreport.google.com/politica...,NRSC,≤ 10k,≤ 100,Yg5eDw33DFk,True,could not retrieve a transcript for the video ...
8,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,n8p9bcRbLxs,True,could not retrieve a transcript for the video ...
9,https://transparencyreport.google.com/politica...,NEWSOM FOR CALIFORNIA – GOVERNOR – 2022,≤ 10k,≤ 100,gwuPy9FiaIA,True,hey everybody it s gavin and i can t believe i...


In [16]:
ca_captions_tagged = get_word_lable(ca_captions_clean,'lexicon.json')
ca_captions_tagged
ca_captions_tagged.to_csv('ca_captions_tagged.csv')

In [17]:
ca_topic = count_topic(ca_captions_tagged)
ca_topic

Unnamed: 0,single_topic,counts
0,economic,25
1,health,20
2,governance,18
3,education,17
4,social and cultural,17
5,safety,17
6,environment,15
7,covid,13
8,social programs,13
9,continue with the folks,10


In [18]:
ca_word = count_word(ca_captions_tagged)
ca_word

Unnamed: 0,single_word,counts
0,health,18
1,school,13
2,law,11
3,jobs,10
4,pandemic,10
...,...,...
249,immigration reform,1
250,in poverty,1
251,instability,1
252,insurance company,1


In [19]:
ca_captions_tagged['advertiser_name'].value_counts()

JARHETT BLONIEN                                                                                                                                                                                      22
SAHAK NALBANDYAN                                                                                                                                                                                      3
NEWSOM FOR CALIFORNIA – GOVERNOR – 2022                                                                                                                                                               3
Vincent Collis                                                                                                                                                                                        2
Californians for Solutions to Homelessness and Mental Health Support, a Coalition of Housing and Mental Health Experts, Concerned Taxpayers and Digital Sports Entertainment and Gaming Companies     2


# TX

In [20]:
tx_video = get_youtube_id('GoogleAds/texas.csv')
tx_video_clean = check_video(tx_video)
tx_captions = get_captions(tx_video_clean)
tx_captions

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders sum...
1,https://transparencyreport.google.com/politica...,Chad Crow,≤ 10k,≤ 100,7ne91Qj4XEo,True,i believe that free and fair elections are on...
2,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,youtube_id not available: cannot load the vide...,False,\nCould not retrieve a transcript for the vide...
3,https://transparencyreport.google.com/politica...,Allen West,≤ 10k,≤ 100,9i4lMGfNTRA,True,hey i'm ted nugent full-time i'm addicted to ...
4,https://transparencyreport.google.com/politica...,JESSICA CISNEROS FOR CONGRESS,≤ 10k,≤ 100,B1ozJ2JH8BE,True,tonight fbi agents at the home of henry cuell...
...,...,...,...,...,...,...,...
151,https://transparencyreport.google.com/politica...,Hohman for Congress,10k-100k,100-1k,8Jf4sxrOkzo,True,you know the drone is overhead but you can't ...
152,https://transparencyreport.google.com/politica...,John N. Raney,10k-100k,100-1k,_Ff2-uWTpl0,True,texans can count on john rainey rainey is fig...
153,https://transparencyreport.google.com/politica...,TEXANS FOR HENRY CUELLAR CONGRESSIONAL CAMPAIGN,10k-100k,100-1k,idEsiU0jBiA,True,[Music] get up early work hard my parents sai...
154,https://transparencyreport.google.com/politica...,Doctor Matt Rostami For Texas Campaign,10k-100k,100-1k,4ynkJPbh4Ns,True,to grow the republican party we're going to n...


In [21]:
remove_advertiser = ['Lost Debate Inc','TURNING POINT USA, NFP']
tx_captions = tx_captions[-tx_captions['advertiser_name'].isin(remove_advertiser)]
tx_captions

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders sum...
1,https://transparencyreport.google.com/politica...,Chad Crow,≤ 10k,≤ 100,7ne91Qj4XEo,True,i believe that free and fair elections are on...
3,https://transparencyreport.google.com/politica...,Allen West,≤ 10k,≤ 100,9i4lMGfNTRA,True,hey i'm ted nugent full-time i'm addicted to ...
4,https://transparencyreport.google.com/politica...,JESSICA CISNEROS FOR CONGRESS,≤ 10k,≤ 100,B1ozJ2JH8BE,True,tonight fbi agents at the home of henry cuell...
5,https://transparencyreport.google.com/politica...,JESSICA CISNEROS FOR CONGRESS,≤ 10k,≤ 100,noinAynqn28,True,\nCould not retrieve a transcript for the vide...
...,...,...,...,...,...,...,...
151,https://transparencyreport.google.com/politica...,Hohman for Congress,10k-100k,100-1k,8Jf4sxrOkzo,True,you know the drone is overhead but you can't ...
152,https://transparencyreport.google.com/politica...,John N. Raney,10k-100k,100-1k,_Ff2-uWTpl0,True,texans can count on john rainey rainey is fig...
153,https://transparencyreport.google.com/politica...,TEXANS FOR HENRY CUELLAR CONGRESSIONAL CAMPAIGN,10k-100k,100-1k,idEsiU0jBiA,True,[Music] get up early work hard my parents sai...
154,https://transparencyreport.google.com/politica...,Doctor Matt Rostami For Texas Campaign,10k-100k,100-1k,4ynkJPbh4Ns,True,to grow the republican party we're going to n...


In [22]:
# remove unavialable videos and clean texts
tx_captions_available = tx_captions[tx_captions['video_available'] == True]
tx_captions_clean = clean_text(tx_captions_available)
tx_captions_clean

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders summ...
1,https://transparencyreport.google.com/politica...,Chad Crow,≤ 10k,≤ 100,7ne91Qj4XEo,True,i believe that free and fair elections are one...
2,https://transparencyreport.google.com/politica...,Allen West,≤ 10k,≤ 100,9i4lMGfNTRA,True,hey i m ted nugent full time i m addicted to f...
3,https://transparencyreport.google.com/politica...,JESSICA CISNEROS FOR CONGRESS,≤ 10k,≤ 100,B1ozJ2JH8BE,True,tonight fbi agents at the home of henry cuella...
4,https://transparencyreport.google.com/politica...,JESSICA CISNEROS FOR CONGRESS,≤ 10k,≤ 100,noinAynqn28,True,could not retrieve a transcript for the video ...
...,...,...,...,...,...,...,...
120,https://transparencyreport.google.com/politica...,Hohman for Congress,10k-100k,100-1k,8Jf4sxrOkzo,True,you know the drone is overhead but you can t h...
121,https://transparencyreport.google.com/politica...,John N. Raney,10k-100k,100-1k,_Ff2-uWTpl0,True,texans can count on john rainey rainey is figh...
122,https://transparencyreport.google.com/politica...,TEXANS FOR HENRY CUELLAR CONGRESSIONAL CAMPAIGN,10k-100k,100-1k,idEsiU0jBiA,True,music get up early work hard my parents said t...
123,https://transparencyreport.google.com/politica...,Doctor Matt Rostami For Texas Campaign,10k-100k,100-1k,4ynkJPbh4Ns,True,to grow the republican party we re going to ne...


In [23]:
tx_captions_tagged = get_word_lable(tx_captions_clean,'lexicon.json')
tx_captions_tagged

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text,words,m_label
0,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders summ...,"transportation,greenhouse,pollution,economic i...","economic,environment,health"
1,https://transparencyreport.google.com/politica...,Chad Crow,≤ 10k,≤ 100,7ne91Qj4XEo,True,i believe that free and fair elections are one...,"election integrity,continue,voter fraud","continue with the folks,governance"
2,https://transparencyreport.google.com/politica...,Allen West,≤ 10k,≤ 100,9i4lMGfNTRA,True,hey i m ted nugent full time i m addicted to f...,"constitution,continue,addicted,stand up","continue with the folks,establishment,governan..."
3,https://transparencyreport.google.com/politica...,JESSICA CISNEROS FOR CONGRESS,≤ 10k,≤ 100,B1ozJ2JH8BE,True,tonight fbi agents at the home of henry cuella...,"corruption,fbi agents,washington,interests","establishment,governance,safety"
4,https://transparencyreport.google.com/politica...,JESSICA CISNEROS FOR CONGRESS,≤ 10k,≤ 100,noinAynqn28,True,could not retrieve a transcript for the video ...,"persian,estonian,turkish,khmer,kurdish",foreign policy
...,...,...,...,...,...,...,...,...,...
120,https://transparencyreport.google.com/politica...,Hohman for Congress,10k-100k,100-1k,8Jf4sxrOkzo,True,you know the drone is overhead but you can t h...,"guns,police,secure the border,border,vaccine,w...","covid,establishment,foreign policy,governance,..."
121,https://transparencyreport.google.com/politica...,John N. Raney,10k-100k,100-1k,_Ff2-uWTpl0,True,texans can count on john rainey rainey is figh...,"police,liberal,secure the border,border,law","governance,immigration,polarizing,safety"
122,https://transparencyreport.google.com/politica...,TEXANS FOR HENRY CUELLAR CONGRESSIONAL CAMPAIGN,10k-100k,100-1k,idEsiU0jBiA,True,music get up early work hard my parents said t...,"border security,middle class,border,tax,health...","economic,establishment,governance,health,immig..."
123,https://transparencyreport.google.com/politica...,Doctor Matt Rostami For Texas Campaign,10k-100k,100-1k,4ynkJPbh4Ns,True,to grow the republican party we re going to ne...,"insurance companies,health,expand medicaid,oba...","health,social programs"


In [24]:
tx_captions_tagged.to_csv('tx_captions_tagged.csv')

In [25]:
count_topic(tx_captions_tagged)

Unnamed: 0,single_topic,counts
0,governance,48
1,economic,35
2,immigration,29
3,polarizing,22
4,social and cultural,21
5,safety,20
6,health,19
7,establishment,19
8,foreign policy,17
9,environment,13


In [26]:
count_word(tx_captions_tagged)

Unnamed: 0,single_word,counts
0,law,30
1,border,26
2,values,12
3,washington,11
4,taxes,11
...,...,...
200,million jobs,1
201,fbi agents,1
202,federal overreach,1
203,finish the wall,1


In [27]:
tx_captions_tagged['advertiser_name'].value_counts()

Chad Crow                                            21
Beto for Texas                                       12
BRIXIUS                                              12
TEXANS FOR HENRY CUELLAR CONGRESSIONAL CAMPAIGN       8
Doctor Matt Rostami For Texas Campaign                7
C&B Printing Co Inc                                   6
JESSICA CISNEROS FOR CONGRESS                         5
John N. Raney                                         5
GREG CASAR FOR CONGRESS                               5
OKPA FOR CONGRESS                                     5
Hohman for Congress                                   3
SAHAK NALBANDYAN                                      3
J STREET                                              3
RESOURCE MEDIA A NONPROFIT CORPORATION                3
NRCC                                                  3
Allen West                                            2
MONICA DE LA CRUZ HERNANDEZ FOR US CONGRESS           2
MARKETFUEL SUBSCRIPTION SERVICES                

# NY

In [28]:
ny_video = get_youtube_id('GoogleAds/NY.csv')
ny_video_clean = check_video(ny_video)
ny_captions = get_captions(ny_video_clean)
ny_captions

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,3tbi7bkvn9g,True,these people actually hate america there's no...
1,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,2Xn9wTIo-AA,True,what is the state of washington dc your natio...
2,https://transparencyreport.google.com/politica...,Lost Debate Inc,≤ 10k,≤ 100,UfuAU9dJmvA,True,[Music] welcome to the lost debate a show for...
3,https://transparencyreport.google.com/politica...,ZELDIN FOR NEW YORK,≤ 10k,≤ 100,youtube_id not available: this ad violated goo...,False,\nCould not retrieve a transcript for the vide...
4,https://transparencyreport.google.com/politica...,Friends for Kathy Hochul,≤ 10k,≤ 100,UcQHDSvtl90,True,[Music] my dad started at a steel plant and t...
5,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,jG6NGdLUHeM,True,general michael flynn was unmasked at least 4...
6,https://transparencyreport.google.com/politica...,Friends for Kathy Hochul,≤ 10k,≤ 100,GwiWAmH9Pd4,True,i'm fighting to help families keep up with ri...
7,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,y68knSZLsdA,True,oh there we go hey there we go sam oh yeah he...
8,https://transparencyreport.google.com/politica...,Lost Debate Inc,≤ 10k,≤ 100,zyu4BUYbf4U,True,38 states have officially approved the bounda...
9,https://transparencyreport.google.com/politica...,Lost Debate Inc,≤ 10k,≤ 100,GsX_SUh-p7U,True,with all of the news about the pandemic and t...


In [29]:
remove_advertiser = ['Lost Debate Inc','TURNING POINT USA, NFP']
ny_captions = ny_captions[-ny_captions['advertiser_name'].isin(remove_advertiser)]
ny_captions

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
3,https://transparencyreport.google.com/politica...,ZELDIN FOR NEW YORK,≤ 10k,≤ 100,youtube_id not available: this ad violated goo...,False,\nCould not retrieve a transcript for the vide...
4,https://transparencyreport.google.com/politica...,Friends for Kathy Hochul,≤ 10k,≤ 100,UcQHDSvtl90,True,[Music] my dad started at a steel plant and t...
5,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,jG6NGdLUHeM,True,general michael flynn was unmasked at least 4...
6,https://transparencyreport.google.com/politica...,Friends for Kathy Hochul,≤ 10k,≤ 100,GwiWAmH9Pd4,True,i'm fighting to help families keep up with ri...
12,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders sum...
13,https://transparencyreport.google.com/politica...,Sabbath Lafleur,≤ 10k,≤ 100,PkHASyqBkRQ,True,as i speak this is the last day of january uh...
16,https://transparencyreport.google.com/politica...,Ned for CT,1M-10M,1k-50k,AbY0VFLLfJs,True,we have breaking news now it is official gove...
17,https://transparencyreport.google.com/politica...,TAXPAYERS FOR WILSON INC,100k-1M,1k-50k,TmkR9l5dWb0,True,this is johnstown the town i grew up in my da...
18,https://transparencyreport.google.com/politica...,MALONEY FOR CONGRESS,100k-1M,1k-50k,XyDh1Q28WdU,True,change doesn't come easy but it will come if ...
19,https://transparencyreport.google.com/politica...,MALONEY FOR CONGRESS,100k-1M,1k-50k,6UURc_GKLhI,True,my name is carolyn maloney i work for you


In [30]:
# remove unavialable videos
ny_captions_available = ny_captions[ny_captions['video_available'] == True]
ny_captions_clean = clean_text(ny_captions_available)
ny_captions_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text'] = df_text['text'].astype(str).apply(lambda x: filter_text(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text'] = df_text['text'].replace('/u0026', ' ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text.drop_duplicates(subset = 'text',keep = 'first', inplace = True)


Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,Friends for Kathy Hochul,≤ 10k,≤ 100,UcQHDSvtl90,True,music my dad started at a steel plant and taug...
1,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,jG6NGdLUHeM,True,general michael flynn was unmasked at least 48...
2,https://transparencyreport.google.com/politica...,Friends for Kathy Hochul,≤ 10k,≤ 100,GwiWAmH9Pd4,True,i m fighting to help families keep up with ris...
3,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders summ...
4,https://transparencyreport.google.com/politica...,Sabbath Lafleur,≤ 10k,≤ 100,PkHASyqBkRQ,True,as i speak this is the last day of january uh ...
5,https://transparencyreport.google.com/politica...,Ned for CT,1M-10M,1k-50k,AbY0VFLLfJs,True,we have breaking news now it is official gover...
6,https://transparencyreport.google.com/politica...,TAXPAYERS FOR WILSON INC,100k-1M,1k-50k,TmkR9l5dWb0,True,this is johnstown the town i grew up in my dad...
7,https://transparencyreport.google.com/politica...,MALONEY FOR CONGRESS,100k-1M,1k-50k,XyDh1Q28WdU,True,change doesn t come easy but it will come if y...
8,https://transparencyreport.google.com/politica...,MALONEY FOR CONGRESS,100k-1M,1k-50k,6UURc_GKLhI,True,my name is carolyn maloney i work for you
9,https://transparencyreport.google.com/politica...,REP. JOE MORELLE - U.S. HOUSE OF REPRESENTATIVES,100k-1M,1k-50k,ZcYpjMNoQy4,True,from ending surprise medical billing to making...


In [31]:
ny_captions_tagged = get_word_lable(ny_captions_clean,'lexicon.json')
ny_captions_tagged
ny_captions_tagged.to_csv('ny_captions_tagged.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['words'] = df_text['text'].astype(str).apply(lambda x: find_words(x,dic))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['m_label'] = df_text['words'].apply(lambda x: find_topic(x,dic))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['m_label'] = df_text['m_label'].apply(lamb

In [32]:
count_topic(ny_captions_tagged)

Unnamed: 0,single_topic,counts
0,economic,11
1,governance,10
2,social programs,8
3,no topic,6
4,safety,6
5,environment,5
6,education,4
7,health,4
8,polarizing,4
9,foreign policy,3


In [33]:
count_word(ny_captions_tagged)

Unnamed: 0,single_word,counts
0,taxes,6
1,no words,6
2,law,5
3,tax,5
4,liberal,3
...,...,...
83,michael brown,1
84,corruption,1
85,new deal,1
86,committing crimes,1


In [34]:
ny_captions_tagged['advertiser_name'].value_counts()

Friends for Kathy Hochul                            4
MALONEY FOR CONGRESS                                4
REP. JOE MORELLE - U.S. HOUSE OF REPRESENTATIVES    4
SAHAK NALBANDYAN                                    3
ZELDIN FOR NEW YORK                                 3
MARKETFUEL SUBSCRIPTION SERVICES                    2
Ned for CT                                          2
Alliance for Justice Action Campaign                2
Suraj Patel 2022                                    2
Sabbath Lafleur                                     1
TAXPAYERS FOR WILSON INC                            1
AMERICAN ACTION NETWORK                             1
1111 LLC                                            1
Republican State Leadership Committee - RSLC        1
Name: advertiser_name, dtype: int64

# CT

In [35]:
ct_video = get_youtube_id('GoogleAds/CT.csv')
ct_video_clean = check_video(ct_video)
ct_captions = get_captions(ct_video_clean)
ct_captions

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,youtube_id not available: this ad violated goo...,False,\nCould not retrieve a transcript for the vide...
1,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,Hczz2OL2oZE,True,foreign please rise for our national anthem o...
2,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,3tbi7bkvn9g,True,these people actually hate america there's no...
3,https://transparencyreport.google.com/politica...,NRSC,≤ 10k,≤ 100,5beJUEo67Q8,True,\nCould not retrieve a transcript for the vide...
4,https://transparencyreport.google.com/politica...,Lost Debate Inc,≤ 10k,≤ 100,sBJnbpNXRRQ,True,[Music] welcome to the lost debate a show for...
5,https://transparencyreport.google.com/politica...,Lost Debate Inc,≤ 10k,≤ 100,UfuAU9dJmvA,True,[Music] welcome to the lost debate a show for...
6,https://transparencyreport.google.com/politica...,Lost Debate Inc,≤ 10k,≤ 100,y-trUmtEIsM,True,[Music] welcome to the lost debate a show for...
7,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,y68knSZLsdA,True,oh there we go hey there we go sam oh yeah he...
8,https://transparencyreport.google.com/politica...,"TURNING POINT USA, NFP",≤ 10k,≤ 100,2Xn9wTIo-AA,True,what is the state of washington dc your natio...
9,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders sum...


In [36]:
remove_advertiser = ['Lost Debate Inc','TURNING POINT USA, NFP']
ct_captions = ct_captions[-ct_captions['advertiser_name'].isin(remove_advertiser)]
ct_captions

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
3,https://transparencyreport.google.com/politica...,NRSC,≤ 10k,≤ 100,5beJUEo67Q8,True,\nCould not retrieve a transcript for the vide...
9,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders sum...
10,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,fOCUqnSVIFY,True,if you've thought about going solar i want yo...
19,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,8Fj9aEnCZq0,True,attention homeowners if you have a meter like...
20,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,n8p9bcRbLxs,True,\nCould not retrieve a transcript for the vide...
22,https://transparencyreport.google.com/politica...,CT Truth PAC Inc.,1M-10M,1k-50k,quOQFzWj-NI,True,the fbi is investigating how the lamont admin...
23,https://transparencyreport.google.com/politica...,BOB FOR GOVERNOR INC,100k-1M,1k-50k,wKU7AnYiHZo,True,bob stefanowski an agenda that puts transpare...
24,https://transparencyreport.google.com/politica...,BOB FOR GOVERNOR INC,100k-1M,1k-50k,8zBTxobxS3E,True,bob stefanowski he's a fantastic listener and...
25,https://transparencyreport.google.com/politica...,MORE PERFECT UNION FOUNDATION,10k-100k,≤ 100,fiLusO7JopQ,True,what you're seeing around the country is alre...
26,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,jG6NGdLUHeM,True,general michael flynn was unmasked at least 4...


In [37]:
# remove unavialable videos
ct_captions_available = ct_captions[ct_captions['video_available'] == True]
ct_captions_clean = clean_text(ct_captions_available)
ct_captions_clean

Unnamed: 0,ad_url,advertiser_name,impressions,spend_usd,youtube_id,video_available,text
0,https://transparencyreport.google.com/politica...,NRSC,≤ 10k,≤ 100,5beJUEo67Q8,True,could not retrieve a transcript for the video ...
1,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,pzLwTOpn9mc,True,morgan president biden opened the leaders summ...
2,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,fOCUqnSVIFY,True,if you ve thought about going solar i want you...
3,https://transparencyreport.google.com/politica...,SAHAK NALBANDYAN,≤ 10k,≤ 100,8Fj9aEnCZq0,True,attention homeowners if you have a meter like ...
4,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,n8p9bcRbLxs,True,could not retrieve a transcript for the video ...
5,https://transparencyreport.google.com/politica...,CT Truth PAC Inc.,1M-10M,1k-50k,quOQFzWj-NI,True,the fbi is investigating how the lamont admini...
6,https://transparencyreport.google.com/politica...,BOB FOR GOVERNOR INC,100k-1M,1k-50k,wKU7AnYiHZo,True,bob stefanowski an agenda that puts transparen...
7,https://transparencyreport.google.com/politica...,BOB FOR GOVERNOR INC,100k-1M,1k-50k,8zBTxobxS3E,True,bob stefanowski he s a fantastic listener and ...
8,https://transparencyreport.google.com/politica...,MORE PERFECT UNION FOUNDATION,10k-100k,≤ 100,fiLusO7JopQ,True,what you re seeing around the country is alrea...
9,https://transparencyreport.google.com/politica...,MARKETFUEL SUBSCRIPTION SERVICES,≤ 10k,≤ 100,jG6NGdLUHeM,True,general michael flynn was unmasked at least 48...


In [38]:
ct_captions_tagged = get_word_lable(ct_captions_clean,'lexicon.json')
ct_captions_tagged.to_csv('ct_captions_tagged.csv')

In [39]:
count_topic(ct_captions_tagged)

Unnamed: 0,single_topic,counts
0,no topic,5
1,governance,4
2,economic,3
3,environment,3
4,foreign policy,2
5,health,2
6,polarizing,2
7,social programs,2
8,establishment,1
9,immigration,1


In [40]:
count_word(ct_captions_tagged)

Unnamed: 0,single_word,counts
0,no words,5
1,values,2
2,health,2
3,corruption,2
4,abortion,1
5,pollution,1
6,law,1
7,laws,1
8,nsa,1
9,out of pocket,1


In [41]:
ct_captions_tagged['advertiser_name'].value_counts()

SAHAK NALBANDYAN                    3
CT Truth PAC Inc.                   3
BOB FOR GOVERNOR INC                3
MARKETFUEL SUBSCRIPTION SERVICES    2
NRSC                                1
MORE PERFECT UNION FOUNDATION       1
1111 LLC                            1
Name: advertiser_name, dtype: int64