# Discovering insights from VP Duterte speeches


## Set up the environment


In [1]:
import datetime
from collections import Counter

import pandas as pd
import spacy
import calamancy

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load one pipeline per language: `en_nlp` is the spaCy English model and
# `tl_nlp` is the calamanCy Tagalog model. Both are applied to the same
# speeches in the tokenization cells further down.
en_nlp = spacy.load("en_core_web_md")
tl_nlp = calamancy.load("tl_calamancy_md-0.1.0")



## Dataset preparation

In [3]:
# Read the speeches from a CSV file in the working directory and preview the
# first rows.
# NOTE(review): `date` is loaded as a plain object (string) column; if date
# arithmetic is needed later, consider parsing it at load time — TODO confirm.
df = pd.read_csv("speeches.csv")
df.head()

Unnamed: 0,title,link,date,content
0,ISO Recertification of the Office of the Vice ...,https://www.ovp.gov.ph/post/iso-recertificatio...,2024-01-15,Office of the Vice President Mandaluyong City ...
1,VPSD Speech on Go Negosyo's 18th anniversary,https://www.ovp.gov.ph/post/vpsd-speech-go-neg...,2023-11-27,Assalamualaikum… Madayaw ug maayong gabii kani...
2,VPSD Speech for National Children's Month Culm...,https://www.ovp.gov.ph/post/vpsd-speech-nation...,2023-11-27,"November 27, 2023, at 2:00 PM Ladies and gentl..."
3,VPSD Speech for the Gift-Giving in Binalonan,https://www.ovp.gov.ph/post/vpsd-speech-gift-g...,2023-11-25,"Gift-giving to 1,500 receipients Date of Event..."
4,VP Sara speech for the Renaming of Agham Road ...,https://www.ovp.gov.ph/post/vp-sara-speech-for...,2023-11-17,"Quezon City Mga Kababayan,Ladies and Gentlemen..."


In [4]:
# Check column dtypes and non-null counts before analysing the text.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    54 non-null     object
 1   link     54 non-null     object
 2   date     54 non-null     object
 3   content  54 non-null     object
dtypes: object(4)
memory usage: 1.8+ KB


## Exploratory Data Analysis

In [5]:
# Tokenization using spaCy English model
#
# Gather the text of every token across all speeches, skipping punctuation,
# whitespace, and anything the English model flags as a stopword. The speeches
# are streamed through `en_nlp.pipe`, which processes the texts as a batch
# instead of invoking the pipeline separately for each speech; the resulting
# tokens and their order are the same.

en_tokens = []

for doc in en_nlp.pipe(df["content"]):
    tokens = [
        token.text
        for token in doc
        if not (token.is_punct or token.is_space or token.is_stop)
    ]
    en_tokens.extend(tokens)

In [6]:
# Ten most frequent tokens kept by the English-model filter. Counting is done
# on the raw token text, so differently cased forms are tallied separately.
Counter(en_tokens).most_common(10)

[('sa', 777),
 ('na', 403),
 ('ang', 357),
 ('mga', 333),
 ('ng', 319),
 ('education', 217),
 ('po', 196),
 ('learners', 154),
 ('Education', 143),
 ('children', 127)]

In [7]:
# Tokenization using calamanCy Tagalog model
#
# Gather the text of every token across all speeches, skipping punctuation,
# whitespace, and anything the Tagalog model flags as a stopword. The speeches
# are streamed through `tl_nlp.pipe`, which processes the texts as a batch
# instead of invoking the pipeline separately for each speech; the resulting
# tokens and their order are the same.

tl_tokens = []

for doc in tl_nlp.pipe(df["content"]):
    tokens = [
        token.text
        for token in doc
        if not (token.is_punct or token.is_space or token.is_stop)
    ]
    tl_tokens.extend(tokens)

In [8]:
# Ten most frequent tokens kept by the Tagalog-model filter.
Counter(tl_tokens).most_common(10)

[('the', 2467),
 ('and', 1962),
 ('of', 1678),
 ('to', 1510),
 ('in', 819),
 ('our', 739),
 ('a', 660),
 ('that', 601),
 ('for', 544),
 ('we', 461)]

In [9]:
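# FIXME: The speeches mix Filipino and English, but each model only recognizes
# the stopwords of its own language. The English model leaves Filipino
# stopwords (e.g. "sa", "na", "ang") in its output, and the Tagalog model
# likewise leaves English stopwords (e.g. "the", "and", "of"), so neither
# frequency list is informative on its own.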