In [1]:
import pandas as pd
import numpy as np

import re
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
from langdetect import detect

from collections import Counter

from tqdm.notebook import tqdm
import joblib

from presidio_analyzer import AnalyzerEngine

pd.set_option('display.max_columns', None)

2023-08-27 10:40:57.775719: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-27 10:40:59.816092: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-08-27 10:40:59.816163: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: jupyter-pestrada
2023-08-27 10:40:59.816174: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: jupyter-pestrada
2023-08-27 10:40:59.816289: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 515.65.1
2023-08-27 10:40:59.816319: I tensorflow/compiler/xla/stream_executor/cuda/cuda_di

# Loading and cleaning the dataset

In [2]:
fp = '../data/2023-rappler-articles-ns.csv'
df = pd.read_csv(fp)

  df = pd.read_csv(fp)


In [3]:
df.shape

(36215, 60)

In [4]:
business_df = df[df['link'].str.contains('/business/')].copy()
business_df.shape

(3997, 60)

In [5]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    cleaned_text = soup.get_text()
    return cleaned_text

business_df['content.cleaned'] = business_df['content.rendered'].apply(remove_html_tags)

In [6]:
pattern = r'[\t\n]|<[\w "@+#%&,./:;=?_-]+>'
pattern2 = r'<.*https?.*>'
pattern3 = r'<(?:a|span|div|em|strong|mark).*>'
pattern4 = r'!function.*;'
pattern5 = r'\.igframe.*}'

business_df['content.cleaned'] = (
    business_df['content.cleaned']
    .apply(unidecode)
    .apply(lambda x: re.sub(pattern, ' ', x))
    .apply(lambda x: re.sub(pattern2, ' ', x))
    .apply(lambda x: re.sub(pattern3, ' ', x))
    .apply(lambda x: re.sub(pattern4, ' ', x))
    .apply(lambda x: re.sub(pattern5, ' ', x))
    .str.replace('&#8217;', "'")
    .str.replace('&#8220;', '"')
    .str.replace('&#8221;', '"')
)

# NER
## Tagging unique entities per article

In [7]:
analyzer = AnalyzerEngine()

content_list = business_df['content.cleaned'].tolist()

named_entities_list = []
for content in tqdm(content_list):
    analysis_results = analyzer.analyze(content, language='en')
    named_entities = [content[ent.start:ent.end]
                      for ent in analysis_results 
                      if ent.entity_type == 'PERSON']
    
    named_entities_list.append(named_entities)

  0%|          | 0/3997 [00:00<?, ?it/s]

In [8]:
joblib.dump(named_entities_list, 'named_entities_list.pkl')

['named_entities_list.pkl']

In [9]:
named_entities_list[0:5]

[[],
 ['Eric Peter Roxas'],
 ['James Patrick Cruz'],
 ['Soccsksargen',
  'Edgar Galvante',
  'Rodrigo Duterte',
  'Driver',
  'Lance Spencer Yu/Rappler.com  '],
 ["Ferdinand Marcos Jr's"]]

In [10]:
unique_entities = [set(x) for x in named_entities_list]

In [11]:
usecols=[
    'id',
    'link',
    'date',
    'content.rendered',
    'content.cleaned',
    'categories',
    'authorship',
    'tags'
]

z = business_df[usecols].copy()
z.loc[:, 'unique_entities'] = unique_entities

In [12]:
z.head()

Unnamed: 0,id,link,date,content.rendered,content.cleaned,categories,authorship,tags,unique_entities
26,2452896,https://www.rappler.com/business/list-flights-...,2023-07-24T12:43:49,"\n<p>MANILA, Philippines – Various airports an...","MANILA, Philippines - Various airports and ai...","[621, 624]",[6936],"[2283, 2487, 2802]",{}
34,2452759,https://www.rappler.com/business/repower-energ...,2023-07-24T10:44:42,"\n<p>MANILA, Philippines – Repower Energy Deve...","MANILA, Philippines - Repower Energy Developm...","[621, 625, 626]",[4952],"[2716, 2520, 2266]",{Eric Peter Roxas}
111,2449638,https://www.rappler.com/business/coa-finds-ant...,2023-07-22T14:42:06,"\n<p>MANILA, Philippines – The Commission on A...","MANILA, Philippines - The Commission on Audit...","[621, 622]",[4999],"[73, 1964]",{James Patrick Cruz}
127,2451576,https://www.rappler.com/business/land-transpor...,2023-07-22T11:20:47,"\n<p>MANILA, Philippines – More than 66% of dr...","MANILA, Philippines - More than 66% of drivin...","[621, 622]",[4999],"[1964, 2103]","{Driver, Rodrigo Duterte, Soccsksargen, Lance ..."
146,2451174,https://www.rappler.com/business/sim-card-regi...,2023-07-21T19:56:01,"\n<p>MANILA, Philippines– The Department of In...","MANILA, Philippines- The Department of Inform...","[621, 628, 622]",[4999],"[1991, 2494]",{Ferdinand Marcos Jr's}


## Cleaning named entities

In [13]:
def scrape_author_info(article_url):
    response = requests.get(article_url)
    if response.status_code == 200:
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')

        author_element = soup.find('a', class_='post-single__author')
        
        if author_element:
            author_name = author_element.get_text(strip=True)
            return author_name
        else:
            return np.nan
    else:
        return np.nan

In [14]:
an_list = list()

for url in tqdm(z['link'].tolist()):
    auth_name = scrape_author_info(url)
    an_list.append(auth_name)
    
z['author.name'] = an_list
z.head()

  0%|          | 0/3997 [00:00<?, ?it/s]

Unnamed: 0,id,link,date,content.rendered,content.cleaned,categories,authorship,tags,unique_entities,author.name
26,2452896,https://www.rappler.com/business/list-flights-...,2023-07-24T12:43:49,"\n<p>MANILA, Philippines – Various airports an...","MANILA, Philippines - Various airports and ai...","[621, 624]",[6936],"[2283, 2487, 2802]",{},Lance Spencer Yu
34,2452759,https://www.rappler.com/business/repower-energ...,2023-07-24T10:44:42,"\n<p>MANILA, Philippines – Repower Energy Deve...","MANILA, Philippines - Repower Energy Developm...","[621, 625, 626]",[4952],"[2716, 2520, 2266]",{Eric Peter Roxas},Ralf Rivas
111,2449638,https://www.rappler.com/business/coa-finds-ant...,2023-07-22T14:42:06,"\n<p>MANILA, Philippines – The Commission on A...","MANILA, Philippines - The Commission on Audit...","[621, 622]",[4999],"[73, 1964]",{James Patrick Cruz},
127,2451576,https://www.rappler.com/business/land-transpor...,2023-07-22T11:20:47,"\n<p>MANILA, Philippines – More than 66% of dr...","MANILA, Philippines - More than 66% of drivin...","[621, 622]",[4999],"[1964, 2103]","{Driver, Rodrigo Duterte, Soccsksargen, Lance ...",
146,2451174,https://www.rappler.com/business/sim-card-regi...,2023-07-21T19:56:01,"\n<p>MANILA, Philippines– The Department of In...","MANILA, Philippines- The Department of Inform...","[621, 628, 622]",[4999],"[1991, 2494]",{Ferdinand Marcos Jr's},


In [16]:
text = business_df['content.cleaned'].iloc[3]

def regex_author_name(text):
    pattern = r'-\s?([A-z ]+/)?Rappler.com'
    matches = re.findall(pattern, text)

    if matches:
        captured_name = matches[0]
        return captured_name
    else:
        return None

# regex_author_name(text)

In [17]:
# business_df['content.cleaned'].apply(regex_author_name).unique()

In [18]:
known_authors = list(set(['James Patrick Cruz', 'Lance Spencer Yu', 'Dennis Abrina',
                'Lance Yu', 'Martha Teodoro', 'Jodesz Gavilan', 'Joann Manabat',
                'Chris Burnet Ramos', 'Michelle Abad', 'Lorenz Pasion', 
                'Ralf Rivas', 'Bea Cupin', 'Bonz Magsambol', 'Lian Buan',
                'Sofia Tomacruz', 'Eirenne Lumasang', 'Ryan Macasero'] +
                 z['author.name'].dropna().unique().tolist()))

known_authors = known_authors + [f'{n}/Rappler.com' for n in known_authors]

In [19]:
z['unique_entities'] = [
    [y for y in x if y not in known_authors]
    for x in z['unique_entities']
]

In [20]:
rel_z = z[z['unique_entities'].apply(len)>1].copy()

In [21]:
# Counter(rel_z['unique_entities'].explode())

In [22]:
cleaned_entities = [
    [y for y in x if len(y.split()) > 1]
    for x in rel_z['unique_entities']
]

In [23]:
tl_entities = []

for x in tqdm(cleaned_entities):
    filtered_list = []
    for y in x:
        try:
            if detect(y) == 'tl' and y not in known_authors:
                tl_entities.append(y)
        except:
            pass  
    tl_entities.append(filtered_list)

  0%|          | 0/3426 [00:00<?, ?it/s]

In [24]:
# print(sorted(tl_entities, key=lambda x: len(x)))

In [25]:
err_tl = {'2137/2138 Manila-Bacolod',
        'Ko Lang',
        'Ang\nIn',
        'Ang Batang Quiapo',
        'Ayala Land',
        'Ayala Land Premier',
        'Ayala Malls',
        'Bahay Ko Program'
        'Eat Bulaga',
        'Bantay Presyo',
        'Bayan Muna',
        'Batasang Pambansa',
        'Gising Pilipinas',
        'Lingkod Kapamilya',
        'Magandang Buhay',
        'Magandang Gabi Bayan',
        'Samahang Manggagawa',
        'Sa Kongreso',
        'Swiss Singapore',
        'Tahanang Walang',
        'Walang Personalan'}

names_tl = set([x for x in tl_entities if x!=[] and x.istitle()])

names_tl = names_tl.difference(err_tl)

tl_entities = set([x for x in tl_entities if x!=[]])
tl_phrases = tl_entities.difference(names_tl)

In [26]:
# freaking_list
# 'para ma-maintain',
# 'ka magbi-bid',
# 'DZMM TeleRadyo',
# 'Isyu Spotted',
# 'Kaya Mo',

In [27]:
cleaned_entities = [
    [y for y in x if y not in tl_phrases]
    for x in cleaned_entities
]

In [29]:
cleaned_entities = [
    [
        (
            re.sub('\s[A-Z]\. ',
                   '',
                   y.replace('-',' ').replace("’s", '').strip())
        )
        .replace('.', '')
        for y in x
        if y not in tl_phrases
    ]
    for x in cleaned_entities
    
]

# Sentiment Scoring

In [31]:
rel_z.head()

Unnamed: 0,id,link,date,content.rendered,content.cleaned,categories,authorship,tags,unique_entities,author.name
127,2451576,https://www.rappler.com/business/land-transpor...,2023-07-22T11:20:47,"\n<p>MANILA, Philippines – More than 66% of dr...","MANILA, Philippines - More than 66% of drivin...","[621, 622]",[4999],"[1964, 2103]","[Driver, Rodrigo Duterte, Soccsksargen, Lance ...",
189,2450537,https://www.rappler.com/business/how-to-get-sp...,2023-07-21T11:09:26,"\n<p>MANILA, Philippines – If you’ve dreamt ab...","MANILA, Philippines - If you've dreamt about ...","[621, 625]",[6936],"[2460, 1222, 18533]","[P365,394.24, Edgar Injap]",Lance Spencer Yu
192,2450569,https://www.rappler.com/business/updates-globa...,2023-07-21T09:55:00,"\n<p>NEW YORK, USA – The dollar rallied and a ...","NEW YORK, USA - The dollar rallied and a gaug...","[621, 626]",[5083],"[2625, 2181]","[Graham, Daleep Singh, Tayyip Erdogan, Kazuo U...",
194,2450555,https://www.rappler.com/business/india-imposes...,2023-07-21T09:30:00,"\n<p>NEW DELHI, India – India on Thursday, Jul...","NEW DELHI, India - India on Thursday, July 20...","[621, 622, 624]",[5083],"[2531, 2513, 1140]","[Haryana, Narendra Modi, Rice, Rao]",
219,2450113,https://www.rappler.com/business/security-bank...,2023-07-20T14:38:32,"\n<p>MANILA, Philippines – Some <a href=""https...","MANILA, Philippines - Some Security Bank cust...","[621, 625, 626]",[6936],"[1450, 1566]","[Tanya Ansaldo-Deakin, Rappler]",Lance Spencer Yu


In [37]:
from textblob import TextBlob

def ave_sentiment(text):
    blob = TextBlob(text)
    scores = list()
    for sentence in blob.sentences:
        scores.append(sentence.sentiment.polarity)
    return np.mean(scores)

In [38]:
rel_z['sentiment_score'] = rel_z['content.cleaned'].apply(ave_sentiment)

rel_z.head()

Unnamed: 0,id,link,date,content.rendered,content.cleaned,categories,authorship,tags,unique_entities,author.name,sentiment_score
127,2451576,https://www.rappler.com/business/land-transpor...,2023-07-22T11:20:47,"\n<p>MANILA, Philippines – More than 66% of dr...","MANILA, Philippines - More than 66% of drivin...","[621, 622]",[4999],"[1964, 2103]","[Driver, Rodrigo Duterte, Soccsksargen, Lance ...",,0.053582
189,2450537,https://www.rappler.com/business/how-to-get-sp...,2023-07-21T11:09:26,"\n<p>MANILA, Philippines – If you’ve dreamt ab...","MANILA, Philippines - If you've dreamt about ...","[621, 625]",[6936],"[2460, 1222, 18533]","[P365,394.24, Edgar Injap]",Lance Spencer Yu,0.113437
192,2450569,https://www.rappler.com/business/updates-globa...,2023-07-21T09:55:00,"\n<p>NEW YORK, USA – The dollar rallied and a ...","NEW YORK, USA - The dollar rallied and a gaug...","[621, 626]",[5083],"[2625, 2181]","[Graham, Daleep Singh, Tayyip Erdogan, Kazuo U...",,0.032747
194,2450555,https://www.rappler.com/business/india-imposes...,2023-07-21T09:30:00,"\n<p>NEW DELHI, India – India on Thursday, Jul...","NEW DELHI, India - India on Thursday, July 20...","[621, 622, 624]",[5083],"[2531, 2513, 1140]","[Haryana, Narendra Modi, Rice, Rao]",,-0.001325
219,2450113,https://www.rappler.com/business/security-bank...,2023-07-20T14:38:32,"\n<p>MANILA, Philippines – Some <a href=""https...","MANILA, Philippines - Some Security Bank cust...","[621, 625, 626]",[6936],"[1450, 1566]","[Tanya Ansaldo-Deakin, Rappler]",Lance Spencer Yu,-0.007


In [39]:
# rel_z.to_csv('../data/rappler-business-with-ner-wip-sentiment.csv', index=False)