https://edition.cnn.com/


https://www.cnbc.com/weather-and-natural-disasters/
https://www.euronews.com/tag/natural-disaster

https://www.goodnewsnetwork.org/

In [161]:
# 
import os
import csv

# importing libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import os


# Selenium
from selenium import webdriver   # for webdriver
from selenium.webdriver.support.ui import WebDriverWait  # for implicit and explict waits
from selenium.webdriver.chrome.options import Options  # for suppressing the browser


#nlp
import nltk
from nltk.corpus import stopwords
from string import punctuation
import unicodedata
import contractions
STOPWORDS = set(stopwords.words('english'))


#regular expressions
import re

# pretrained (sentiment classification)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# textBlob 
from textblob import TextBlob

# flair
import flair

# loop status viewer
from tqdm import tqdm

# tools
import matplotlib.pyplot as plt
import random


# train test split
from sklearn.model_selection import train_test_split


# tensorflow libraries to train new classification model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

# parallel computing
from joblib import parallel_backend


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/salvado/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# set the driver
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

In [5]:
def goodnews_scrap(web='https://www.goodnewsnetwork.org/'):
    driver.get(web)
    soup_goodnews = BeautifulSoup(driver.page_source)
    
    article_goodnews = []
    news_list = soup_goodnews.find_all('div', class_='td-block-row')
    for news in news_list:
        for header in news.find_all('h3'):
            article_goodnews.append(header.get_text())
    return article_goodnews
    

In [6]:
def cnn_scrap(web='https://edition.cnn.com/'):
    driver.get(web)
    soup_cnn = BeautifulSoup(driver.page_source)
    article_cnn = []
    for section in soup_cnn.find_all('section')[1:]:
        if len(section.find_all('ul')) > 0:
            for ul_elem in section.find_all('ul'):
                for elem in ul_elem.find_all('li'):
                    article_cnn.append(elem.get_text())
    return article_cnn

In [7]:
articles_goodnews = goodnews_scrap()
articles_cnn = cnn_scrap()

<h3>Text preprocessing:</h3>
<ul>
    <li>remove accented characters from strings use UniDecode;</li>
    <li>remove punctuation;</li>
    <li>make everyword lowercase;</li>
    <li>if present: remove the \n \t, etc from the strings;</li>
    <li>try remove stopwords;</li>
</ul>

In [108]:
def text_preprocessing(text: str, stop_word=True)-> str:
    """
    Function to preprocess text.
    """
    tmp = []
    for word in text.split():
        if word in list(contractions.contractions_dict.keys()):
            tmp.append(contractions.contractions_dict[word])
        elif re.findall(r"[A-Za-z]+"r"'"+r"[A-Za-z]", word):
            split_at = re.search(r"[']", word).span()[0]
            word = word[0:split_at]
            tmp.append(word)
        elif re.findall(r"[a-zA-z0-9]+"r"-"+r"[a-zA-z0-9]", word):
            l =re.split(r'-', word)
            tmp.extend(l)
        else:
            tmp.append(word)
    text = ' '.join(tmp)
    text = text.translate(str.maketrans('','', punctuation))  # remove punctutations
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    if stop_word:
        text = [word for word in text.split() if word not in STOPWORDS]
        text = ' '.join(text)
    
    
    
    return text

In [176]:
def classify_text_vader(articles: list)-> pd.DataFrame:
    DS = SentimentIntensityAnalyzer()
    articles_ = []
    for article in articles:
        text_original_classified = DS.polarity_scores(article)
        text_processed = text_preprocessing(article)
        text_processed_classified = DS.polarity_scores(text_processed)
        
        articles_.append([article, text_original_classified['compound'], text_processed, text_processed_classified['compound']])
        
    return pd.DataFrame(articles_, columns=['Original', 'Original Sent', 'Processed', 'Processed Sent'])

In [156]:
def classify_text_Texblob(articles: list)-> pd.DataFrame:
    articles_ = []
    for article in articles:
        text_original_classified = TextBlob(article).sentiment
        text_processed = text_preprocessing(article)
        text_processed_classified = TextBlob(text_processed).sentiment
        
        articles_.append([article, text_original_classified, text_processed, text_processed_classified])
        
    return pd.DataFrame(articles_, columns=['Original', 'Original Sent', 'Processed', 'Processed Sent'])

In [177]:
articles_goodnews_df = classify_text_vader(articles_goodnews)
articles_goodnews_df

Unnamed: 0,Original,Original Sent,Processed,Processed Sent
0,"Good News in History, February 16",0.4404,good news history february 16,0.4404
1,Lack of Financial Literacy in Schools Inspires...,0.7506,lack financial literacy schools inspires launc...,0.7506
2,Old Batteries from Electric Vehicles Get New L...,0.0,old batteries electric vehicles get new life p...,0.0
3,"23-Year-Old Rows Solo 3,000 Miles Across Atlan...",0.0,23 year old rows solo 3000 miles across atlant...,0.0
4,"An Army of 10,000 Women Saved India’s Rarest S...",0.6369,army 10000 women saved indias rarest stork giv...,0.6369
5,World’s Known Lithium Reserves Up 40% After Co...,0.0,worlds known lithium reserves 40 colossal disc...,0.0
6,Yale Honors Work of 9-Year-Old Girl Who is Sto...,0.5106,yale honors work 9 year old girl stomping extr...,0.5106
7,"Flood of ‘Right to Repair’ Bills For Autos, Ph...",0.0,flood right repair bills autos phones tractors...,0.0
8,Researchers Successfully Turn Abandoned Oil We...,0.3182,researchers successfully turn abandoned oil we...,0.3182
9,"Biggest Coin Hoard in a Decade Worth $180,000 ...",0.2263,biggest coin hoard decade worth 180000 discove...,0.2263


In [178]:
articles_cnn_df = classify_text_vader(articles_cnn)
articles_cnn_df

Unnamed: 0,Original,Original Sent,Processed,Processed Sent
0,West's hardest task in Ukraine: Convincing Put...,-0.0258,west hardest task ukraine convincing putin los...,0.2732
1,live: Russia launches 'massive missile attack'...,-0.4767,live russia launches massive missile attack uk...,-0.4767
2,Lukashenko says he will not send troops to Ukr...,-0.4588,lukashenko says send troops ukraine unless bel...,-0.4588
3,Russian mothers send Putin a message about the...,0.0,russian mothers send putin message sons,0.0
4,On the ground: CNN reports from Belarus,0.0,ground cnn reports belarus,0.0
5,Video appears to show Ukrainians destroying we...,-0.8885,video appears show ukrainians destroying weapo...,-0.8885
6,Andrew Tate's Muslim fanbase is growing. Some ...,-0.296,andrew tate muslim fanbase growing say exploit...,-0.296
7,What 'China's Hawaii' has to do with alleged s...,0.0,hawaii alleged spy balloons across globe,0.0
8,Two miners found dead after ground collapses b...,-0.7579,two miners found dead ground collapses beneath,-0.7579
9,Haley makes first speech as candidate,0.0,haley makes first speech candidate,0.0


In [157]:
articles_goodnews_df2 = classify_text_Texblob(articles_goodnews)
articles_goodnews_df2

Unnamed: 0,Original,Original Sent,Processed,Processed Sent
0,"Good News in History, February 16","(0.7, 0.6000000000000001)",good news history february 16,"(0.7, 0.6000000000000001)"
1,Lack of Financial Literacy in Schools Inspires...,"(0.0, 0.0)",lack financial literacy schools inspires launc...,"(0.0, 0.0)"
2,Old Batteries from Electric Vehicles Get New L...,"(0.11818181818181818, 0.32727272727272727)",old batteries electric vehicles get new life p...,"(0.11818181818181818, 0.32727272727272727)"
3,"23-Year-Old Rows Solo 3,000 Miles Across Atlan...","(0.0, 0.16666666666666666)",23 year old rows solo 3000 miles across atlant...,"(0.05, 0.18333333333333335)"
4,"An Army of 10,000 Women Saved India’s Rarest S...","(0.005681818181818177, 0.4147727272727273)",army 10000 women saved indias rarest stork giv...,"(0.13636363636363635, 0.45454545454545453)"
5,World’s Known Lithium Reserves Up 40% After Co...,"(0.3, 0.8)",worlds known lithium reserves 40 colossal disc...,"(0.3, 0.8)"
6,Yale Honors Work of 9-Year-Old Girl Who is Sto...,"(0.005681818181818177, 0.7272727272727273)",yale honors work 9 year old girl stomping extr...,"(0.03712121212121212, 0.5515151515151515)"
7,"Flood of ‘Right to Repair’ Bills For Autos, Ph...","(0.2857142857142857, 0.5357142857142857)",flood right repair bills autos phones tractors...,"(0.2857142857142857, 0.5357142857142857)"
8,Researchers Successfully Turn Abandoned Oil We...,"(0.375, 0.975)",researchers successfully turn abandoned oil we...,"(0.375, 0.975)"
9,"Biggest Coin Hoard in a Decade Worth $180,000 ...","(0.15, 0.05)",biggest coin hoard decade worth 180000 discove...,"(0.15, 0.05)"


In [174]:
texto = 'It was a good time'

TextBlob(texto).sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [175]:
DS = SentimentIntensityAnalyzer()
DS.polarity_scores(texto)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [None]:
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')
s = flair.data.Sentence(sentence)
flair_sentiment.predict(s)
total_sentiment = s.labels
total_sentiment