In [1]:
import pandas as pd
import os
from os import path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from deep_translator import GoogleTranslator
from deep_translator.exceptions import TooManyRequests
import logging
import sys
from time import perf_counter

In [2]:
"""
Enable simple logging in Jupyter notebook.
"""
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

log = logging.getLogger("senti_tweet")
log.setLevel(logging.INFO)

log.info("This is some info!")
log.warning("This is a warning!")
log.error("This is a error!")

INFO:senti_tweet:This is some info!
ERROR:senti_tweet:This is a error!


In [3]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
print("Hello VADER!")

Hello VADER!


In [11]:
filename = path.join("data-sets", "examples", "hydrated", "output2020_02_sm.csv")
df = pd.read_csv(filename)

df.head(2).full_text

0    Wereldwijd groeien de zorgen om het #coronavir...
1    Of de aantallen kloppen niet. Of dit filmpje i...
Name: full_text, dtype: object

In [14]:
def classify_sentiment(sentence):
    """
    This function accepts a string and 
    """
    sid = SentimentIntensityAnalyzer()

    sentiment_dict = sid.polarity_scores(sentence)
    print(f"Overall sentiment dictionary is : {sentiment_dict}")
    print(f"sentence was rated as {sentiment_dict['neg']*100} % Negative")
    print(f"sentence was rated as {sentiment_dict['neu']*100} % Neutral")
    print(f"sentence was rated as {sentiment_dict['pos']*100} % Positive")
    if sentiment_dict["compound"] >= 0.05:
        print("Positive")
    elif sentiment_dict["compound"] <= -0.05:
        print("Negative")
    else:
        print("Neutral")

In [21]:
# Example with one sentence
sentence = "Wereldwijd groeien de zorgen om het #coronavirus. Ruim 11.000 mensen zijn inmiddels besmet in China, van wie er 258 zijn overleden."

classify_sentiment(sentence)

Overall sentiment dictionary is : {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as 0.0 % Negative
sentence was rated as 100.0 % Neutral
sentence was rated as 0.0 % Positive
Neutral


In [28]:
stop_words = set(stopwords.words("english"))


def remove_urls(sentence):
    """
    This function removes urls from a String.
    """
    return re.sub(r"https?://\S*","",sentence,flags=re.MULTILINE)


def remove_symbols(sentence):
    """
    This function removes symbols from a String.
    """
    return re.sub("[#,.!?:]", "",sentence)


def remove_stopwords(sentence):
    """
    This function removes english stopwords from a String.
    """
    word_tokens = word_tokenize(sentence)
    filtered_tokens = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = " ".join(filtered_tokens)
    return filtered_sentence


sentence = "This is sentence is about #coronavirus and this news articale published at: https://stackabuse.com/python-convert-list-to-string/"

filtered_sentence = remove_urls(sentence)
print(filtered_sentence)
filtered_sentence = remove_symbols(filtered_sentence)
print(filtered_sentence)
filtered_sentence = remove_stopwords(filtered_sentence)
print(filtered_sentence)

This is sentence is about #coronavirus and this news articale published at: 
This is sentence is about coronavirus and this news articale published at 
sentence coronavirus news articale published


In [32]:
filename = path.join("data-sets", "examples", "hydrated", "output2020_02_sm.csv")

nl_to_en = GoogleTranslator(source='nl', target='en')

stop_words = set(stopwords.words("english"))

sid = SentimentIntensityAnalyzer()

df = pd.read_csv(filename)

df["processed_text"] = df["full_text"].apply(lambda txt: nl_to_en.translate(txt))

df["processed_text"] = df["processed_text"].apply(lambda txt: txt.lower())

df["processed_text"] = df["processed_text"].apply(lambda txt: " ".join([word for word in txt.split() if word not in stop_words]))

df["scores"] = df["processed_text"].apply(lambda txt: sid.polarity_scores(txt))

df

Unnamed: 0,created_at,id,id_str,full_text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,scopes,withheld_copyright,withheld_in_countries,withheld_scope,geo,contributors,display_text_range,quoted_status_permalink,processed_text,scores
0,Sat Feb 01 00:14:00 +0000 2020,1223399059123208192,1223399059123208192,Wereldwijd groeien de zorgen om het #coronavir...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,"[0, 155]",,"concerns #coronavirus growing worldwide. 11,00...","{'neg': 0.412, 'neu': 0.485, 'pos': 0.103, 'co..."
1,Sun Feb 02 08:10:37 +0000 2020,1223881392607715328,1223881392607715328,Of de aantallen kloppen niet. Of dit filmpje i...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,"[0, 126]",,numbers wrong. video real. something megaloman...,"{'neg': 0.237, 'neu': 0.763, 'pos': 0.0, 'comp..."
2,Sun Feb 02 15:45:21 +0000 2020,1223995832548188160,1223995832548188160,Nederlanders uit Wuhan naar vliegbasis Eindhov...,"<a href=""https://zapier.com/"" rel=""nofollow"">Z...",False,,,,,...,,,,,,,"[0, 81]",,dutch people wuhan eindhoven air base #capelle...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,Sun Feb 02 20:35:29 +0000 2020,1224068845381607424,1224068845381607424,triest dat hen die vrijdden ook vaak een virus...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,"[0, 110]",,sad make love also often spread virus...eeh sp...,"{'neg': 0.329, 'neu': 0.419, 'pos': 0.251, 'co..."
4,Sun Feb 02 23:30:47 +0000 2020,1224112960953516032,1224112960953516032,Een Deep State laboratorium medewerker besmet ...,"<a href=""http://twitter.com/download/iphone"" r...",False,1.224109e+18,1.224109e+18,3368052000.0,3368052000.0,...,,,,,,,"[0, 125]",,deep state lab worker infected biological weap...,"{'neg': 0.495, 'neu': 0.505, 'pos': 0.0, 'comp..."
5,Mon Feb 03 06:37:39 +0000 2020,1224220383651516416,1224220383651516416,@aguiarjuanma @todonoticias Todos los aviones ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,1.224149e+18,1.224149e+18,823734300.0,823734300.0,...,,,,,,,"[28, 63]",,@aguiarjuanma @todonoticias todos los aviones ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
6,Tue Feb 04 08:48:55 +0000 2020,1224615809412104192,1224615809412104192,? Erster Coronavirus-Toter in Hongkong – Pfleg...,"<a href=""https://projectguide.org"" rel=""nofoll...",False,,,,,...,,,,,,,"[0, 124]",,? erster coronavirus toter hong kong – pfleger...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
7,Wed Feb 05 14:29:42 +0000 2020,1225063958031360000,1225063958031360000,"Lees ""Doorgaan Grote Prijs van China in Formul...","<a href=""https://nieuwsblik.nl"" rel=""nofollow""...",False,,,,,...,,,,,,,"[0, 146]",,"read ""continue grand prix china formula 1 unce...","{'neg': 0.121, 'neu': 0.714, 'pos': 0.165, 'co..."
8,Thu Feb 06 08:55:22 +0000 2020,1225342208196435968,1225342208196435968,F1-directeur verwacht dat coronavirus tot uits...,"<a href=""http://dailygp.com"" rel=""nofollow"">Da...",False,,,,,...,,,,,,,"[0, 99]",,f1 director expects coronavirus postpone race ...,"{'neg': 0.192, 'neu': 0.808, 'pos': 0.0, 'comp..."
9,Sat Feb 08 01:33:30 +0000 2020,1225955782883192832,1225955782883192832,Wat gebeurt er in China? https://t.co/CK7PxqrDmO,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,...,,,,,,,"[0, 24]","{'url': 'https://t.co/CK7PxqrDmO', 'expanded':...",what's happening china? https://t.co/ck7pxqrdmo,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [49]:
filename = path.join("data-sets", "examples", "hydrated", "output2020_02_sm.csv")

nl_to_en = GoogleTranslator(source='auto', target='en')

df = pd.read_csv(filename)

texts = set(df["full_text"])


try:
    start = perf_counter()
    translated_texts = nl_to_en.translate_batch(texts)
    elapsed_time = perf_counter()-start
    log.info(f"Translation took: {elapsed_time:.2f} seconds.")
except TooManyRequests as e:
    log.warning(f"Error caught: {e.message}")

se = pd.Series(translated_texts)

df["processed_texts"] = se.values

df

INFO:senti_tweet:Translation took: 1.28 seconds.


Unnamed: 0,created_at,id,id_str,full_text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,current_user_retweet,scopes,withheld_copyright,withheld_in_countries,withheld_scope,geo,contributors,display_text_range,quoted_status_permalink,processed_texts
0,Sat Feb 01 00:14:00 +0000 2020,1223399059123208192,1223399059123208192,Wereldwijd groeien de zorgen om het #coronavir...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,"[0, 155]",,No time for moorkop Pieten and racism and clim...
1,Sun Feb 02 08:10:37 +0000 2020,1223881392607715328,1223881392607715328,Of de aantallen kloppen niet. Of dit filmpje i...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,"[0, 126]",,"""Chinese approach to coronavirus hides a lot o..."
2,Sun Feb 02 15:45:21 +0000 2020,1223995832548188160,1223995832548188160,Nederlanders uit Wuhan naar vliegbasis Eindhov...,"<a href=""https://zapier.com/"" rel=""nofollow"">Z...",False,,,,,...,,,,,,,,"[0, 81]",,Concerns about the #coronavirus are growing wo...
3,Sun Feb 02 20:35:29 +0000 2020,1224068845381607424,1224068845381607424,triest dat hen die vrijdden ook vaak een virus...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,"[0, 110]",,Dutch people from Wuhan to Eindhoven Air Base ...
4,Sun Feb 02 23:30:47 +0000 2020,1224112960953516032,1224112960953516032,Een Deep State laboratorium medewerker besmet ...,"<a href=""http://twitter.com/download/iphone"" r...",False,1.224109e+18,1.224109e+18,3368052000.0,3368052000.0,...,,,,,,,,"[0, 125]",,@aguiarjuanma @todonoticias All planes have en...
5,Mon Feb 03 06:37:39 +0000 2020,1224220383651516416,1224220383651516416,@aguiarjuanma @todonoticias Todos los aviones ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,1.224149e+18,1.224149e+18,823734300.0,823734300.0,...,,,,,,,,"[28, 63]",,"Read ""Continue Grand Prix of China in Formula ..."
6,Tue Feb 04 08:48:55 +0000 2020,1224615809412104192,1224615809412104192,? Erster Coronavirus-Toter in Hongkong – Pfleg...,"<a href=""https://projectguide.org"" rel=""nofoll...",False,,,,,...,,,,,,,,"[0, 124]",,? First coronavirus death in Hong Kong - carer...
7,Wed Feb 05 14:29:42 +0000 2020,1225063958031360000,1225063958031360000,"Lees ""Doorgaan Grote Prijs van China in Formul...","<a href=""https://nieuwsblik.nl"" rel=""nofollow""...",False,,,,,...,,,,,,,,"[0, 146]",,sad that those who make love also often spread...
8,Thu Feb 06 08:55:22 +0000 2020,1225342208196435968,1225342208196435968,F1-directeur verwacht dat coronavirus tot uits...,"<a href=""http://dailygp.com"" rel=""nofollow"">Da...",False,,,,,...,,,,,,,,"[0, 99]",,Or the numbers are wrong. Or this video is not...
9,Sat Feb 08 01:33:30 +0000 2020,1225955782883192832,1225955782883192832,Wat gebeurt er in China? https://t.co/CK7PxqrDmO,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,...,,,,,,,,"[0, 24]","{'url': 'https://t.co/CK7PxqrDmO', 'expanded':...",A Deep State Lab Worker Infected With Biologic...
