In [1]:
import pandas as pd
import os
from os import path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from deep_translator import GoogleTranslator
from deep_translator.exceptions import TooManyRequests
import logging
import sys
from time import perf_counter
from logging import warning, info, error
import glob

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
print("Hello VADER!")

Hello VADER!


In [11]:
filename = path.join("data-sets", "examples", "hydrated", "output2020_02_sm.csv")
df = pd.read_csv(filename)

df.head(2).full_text

0    Wereldwijd groeien de zorgen om het #coronavir...
1    Of de aantallen kloppen niet. Of dit filmpje i...
Name: full_text, dtype: object

In [14]:
def classify_sentiment(sentence):
    """
    This function accepts a string and 
    """
    sid = SentimentIntensityAnalyzer()

    sentiment_dict = sid.polarity_scores(sentence)
    print(f"Overall sentiment dictionary is : {sentiment_dict}")
    print(f"sentence was rated as {sentiment_dict['neg']*100} % Negative")
    print(f"sentence was rated as {sentiment_dict['neu']*100} % Neutral")
    print(f"sentence was rated as {sentiment_dict['pos']*100} % Positive")
    if sentiment_dict["compound"] >= 0.05:
        print("Positive")
    elif sentiment_dict["compound"] <= -0.05:
        print("Negative")
    else:
        print("Neutral")

In [21]:
# Example with one sentence
sentence = "Wereldwijd groeien de zorgen om het #coronavirus. Ruim 11.000 mensen zijn inmiddels besmet in China, van wie er 258 zijn overleden."

classify_sentiment(sentence)

Overall sentiment dictionary is : {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
sentence was rated as 0.0 % Negative
sentence was rated as 100.0 % Neutral
sentence was rated as 0.0 % Positive
Neutral


In [28]:
stop_words = set(stopwords.words("english"))


def remove_urls(sentence):
    """
    This function removes urls from a String.
    """
    return re.sub(r"https?://\S*","",sentence,flags=re.MULTILINE)


def remove_symbols(sentence):
    """
    This function removes symbols from a String.
    """
    return re.sub("[#,.!?:]", "",sentence)


def remove_stopwords(sentence):
    """
    This function removes english stopwords from a String.
    """
    word_tokens = word_tokenize(sentence)
    filtered_tokens = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = " ".join(filtered_tokens)
    return filtered_sentence


sentence = "This is sentence is about #coronavirus and this news articale published at: https://stackabuse.com/python-convert-list-to-string/"

filtered_sentence = remove_urls(sentence)
print(filtered_sentence)
filtered_sentence = remove_symbols(filtered_sentence)
print(filtered_sentence)
filtered_sentence = remove_stopwords(filtered_sentence)
print(filtered_sentence)

This is sentence is about #coronavirus and this news articale published at: 
This is sentence is about coronavirus and this news articale published at 
sentence coronavirus news articale published


In [32]:
filename = path.join("data-sets", "examples", "hydrated", "output2020_02_sm.csv")

nl_to_en = GoogleTranslator(source='nl', target='en')

stop_words = set(stopwords.words("english"))

sid = SentimentIntensityAnalyzer()

df = pd.read_csv(filename)

df["processed_text"] = df["full_text"].apply(lambda txt: nl_to_en.translate(txt))

df["processed_text"] = df["processed_text"].apply(lambda txt: txt.lower())

df["processed_text"] = df["processed_text"].apply(lambda txt: " ".join([word for word in txt.split() if word not in stop_words]))

df["scores"] = df["processed_text"].apply(lambda txt: sid.polarity_scores(txt))

df

Unnamed: 0,created_at,id,id_str,full_text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,scopes,withheld_copyright,withheld_in_countries,withheld_scope,geo,contributors,display_text_range,quoted_status_permalink,processed_text,scores
0,Sat Feb 01 00:14:00 +0000 2020,1223399059123208192,1223399059123208192,Wereldwijd groeien de zorgen om het #coronavir...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,"[0, 155]",,"concerns #coronavirus growing worldwide. 11,00...","{'neg': 0.412, 'neu': 0.485, 'pos': 0.103, 'co..."
1,Sun Feb 02 08:10:37 +0000 2020,1223881392607715328,1223881392607715328,Of de aantallen kloppen niet. Of dit filmpje i...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,"[0, 126]",,numbers wrong. video real. something megaloman...,"{'neg': 0.237, 'neu': 0.763, 'pos': 0.0, 'comp..."
2,Sun Feb 02 15:45:21 +0000 2020,1223995832548188160,1223995832548188160,Nederlanders uit Wuhan naar vliegbasis Eindhov...,"<a href=""https://zapier.com/"" rel=""nofollow"">Z...",False,,,,,...,,,,,,,"[0, 81]",,dutch people wuhan eindhoven air base #capelle...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,Sun Feb 02 20:35:29 +0000 2020,1224068845381607424,1224068845381607424,triest dat hen die vrijdden ook vaak een virus...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,"[0, 110]",,sad make love also often spread virus...eeh sp...,"{'neg': 0.329, 'neu': 0.419, 'pos': 0.251, 'co..."
4,Sun Feb 02 23:30:47 +0000 2020,1224112960953516032,1224112960953516032,Een Deep State laboratorium medewerker besmet ...,"<a href=""http://twitter.com/download/iphone"" r...",False,1.224109e+18,1.224109e+18,3368052000.0,3368052000.0,...,,,,,,,"[0, 125]",,deep state lab worker infected biological weap...,"{'neg': 0.495, 'neu': 0.505, 'pos': 0.0, 'comp..."
5,Mon Feb 03 06:37:39 +0000 2020,1224220383651516416,1224220383651516416,@aguiarjuanma @todonoticias Todos los aviones ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,1.224149e+18,1.224149e+18,823734300.0,823734300.0,...,,,,,,,"[28, 63]",,@aguiarjuanma @todonoticias todos los aviones ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
6,Tue Feb 04 08:48:55 +0000 2020,1224615809412104192,1224615809412104192,? Erster Coronavirus-Toter in Hongkong – Pfleg...,"<a href=""https://projectguide.org"" rel=""nofoll...",False,,,,,...,,,,,,,"[0, 124]",,? erster coronavirus toter hong kong – pfleger...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
7,Wed Feb 05 14:29:42 +0000 2020,1225063958031360000,1225063958031360000,"Lees ""Doorgaan Grote Prijs van China in Formul...","<a href=""https://nieuwsblik.nl"" rel=""nofollow""...",False,,,,,...,,,,,,,"[0, 146]",,"read ""continue grand prix china formula 1 unce...","{'neg': 0.121, 'neu': 0.714, 'pos': 0.165, 'co..."
8,Thu Feb 06 08:55:22 +0000 2020,1225342208196435968,1225342208196435968,F1-directeur verwacht dat coronavirus tot uits...,"<a href=""http://dailygp.com"" rel=""nofollow"">Da...",False,,,,,...,,,,,,,"[0, 99]",,f1 director expects coronavirus postpone race ...,"{'neg': 0.192, 'neu': 0.808, 'pos': 0.0, 'comp..."
9,Sat Feb 08 01:33:30 +0000 2020,1225955782883192832,1225955782883192832,Wat gebeurt er in China? https://t.co/CK7PxqrDmO,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,...,,,,,,,"[0, 24]","{'url': 'https://t.co/CK7PxqrDmO', 'expanded':...",what's happening china? https://t.co/ck7pxqrdmo,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [10]:
def classify_dataset_for_sentiment(dataset_path, overwrite_cache=True):
    """
    @param: dataset_path
    """
    if not os.path.exists(dataset_path) and not os.path.isdir(dataset_path):
        raise FileNotFoundError("File 'dataset_path' does not exist or is not a directory.")

    parentdir = os.path.dirname(dataset_path)
    # create a path to directory that is a sibling of 'dirpath' variable.
    processeddir = os.path.join(parentdir, "processed")
    # create a directory called 'processed' for the processed dataset if it does not yet exist.
    if not os.path.exists(processeddir):
        os.mkdir(processeddir)
        info(f"Created directory: {processeddir}.")

    filenames = [file for file in os.listdir(dataset_path) if file.endswith(".csv")]
    for file in filenames:
        filepath = os.path.join(dataset_path, file)
        processed_filepath = os.path.join(processeddir, file)
        if not overwrite_cache and os.path.exists(processed_filepath):
            info(f"File already exists in: {processed_filepath}; skipping.")
            continue

        df = pd.read_csv(filepath)
        df = classify_sentiment(df, column_name="processed_text")
        df.to_csv(processed_filepath)
        info(f"Created new processed file in: {processed_filepath}.")

    return True


# Only instantiate once.
sid = SentimentIntensityAnalyzer()


def classify_sentiment(df, column_name):
    """
    """
    df["polarity_scores"] = df[column_name].apply(lambda txt: dict(sid.polarity_scores(txt)))
    return df

In [14]:
df = pd.read_csv("data-sets/examples/processed/output2020_02_sm.csv",
                 index_col="id",
                 dtype={"id": "int64"},
                 parse_dates=["created_at"])

df = classify_sentiment(df, column_name="processed_texts")

df.polarity_scores.values[0]["neg"]

0.239

In [3]:
def read_hydrated_csv(filename):
    df = pd.read_csv(filename,
                     index_col="id",
                     usecols=["id", "full_text", "created_at"],
                     dtype={"id": "int64"},
                     parse_dates=["created_at"])
    return df

In [21]:
dataset_path = os.path.join("data-sets",
                            "Lopez1",
                            "hydrated",
                            "*.csv")

df = pd.read_csv(dataset_path,
                 index_col="id",
                 usecols=["id", "full_text", "created_at"],
                 dtype={"id": "int64"},
                 parse_dates=["created_at"])

df

FileNotFoundError: [Errno 2] No such file or directory: 'data-sets/Lopez1/hydrated/*.csv'

In [12]:
dataset_path = os.path.join("data-sets",
                            "Lopez1",
                            "hydrated",
                            "*.csv")

filenames = glob.glob(dataset_path)


df = None
for filename in filenames:

    if df is None:
        df = read_hydrated_csv(filename)
        print(f"Created df starting with file: {filename}.")
    else:
        df = df.append(read_hydrated_csv(filename))
        print(f"Appended df with file: {filename}.")

df

Created df starting with file: data-sets/Lopez1/hydrated/output2021_01.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2021_03.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2021_05.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2020_09.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2021_02.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2021_07.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2020_08.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2020_07.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2020_11.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2020_02.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2021_04.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2020_05.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2021_10.csv.
Appended df with file: data-sets/Lopez1/hydrated/output2020_12.csv.
Appended df with file: data-sets/Lopez1/

Unnamed: 0_level_0,created_at,full_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1345299235424628736,2021-01-02 09:21:46+00:00,Den deze is zo zot als een #helfie achterdeur ...
1346355633960980480,2021-01-05 07:19:31+00:00,GELD GELD GELD!!!\nDe Covid vaccins draaien om...
1346870290882711552,2021-01-06 17:24:35+00:00,@Rapidrobbie69 @vandeambulance Je ziet de elle...
1347277253147844608,2021-01-07 20:21:43+00:00,Praktijk Ruggespraak sluit zich aan bij Landel...
1347476921777614848,2021-01-08 09:35:07+00:00,Toont houding @vvd belangen bepalen #pandemies...
...,...,...
1405050783352111104,2021-06-16 06:33:05+00:00,@hugodejonge Moet u en @MinVWS niet transparan...
1402962164332253184,2021-06-10 12:13:40+00:00,RT @fangshimin: 浙江瑞安湖岭发现一个从意大利返乡人员被新冠感染，连夜封路、全...
1401480853021134848,2021-06-06 10:07:28+00:00,@smoldeputybean Stop acting like you don’t know!
1400708377383145472,2021-06-04 06:57:55+00:00,RT @crystaal__cast: I miss these friendships lol


In [13]:
df.sort_values("created_at")

Unnamed: 0_level_0,created_at,full_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1223397541212905472,2020-02-01 00:07:58+00:00,"Jan 2020, Taal, Wuhan, Brexit, WisdomToothExit..."
1223397812752150528,2020-02-01 00:09:02+00:00,Behahaha 👍👍😭😭😭 https://t.co/v1i1MBEsy6
1223397854921682944,2020-02-01 00:09:12+00:00,2020年、令和最初の初日の出を見るならココ!!\nとにかく美しい日の出スポット。\n都心と...
1223397888031625216,2020-02-01 00:09:20+00:00,Zelfs #LaGomera 😳 https://t.co/m1of3yn9ua
1223397900740386816,2020-02-01 00:09:23+00:00,"Lees ""Meer dan 250 doden door nieuw coronaviru..."
...,...,...
1464382501414219776,2021-11-26 23:56:09+00:00,"@RexChapman This gives me the ""feels"" wonderful."
1464382772592750592,2021-11-26 23:57:14+00:00,"Wederom een mislukte editie, waar we als kijke..."
1464382838934061056,2021-11-26 23:57:30+00:00,@Cultuurfilosoof @JJ_Almekinders En kinderen m...
1464383115502174208,2021-11-26 23:58:36+00:00,De leugens ontmaskerd. De schofterige onbeleze...


In [7]:
output2020_02 = os.path.join("data-sets",
                              "Lopez1",
                              "hydrated",
                              "output2020_02.csv")


df = pd.read_csv(output2020_02,
                 index_col="id",
                 usecols=["id", "full_text", "created_at"],
                 dtype={"id": "int64"},
                 parse_dates=["created_at"])

output2020_03 = os.path.join("data-sets",
                              "Lopez1",
                              "hydrated",
                              "output2020_03.csv")

df = df.append(pd.read_csv(output2020_02,
                           index_col="id",
                           usecols=["id", "full_text", "created_at"],
                           dtype={"id": "int64"},
                           parse_dates=["created_at"]))

output2020_04 = os.path.join("data-sets",
                             "Lopez1",
                             "hydrated",
                             "output2020_04.csv")

df = df.append(read_hydrated_csv(output2020_04))


output2021_02 = os.path.join("data-sets",
                             "Lopez1",
                             "hydrated",
                             "output2021_02.csv")

df = df.append(read_hydrated_csv(output2021_02))

print(len(df))

output2021_01 = os.path.join("data-sets",
                             "Lopez1",
                             "hydrated",
                             "output2021_01.csv")

df = df.append(read_hydrated_csv(output2021_01))

len(df)

190351


352301

In [30]:
pd.read_csv("data-sets/Lopez1/hydrated/output2021_02.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,created_at,id,id_str,full_text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,matching_rules,current_user_retweet,scopes,withheld_copyright,withheld_in_countries,withheld_scope,geo,contributors,display_text_range,quoted_status_permalink
0,Mon Feb 01 09:48:19 +0000 2021,1356177551195049984,1356177551195049984,"@anetverdonk @FransBrom Tot je zelf 66 bent, o...","<a href=""http://twitter.com/download/iphone"" r...",False,1.355974e+18,1.355974e+18,5.996231e+08,5.996231e+08,...,,,,,,,,,"[24, 162]",
1,Mon Feb 01 10:48:34 +0000 2021,1356192714757468160,1356192714757468160,@NOS Dit wist iedereen die wakker was al sinds...,"<a href=""http://twitter.com/download/android"" ...",False,1.356187e+18,1.356187e+18,7.174972e+06,7.174972e+06,...,,,,,,,,,"[5, 264]",
2,Mon Feb 01 14:56:23 +0000 2021,1356255078349500416,1356255078349500416,@howardkomproe Corona gedoogbeleid,"<a href=""http://twitter.com/download/android"" ...",False,1.356227e+18,1.356227e+18,3.971281e+07,3.971281e+07,...,,,,,,,,,"[15, 34]",
3,Mon Feb 01 16:28:41 +0000 2021,1356278308560896000,1356278308560896000,"Lockdown wordt met drie weken verlengd, nieuwe...","<a href=""https://about.twitter.com/products/tw...",False,,,,,...,,,,,,,,,"[0, 93]",
4,Tue Feb 02 07:26:04 +0000 2021,1356504143045808128,1356504143045808128,RT @maniarinban: @T_Thamizarasan சங்கி தமிழரசன...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,,"[0, 78]",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103537,Sat Feb 06 10:07:55 +0000 2021,1357994421598498816,1357994421598498816,"@hiddevanderlouw Beste Hidde, ik heb zelf doch...","<a href=""http://twitter.com/download/android"" ...",False,1.357954e+18,1.357954e+18,5.302855e+07,5.302855e+07,...,,,,,,,,,"[17, 252]",
103538,Sat Feb 06 14:48:54 +0000 2021,1358065133864517632,1358065133864517632,@Est1818 @Onair58 @robertjensen @harryhol In j...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,1.358064e+18,1.358064e+18,3.407834e+09,3.407834e+09,...,,,,,,,,,"[42, 321]",
103539,Sat Feb 13 08:44:28 +0000 2021,1360510138562588672,1360510138562588672,@OmerVandevelde Toch wel...vrouw van een colle...,"<a href=""http://twitter.com/#!/download/ipad"" ...",False,1.359978e+18,1.359978e+18,1.268266e+18,1.268266e+18,...,,,,,,,,,"[16, 242]",
103540,Tue Feb 23 19:22:34 +0000 2021,1364294597246144512,1364294597246144512,@meulmart @Marc_y_marc1 @malchir @ELOTTELINI @...,"<a href=""http://twitter.com/#!/download/ipad"" ...",False,,,,,...,,,,,,,,,"[60, 272]",


In [37]:
filename = "data-sets/Lopez1/hydrated/output2021_01.csv"

for skip_blank_lines in [True, False]:
    print(f"Skipping blank lines: {skip_blank_lines}")

    df = pd.read_csv(filename, skip_blank_lines=skip_blank_lines)
    print(f"Row count: {len(df)}.")
    print(f"Unique values: {df[df.columns[1]].unique()}.")

Skipping blank lines: True
Row count: 161950.
Unique values: [1345299235424628736 1346355633960980480 1346870290882711552 ...
 1345699763371565056 1354352374811422720 1355863770669658112].
Skipping blank lines: False
Row count: 161950.
Unique values: [1345299235424628736 1346355633960980480 1346870290882711552 ...
 1345699763371565056 1354352374811422720 1355863770669658112].
