In [4]:
import numpy as np
import pandas as pd
import gzip
import json
from pathlib import Path

In [27]:
import nltk
nltk.download('stopwords')
import re
import string
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from unidecode import unidecode
from textblob import TextBlob

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shanmukh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the spaCy pipeline used by clean_string's stem='Spacy' branch.
nlp = spacy.load('en_core_web_lg')

# Lazily-built cache of the stop-word set, so the NLTK corpus is read once
# instead of once per cleaned string. Re-running this cell resets the cache.
_STOPWORD_CACHE = None


def _stopword_set():
    """Return the English NLTK stop words plus 'rt' as a cached frozenset."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(nltk.corpus.stopwords.words("english")) | {'rt'}
    return _STOPWORD_CACHE


def clean_string(text, stem="None"):
    """Normalise a piece of text into a space-separated string of tokens.

    Steps, in order: lowercase, turn line breaks into spaces, strip ASCII
    punctuation, drop English stop words (plus 'rt'), drop every token that
    contains a digit, then optionally stem or lemmatize.

    Parameters
    ----------
    text : str
        The text to clean.
    stem : str, default "None"
        'Stem'  -> NLTK PorterStemmer,
        'Lem'   -> NLTK WordNetLemmatizer,
        'Spacy' -> lemmas from the module-level spaCy pipeline ``nlp``,
        anything else -> tokens are left as they are.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Make lower
    text = text.lower()

    # Replace line breaks with a space. Substituting '' (as before) glued the
    # last word of one line to the first word of the next ("a\nb" -> "ab").
    text = re.sub(r'\n', ' ', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words.
    # NOTE(review): punctuation is stripped first, so contractions reach this
    # filter without their apostrophe (e.g. "dont") -- confirm that is intended.
    useless_words = _stopword_set()
    tokens = [word for word in text.split() if word not in useless_words]

    # Remove every token that contains a digit
    tokens = [w for w in tokens if not re.search(r'\d', w)]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        text_stemmed = [stemmer.stem(y) for y in tokens]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in tokens]
    elif stem == 'Spacy':
        doc = nlp(' '.join(tokens))
        text_stemmed = [y.lemma_ for y in doc]
    else:
        text_stemmed = tokens

    return ' '.join(text_stemmed)

In [28]:
# Sub-directory names that the processing cells iterate over under 'raw_data/'.
data_dirs = ['2020-11', '2020-12', '2021-01', '2021-02', '2021-12', '2022-01']
# Show full column contents (no truncation) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

# Cleaning with Spacy, adding score using TextBlob

In [None]:
# For every raw daily dump: keep English tweets, strip mentions and links,
# clean the text with clean_string(stem='Spacy'), score polarity with TextBlob
# and write one CSV per input file under spacy_textblob/<data_dir>/.
for data_dir in data_dirs:
    out_dir = Path('spacy_textblob') / data_dir
    for path in Path('raw_data/' + data_dir).iterdir():
        if not path.name.endswith('.jsonl.gz'):
            continue

        with gzip.open(path) as f:
            daytweets = pd.read_json(f, lines=True)

        # keep English tweets, and only the created_at / full_text columns.
        # (Previously the English mask was computed but the unfiltered frame
        # was used, so tweets in every language were processed.)
        engtext = daytweets.loc[daytweets['lang'] == 'en', ['created_at', 'full_text']].copy()

        # remove accounts mentioned, remove link, convert to ASCII
        engtext['full_text'] = engtext['full_text'].map(
            lambda x: unidecode(' '.join(
                word for word in x.split(' ')
                if not (word.startswith('@') or word.startswith('http'))
            ))
        )

        # lowercase, remove line breaks, remove punctuation, remove stopwords, lemmatize
        engtext['full_text'] = engtext['full_text'].apply(lambda x: clean_string(x, stem='Spacy'))

        # Series.map (rather than row-wise DataFrame.apply) also works when no
        # English tweets are left in the file.
        engtext['sentiment_score'] = engtext['full_text'].map(
            lambda text: TextBlob(str(text)).sentiment.polarity
        )

        # write to CSV; path.name avoids depending on the path separator
        out_dir.mkdir(parents=True, exist_ok=True)
        csvpath = out_dir / (path.name.split('.')[0] + '.csv')
        engtext.to_csv(csvpath)

# Not cleaning, adding score using Vader

In [97]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [104]:
vaderanalyzer = SentimentIntensityAnalyzer()

# Raw files this cell is restricted to. The recorded output of this cell lists
# exactly these two files. Leave the set empty to process every '.jsonl.gz'
# file under raw_data/<data_dir>/.
ONLY_THESE_FILES = {
    'raw_data/2020-12/coronavirus-tweet-id-2020-12-31-11.jsonl.gz',
    'raw_data/2021-01/coronavirus-tweet-id-2021-01-23-11.jsonl.gz',
}

# For every selected raw dump: keep English tweets, strip mentions and links,
# attach the VADER polarity scores and write JSON Lines under raw_vader/<data_dir>/.
for data_dir in data_dirs:
    out_dir = Path('raw_vader') / data_dir
    for path in Path('raw_data/' + data_dir).iterdir():
        if not path.name.endswith('.jsonl.gz'):
            continue
        # Decide before opening the file, so skipped files are never touched.
        if ONLY_THESE_FILES and path.as_posix() not in ONLY_THESE_FILES:
            continue
        print(path)

        with gzip.open(path) as f:
            daytweets = pd.read_json(f, lines=True)

        # keep English tweets, and only the created_at / full_text columns.
        # (Previously the English mask was computed but the unfiltered frame
        # was used, so tweets in every language were scored.)
        engtext = daytweets.loc[daytweets['lang'] == 'en', ['created_at', 'full_text']].copy()

        # remove accounts mentioned, remove link, convert to ASCII
        engtext['full_text'] = engtext['full_text'].map(
            lambda x: unidecode(' '.join(
                word for word in x.split(' ')
                if not (word.startswith('@') or word.startswith('http'))
            ))
        )

        # One dict of VADER scores per tweet, as returned by polarity_scores.
        engtext['sentiment_score'] = engtext['full_text'].map(vaderanalyzer.polarity_scores)

        # write to JSON Lines; path.name avoids depending on the path separator
        out_dir.mkdir(parents=True, exist_ok=True)
        jsonpath = out_dir / (path.name.split('.')[0] + '.jsonl')
        engtext.to_json(jsonpath, orient='records', lines=True)

raw_data/2020-12/coronavirus-tweet-id-2020-12-31-11.jsonl.gz
raw_data/2021-01/coronavirus-tweet-id-2021-01-23-11.jsonl.gz


# Not cleaning, adding score using roBERTa

In [23]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

# Hugging Face model id for the sentiment model (plain string: the former
# f-string prefix had no placeholders to format).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Load tokenizer, config and classification model for that id.
# NOTE(review): the pipeline cell below is built from MODEL directly, so these
# three objects may be unused -- confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [95]:
from transformers import pipeline
sentiment_task = pipeline("sentiment-analysis", model=MODEL, tokenizer=MODEL)

# For every raw daily dump: keep English tweets, strip mentions and links,
# attach the roBERTa pipeline's prediction and write JSON Lines under
# raw_roBERTa/<data_dir>/.
for data_dir in data_dirs:
    out_dir = Path('raw_roBERTa') / data_dir
    for path in Path('raw_data/' + data_dir).iterdir():
        if not path.name.endswith('.jsonl.gz'):
            continue

        with gzip.open(path) as f:
            daytweets = pd.read_json(f, lines=True)

        # keep English tweets, and only the created_at / full_text columns.
        # (Previously the English mask was computed but the unfiltered frame
        # was used, so tweets in every language were scored.)
        engtext = daytweets.loc[daytweets['lang'] == 'en', ['created_at', 'full_text']].copy()

        # remove accounts mentioned, remove link, convert to ASCII
        engtext['full_text'] = engtext['full_text'].map(
            lambda x: unidecode(' '.join(
                word for word in x.split(' ')
                if not (word.startswith('@') or word.startswith('http'))
            ))
        )

        # One pipeline call per tweet; each cell holds whatever the pipeline
        # returns for that single text.
        engtext['sentiment_score'] = engtext['full_text'].map(lambda text: sentiment_task(text))

        # write to JSON Lines; path.name avoids depending on the path separator
        out_dir.mkdir(parents=True, exist_ok=True)
        jsonpath = out_dir / (path.name.split('.')[0] + '.jsonl')
        engtext.to_json(jsonpath, orient='records', lines=True)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


raw_data/2020-12/coronavirus-tweet-id-2020-12-31-11.jsonl.gz
raw_data/2021-01/coronavirus-tweet-id-2021-01-23-11.jsonl.gz


In [101]:
# Finding faulty CSVs
# A CSV is flagged when any of its `created_at` values does not start with the
# character '2', or when a value cannot be indexed at all (e.g. a missing value
# read back as a float).
OUTPUT_ROOTS = ['raw_roBERTa', 'spacy_textblob', 'raw_vader']

faulty_paths = []
for output_root in OUTPUT_ROOTS:
    for data_dir in data_dirs:
        for path in Path(output_root, data_dir).iterdir():
            # Skip names ending in 'Store' and the JSON Lines outputs.
            if path.name.endswith('Store') or path.name.endswith('jsonl'):
                continue
            df = pd.read_csv(path)
            created_values = df['created_at'].tolist()
            try:
                is_faulty = any(ele[0] != '2' for ele in created_values)
            except (TypeError, IndexError):
                # Non-string or empty value: treat the whole file as faulty.
                # Narrower than the former bare `except:`, which also swallowed
                # KeyboardInterrupt.
                is_faulty = True
            if is_faulty:
                faulty_paths.append(path)

In [102]:
# Show the files flagged by the scan above.
print(faulty_paths)

[PosixPath('raw_vader/2020-12/coronavirus-tweet-id-2020-12-31-11.csv'), PosixPath('raw_vader/2021-01/coronavirus-tweet-id-2021-01-23-11.csv')]


In [103]:
import os
# Delete every file that was flagged as faulty.
for faulty_path in faulty_paths:
    os.remove(faulty_path)