In [1]:
import numpy as np
import pandas as pd
import gzip
import json
from pathlib import Path

In [2]:
import nltk
# Fetch the NLTK stop-word corpus; clean_string() reads it through
# nltk.corpus.stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shanmukh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import re
import string
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from unidecode import unidecode
from textblob import TextBlob

In [4]:
# Load the spaCy pipeline once and share it; clean_string(stem='Spacy')
# uses this object to lemmatise the filtered tokens.
nlp = spacy.load('en_core_web_lg')

# Lazily filled cache so the NLTK stop-word list is only read once per kernel.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English NLTK stop words plus 'rt' as a cached frozenset."""
    if 'english' not in _STOPWORD_CACHE:
        words = nltk.corpus.stopwords.words("english") + ['rt']
        _STOPWORD_CACHE['english'] = frozenset(words)
    return _STOPWORD_CACHE['english']


def clean_string(text, stem="None"):
    """Normalise a piece of text for downstream sentiment scoring.

    Steps, in order: lower-case, split on whitespace (line breaks included),
    drop stop words, strip punctuation, drop any token containing a digit,
    then optionally stem or lemmatise.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stem : str, optional
        'Stem'  -> PorterStemmer,
        'Lem'   -> WordNetLemmatizer,
        'Spacy' -> lemmas from the module-level ``nlp`` pipeline.
        Any other value (including the default "None") leaves the filtered
        tokens unchanged.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    stop_words = _english_stopwords()
    strip_punct = str.maketrans('', '', string.punctuation)

    # str.split() treats "\n" like any other whitespace, so words on adjacent
    # lines stay separate instead of being fused by deleting the line break.
    tokens = text.lower().split()

    # First stop-word pass on the raw tokens (edge punctuation ignored) so that
    # contractions such as "don't" still match the stop list; once the
    # apostrophe is stripped they would become "dont" and slip through.
    tokens = [tok for tok in tokens
              if tok.strip(string.punctuation) not in stop_words]

    # Remove punctuation inside each token.
    tokens = [tok.translate(strip_punct) for tok in tokens]

    # Second pass: drop tokens that were pure punctuation, tokens that only
    # became stop words after punctuation removal, and tokens with a digit.
    tokens = [tok for tok in tokens
              if tok and tok not in stop_words and not re.search(r'\d', tok)]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        processed = [stemmer.stem(tok) for tok in tokens]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        processed = [lem.lemmatize(tok) for tok in tokens]
    elif stem == 'Spacy':
        doc = nlp(' '.join(tokens))
        processed = [tok.lemma_ for tok in doc]
    else:
        processed = tokens

    return ' '.join(processed)

In [5]:
# Month folders under raw_data/ that the processing cells iterate over.
data_dirs = [
    '2020-11',
    '2020-12',
    '2021-01',
    '2021-02',
    '2021-12',
    '2022-01',
]

# Do not truncate long cell values when a frame is displayed.
pd.options.display.max_colwidth = None

In [7]:
# Cleaning with Spacy and adding score using TextBlob.
# For every raw_data/<month>/*.jsonl.gz file, write
# spacy_textblob/<month>/<file stem>.csv with the cleaned English tweets
# and their TextBlob polarity.
for data_dir in data_dirs:
    out_dir = Path('spacy_textblob') / data_dir
    for path in Path('raw_data/' + data_dir).iterdir():
        if not path.name.endswith('.jsonl.gz'):
            continue

        # Keep the archive open only while it is being parsed.
        with gzip.open(path) as f:
            daytweets = pd.read_json(f, lines=True)

        # Keep English tweets and only the created_at / full_text columns.
        # (Selecting from the unfiltered frame here would silently keep
        # every language.)
        is_english = daytweets['lang'] == 'en'
        engtext = daytweets.loc[is_english, ['created_at', 'full_text']].copy()

        # remove accounts mentioned, remove link, convert to ASCII
        engtext['full_text'] = engtext['full_text'].map(
            lambda x: unidecode(' '.join(
                word for word in x.split(' ')
                if not (word.startswith('@') or word.startswith('http')))))

        # lowercase, remove line breaks, remove punctuation, remove stopwords,
        # remove tokens containing digits, lemmatise with spaCy
        engtext['full_text'] = engtext['full_text'].map(
            lambda x: clean_string(x, stem='Spacy'))

        # Column-wise map rather than a row-wise DataFrame.apply: same values,
        # and it also works when no English tweets are left.
        engtext['sentiment_score'] = engtext['full_text'].map(
            lambda x: TextBlob(str(x)).sentiment.polarity)

        # write to CSV; take the name from path.name so it does not depend
        # on how many directories precede the file or on the path separator
        out_dir.mkdir(parents=True, exist_ok=True)
        csvpath = out_dir / (path.name.split('.')[0] + '.csv')
        engtext.to_csv(csvpath)

In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [21]:
vaderanalyzer = SentimentIntensityAnalyzer()
# Not cleaning and adding score using Vader.
# For every raw_data/<month>/*.jsonl.gz file, write
# raw_vader/<month>/<file stem>.csv with the English tweets (mentions and
# links removed, converted to ASCII, otherwise untouched) and their VADER scores.
for data_dir in data_dirs:
    out_dir = Path('raw_vader') / data_dir
    for path in Path('raw_data/' + data_dir).iterdir():
        if not path.name.endswith('.jsonl.gz'):
            continue

        # Keep the archive open only while it is being parsed.
        with gzip.open(path) as f:
            daytweets = pd.read_json(f, lines=True)

        # Keep English tweets and only the created_at / full_text columns.
        # (Selecting from the unfiltered frame here would silently keep
        # every language.)
        is_english = daytweets['lang'] == 'en'
        engtext = daytweets.loc[is_english, ['created_at', 'full_text']].copy()

        # remove accounts mentioned, remove link, convert to ASCII
        engtext['full_text'] = engtext['full_text'].map(
            lambda x: unidecode(' '.join(
                word for word in x.split(' ')
                if not (word.startswith('@') or word.startswith('http')))))

        # NOTE: polarity_scores returns a dict of scores, so this column holds
        # one dict per tweet and is written to the CSV as its string form.
        # Kept as-is so existing readers of raw_vader/ keep working.
        engtext['sentiment_score'] = engtext['full_text'].map(
            lambda x: vaderanalyzer.polarity_scores(x))

        # write to CSV; take the name from path.name so it does not depend
        # on how many directories precede the file or on the path separator
        out_dir.mkdir(parents=True, exist_ok=True)
        csvpath = out_dir / (path.name.split('.')[0] + '.csv')
        engtext.to_csv(csvpath)