In [163]:
import numpy as np
import pandas as pd
import gzip
import json
from pathlib import Path

In [21]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shanm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [169]:
import re
import string
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from unidecode import unidecode
from textblob import TextBlob

In [166]:
nlp = spacy.load('en_core_web_lg')

def clean_string(text, stem="None"):

    final_string = ""

    # Make lower
    text = text.lower()

    # Remove line breaks
    text = re.sub(r'\n', '', text)

    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['rt']

    text_filtered = [word for word in text if not word in useless_words]

    # Remove numbers
    text_filtered = [w for w in text_filtered if not bool(re.search(r'\w*\d\w*', w))]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer() 
        text_stemmed = [stemmer.stem(y) for y in text_filtered]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in text_filtered]
    elif stem == 'Spacy':
        text_filtered = nlp(' '.join(text_filtered))
        text_stemmed = [y.lemma_ for y in text_filtered]
    else:
        text_stemmed = text_filtered

    final_string = ' '.join(text_stemmed)

    return final_string

In [177]:
data_dirs = ['2020-11', '2020-12', '2021-01', '2021-02', '2021-12', '2022-01']
pd.set_option('display.max_colwidth', None)

In [147]:
# Cleaning
for data_dir in data_dirs:
    for path in Path('COVID-19-TweetIDs/' + data_dir).iterdir():
        if path.name.endswith('.jsonl.gz'):
            with gzip.open(path) as f:
                daytweets = pd.read_json(f, lines=True)
                # keep English tweets
                eng =  daytweets.loc[daytweets['lang'] == 'en']
                
                # drop every column except full_text
                engtext = daytweets[['full_text']].copy()
                
                # remove accounts mentioned, remove link, convert to ASCII
                engtext['full_text'] = engtext['full_text'].\
                map(lambda x: unidecode(' '.join(word for word in x.split(' ') \
                if not (word.startswith('@') or word.startswith('http')))))
                
                # lowercase, remove line breaks, remove punctuation, remove stopwords, 
                engtext['full_text'] = engtext['full_text'].apply(lambda x: clean_string(x, stem='Spacy'))
                
                # write to CSV
                csvpath = 'cleaned_data' + '\\' + data_dir + '\\' + str(path).split('\\')[2].split('.')[0] + '.csv'
                print(csvpath)
                engtext.to_csv(csvpath)

cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-00.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-01.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-02.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-03.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-04.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-05.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-06.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-07.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-08.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-09.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-10.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-11.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-12.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-13.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-14.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-01-15.csv
cleaned_data\2021-01\coronavirus-tweet-i

cleaned_data\2021-01\coronavirus-tweet-id-2021-01-06-17.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-06-18.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-06-19.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-06-20.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-06-21.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-06-22.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-06-23.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-00.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-01.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-02.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-03.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-04.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-05.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-06.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-07.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-07-08.csv
cleaned_data\2021-01\coronavirus-tweet-i

cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-10.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-11.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-12.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-13.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-14.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-15.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-16.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-17.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-18.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-19.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-20.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-21.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-22.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-12-23.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-13-00.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-13-01.csv
cleaned_data\2021-01\coronavirus-tweet-i

cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-03.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-04.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-05.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-06.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-07.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-08.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-09.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-10.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-11.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-12.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-13.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-14.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-15.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-16.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-17.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-18-18.csv
cleaned_data\2021-01\coronavirus-tweet-i

cleaned_data\2021-01\coronavirus-tweet-id-2021-01-23-20.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-23-21.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-23-22.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-23-23.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-00.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-01.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-02.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-03.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-04.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-05.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-06.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-07.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-08.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-09.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-10.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-24-11.csv
cleaned_data\2021-01\coronavirus-tweet-i

cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-13.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-14.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-15.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-16.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-17.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-18.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-19.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-20.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-21.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-22.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-29-23.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-30-00.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-30-01.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-30-02.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-30-03.csv
cleaned_data\2021-01\coronavirus-tweet-id-2021-01-30-04.csv
cleaned_data\2021-01\coronavirus-tweet-i

In [183]:
# Scores for raw
for data_dir in data_dirs:
    for path in Path('COVID-19-TweetIDs/' + data_dir).iterdir():
        if path.name.endswith('.jsonl.gz'):
            with gzip.open(path) as f:
                daytweets = pd.read_json(f, lines=True)
                
                # keep English tweets
                eng =  daytweets.loc[daytweets['lang'] == 'en']
                
                # drop every column except full_text
                engtext = daytweets[['full_text']].copy()
                
                # remove accounts mentioned, remove link, convert to ASCII
                engtext['full_text'] = engtext['full_text'].\
                map(lambda x: unidecode(' '.join(word for word in x.split(' ') \
                if not (word.startswith('@') or word.startswith('http')))))
                
                # remove line breaks and add sentiment score
                engtext['full_text'] = engtext.apply(lambda row: re.sub(r'\n', '', row['full_text']), axis = 1)
                engtext['sentiment_score'] = engtext.apply(lambda row: TextBlob(row['full_text']).sentiment, axis = 1)
                
                # write to CSV
                csvpath = 'raw_data_with_score' + '\\' + data_dir + '\\' + str(path).split('\\')[2].split('.')[0] + '.csv'
                print(csvpath)
                engtext.to_csv(csvpath)        
                break

KeyError: 'full_text'