In [1]:
#!pip install gensim --quiet

import pandas as pd
import gensim
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

import nltk
# Fetch the NLTK data packages requested by this notebook: 'punkt', 'stopwords' and 'wordnet'.
# The value displayed by the cell is the return value of the last download call.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rhais\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rhais\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rhais\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the two CSV files shipped with the dataset.
# The training file is decoded as latin1, while the test file is read with
# pandas' default encoding.
# NOTE(review): confirm that the two files really use different encodings.
train = pd.read_csv('Corona_NLP_train.csv',encoding = 'latin1')
test = pd.read_csv('Corona_NLP_test.csv')

In [3]:
# Preview the first rows of the training data
train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
# Preview the first rows of the test data
test.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


I want to perform my own train-test split, so I will concatenate these two dataframes now and split the combined data later.

In [5]:
# Stack the two frames into one dataset. ignore_index=True gives the result a
# fresh 0..n-1 index; without it the row labels of `train` and `test` are both
# kept, so the same label would appear twice and label-based lookups or
# index-aligned operations on the combined frame would be ambiguous.
twitter = pd.concat([train, test], ignore_index=True)
twitter.shape

(44955, 6)

The only columns I am concerned with for now are "OriginalTweet" and "Sentiment", so I will check only those two columns for missing values.

In [6]:
# Print the fraction of missing values in each column used by the model
columns_of_interest = ('OriginalTweet', 'Sentiment')
for col in columns_of_interest:
    missing_fraction = twitter[col].isna().mean()
    print(missing_fraction)


0.0
0.0


There are no nulls in these columns. Now, I split the dataframe into features and labels

In [7]:
# Features: the raw tweet text. Labels: the sentiment category of each tweet.
X, y = twitter['OriginalTweet'], twitter['Sentiment']

In [8]:
# Number of tweets per sentiment label
y.value_counts()

Positive              12369
Negative              10958
Neutral                8332
Extremely Positive     7223
Extremely Negative     6073
Name: Sentiment, dtype: int64

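At this point, I have a Series of raw tweet texts and a matching Series of sentiment labels. I need to clean each tweet: tokenize it into words, lowercase them, drop stopwords and non-alphabetic tokens, lemmatize what remains, and join the result back into a single string per tweet.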

In [9]:
def preprocess(docs):
    '''Clean a collection of raw text documents.

    Each document is word-tokenized. Tokens that are not purely alphabetic,
    or whose lowercase form is an English stopword, are discarded. The
    remaining tokens are lowercased and lemmatized with WordNetLemmatizer.

    Parameters
    ----------
    docs : iterable of str
        The raw documents, one string per document.

    Returns
    -------
    list of str
        One entry per input document, in input order: the surviving lemmas
        joined by single spaces (an empty string if no token survives).
    '''

    lemmatizer = WordNetLemmatizer()
    # Load the stopword list once, as a set. Looking it up inside the loop
    # re-evaluates stopwords.words('english') for every single token and then
    # scans the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    preprocessed = []

    for doc in docs:
        cleaned = []
        for word in word_tokenize(doc):
            # Skip numbers, punctuation and any other non-alphabetic token
            if not word.isalpha():
                continue
            lowered = word.lower()
            if lowered in english_stopwords:
                continue
            cleaned.append(lemmatizer.lemmatize(lowered))
        # Re-join so the result can be fed directly to a text vectorizer
        preprocessed.append(' '.join(cleaned))

    return preprocessed

In [10]:
# This step can take quite a bit of time: every tweet in X is passed through preprocess().
# `processed` holds one cleaned string per tweet.
processed = preprocess(X)

Now, we can convert the cleaned tweets into vectors. I will use TfidfVectorizer.

In [11]:
vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=2, use_idf=True, norm=u'l2', smooth_idf=True)


# Applying the vectorizer
# Note: the vocabulary and IDF weights are fitted on every cleaned tweet,
# i.e. before the train/test split that happens in a later cell.
vectors = vectorizer.fit_transform(processed)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on installations that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# toarray() materialises the full dense (documents x terms) matrix in memory.
tfidf_df = pd.DataFrame(vectors.toarray(), columns=feature_names)


Now that the data are cleaned, I will divide the dataset into training and test groups

In [None]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the
# sentiment class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_df,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

In [None]:
# A small, shallow forest: 20 trees of depth at most 10.
# random_state is fixed (same seed as the train/test split) so that the fitted
# model, and therefore every metric reported below, is reproducible run to run.
rf = RandomForestClassifier(n_estimators = 20, max_depth = 10, random_state = 21)

rf.fit(X_train, y_train)

In [None]:
# Predict a sentiment label for every held-out tweet
y_pred = rf.predict(X_test)

# Per-class precision / recall / F1 on the held-out set
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Pin the class order explicitly so that the rows and columns of the matrix are
# guaranteed to line up with the captions below, rather than relying on the
# order confusion_matrix infers from the values present in y_test / y_pred.
sentiment_labels = ["Extremely Negative", "Extremely Positive", "Negative", "Neutral", "Positive"]
short_names = ["Ex. Neg.", "Ex. Pos.", "Neg.", "Neut.", "Pos."]

confusion_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=sentiment_labels),
    index=["True " + name for name in short_names],
    columns=["Predicted " + name for name in short_names],
)

confusion_df

The model was good at predicting extreme and neutral sentiments, but struggled with positive and negative.

If I had more computing power, I would also investigate a GradientBoostingClassifier. However, this model proved to be too much for my CPU.