In [1]:
import sklearn
from sklearn.decomposition import NMF
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/spencersheen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stopwords_set = set(stopwords.words('english'))

## 1.1 Generate Data Matrix (BOW)

In [4]:
train_df = pd.read_csv("./Corona_NLP_train.csv", 
                       encoding = "ISO-8859-1")
test_df  = pd.read_csv("./Corona_NLP_test.csv", 
                       encoding = "ISO-8859-1")

ps = PorterStemmer()

def clean_string(ss):
    ss = re.sub(r'https?://\S+', 'url', ss)
    ss = re.sub(r'#', ' # ', ss)
    ss = re.sub(r'\s+', ' ', ss)
    ss = re.sub(r'[!"\$%&\(\)\*\+,\.\/:;<=>\?@^_`{\|}~]', '', ss)
    
    ss_new = ""
    for word in ss.split(" "):
        word = word.lower()
        word = ps.stem(word)
        if word not in stopwords_set:
            ss_new = ss_new + " " + word
    
    return ss_new.strip()
#     return ss

train_df["ProcessedTweet"] = train_df["OriginalTweet"].apply(clean_string)
test_df["ProcessedTweet"]  = test_df["OriginalTweet"].apply(clean_string)

In [5]:
train_df["ProcessedTweet"][2]

'coronaviru australia woolworth give elderli disabl dedic shop hour amid covid-19 outbreak url'

In [6]:
count = 0
word_dict = {}
for tweet in train_df["ProcessedTweet"]:
    for word in tweet.split(" "):
        word = word.lower()
        #stem
        #remove stop words
        if word not in word_dict:
            word_dict[word] = count
            count = count+1
count

54756

In [7]:
A = np.zeros((len(train_df["ProcessedTweet"]), count))
for i in range(0, len(train_df["ProcessedTweet"])):
    tweet_list = train_df["ProcessedTweet"][i].split(" ")
    for j in range(0, len(tweet_list)):
        word = tweet_list[j].lower()
        if word in word_dict:
            A[i][word_dict[word]] = A[i][word_dict[word]]+1

In [8]:
b = np.zeros(len(train_df["Sentiment"]))
for i in range(0, len(train_df["Sentiment"])):
    if train_df["Sentiment"][i] == "Extremely Negative":
        b[i] = 1
    elif train_df["Sentiment"][i] == "Negative":
        b[i] = 2
    elif train_df["Sentiment"][i] == "Neutral":
        b[i] = 3
    elif train_df["Sentiment"][i] == "Positive":
        b[i] = 4
    elif train_df["Sentiment"][i] == "Extremely Positive":
        b[i] = 5
    else:
        print("Something wrong happened")

## 1.2 Baseline (BOW) 

In [9]:
from sklearn.linear_model import LogisticRegression
from scipy import sparse

In [10]:
A_sparse = sparse.csr_matrix(A)

In [11]:
clf = LogisticRegression(random_state=0).fit(A_sparse, b)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [12]:
clf.score(A_sparse, b)

0.8014189566780864

In [13]:
clf.predict(A_sparse[1])

array([4.])

In [14]:
A_test = np.zeros((len(test_df["ProcessedTweet"]), count))
for i in range(0, len(test_df["ProcessedTweet"])):
    tweet_list = test_df["ProcessedTweet"][i].split(" ")
    for j in range(0, len(tweet_list)):
        word = tweet_list[j].lower()
        if word in word_dict:
            A_test[i][word_dict[word]] = A_test[i][word_dict[word]]+1

In [15]:
b_test = np.zeros(len(test_df["Sentiment"]))
for i in range(0, len(test_df["Sentiment"])):
    if test_df["Sentiment"][i] == "Extremely Negative":
        b_test[i] = 1
    elif test_df["Sentiment"][i] == "Negative":
        b_test[i] = 2
    elif test_df["Sentiment"][i] == "Neutral":
        b_test[i] = 3
    elif test_df["Sentiment"][i] == "Positive":
        b_test[i] = 4
    elif test_df["Sentiment"][i] == "Extremely Positive":
        b_test[i] = 5
    else:
        print("Something wrong happened")

In [16]:
A_sparse_test = sparse.csr_matrix(A_test)

In [17]:
clf.score(A_sparse_test, b_test)

0.5903106898367562

In [18]:
b_pred = clf.predict(A_sparse_test)
print("macro: " + str(f1_score(b_test, b_pred, average='macro')))
print("micro: " + str(f1_score(b_test, b_pred, average='micro')))
print("weighted: " + str(f1_score(b_test, b_pred, average='weighted')))

macro: 0.601081561824525
micro: 0.5903106898367562
weighted: 0.5898560616180559


## 1.3 NNMF (BOW) 

In [19]:
model = NMF(n_components=50, init='random', random_state=0)
W = model.fit_transform(A_sparse)
H = model.components_

In [20]:
clf_nnmf = LogisticRegression(random_state=0, verbose=True).fit(W, b)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s finished


In [21]:
clf_nnmf.score(W, b)

0.35857812765750663

In [22]:
# model = NMF(n_components=50, init='random', random_state=0)
W = model.transform(A_sparse_test)
H = model.components_

In [23]:
clf_nnmf.score(W, b_test)

0.33754607688256977

In [24]:
b_pred = clf_nnmf.predict(W)
print("macro: " + str(f1_score(b_test, b_pred, average='macro')))
print("micro: " + str(f1_score(b_test, b_pred, average='micro')))
print("weighted: " + str(f1_score(b_test, b_pred, average='weighted')))

macro: 0.3036834091670501
micro: 0.33754607688256977
weighted: 0.31583480844771783


## 1.4 SVD (BOW) 

In [25]:
from sklearn.decomposition import TruncatedSVD

In [27]:
model = TruncatedSVD(n_components=50, random_state=0)
A_SVD = model.fit_transform(A_sparse)

In [28]:
clf_svd = LogisticRegression(random_state=0, verbose=True).fit(A_SVD, b)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s finished


In [29]:
clf_svd.score(A_SVD, b)

0.3847705129139636

In [30]:
A_SVD_test = model.transform(A_sparse_test)

In [31]:
clf_svd.score(A_SVD_test, b_test)

0.36545550289626116

In [33]:
b_pred = clf_svd.predict(A_SVD_test)
print("macro: " + str(f1_score(b_test, b_pred, average='macro')))
print("micro: " + str(f1_score(b_test, b_pred, average='micro')))
print("weighted: " + str(f1_score(b_test, b_pred, average='weighted')))

macro: 0.35887639395353
micro: 0.36545550289626116
weighted: 0.3567150814815877


## 2.1 Creating Data Matrix (TFIDF) 

In [34]:
from sklearn.linear_model import LinearRegression

In [35]:

vectorizer = TfidfVectorizer(min_df=3,
    max_df=0.85,
    max_features=5000,
    ngram_range=(1, 2))
X = vectorizer.fit_transform(train_df["ProcessedTweet"])
X.shape

(41157, 5000)

## 2.2 Baseline (TFIDF) 

In [36]:
clf = LogisticRegression(random_state=0, verbose=True).fit(X, b)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.7s finished


In [37]:
clf.score(X, b)

0.7061739193818791

In [38]:
X_test = vectorizer.transform(test_df["ProcessedTweet"])

In [39]:
clf.score(X_test, b_test)

0.5468667719852554

In [40]:
b_pred = clf.predict(X_test)
print("macro: " + str(f1_score(b_test, b_pred, average='macro')))
print("micro: " + str(f1_score(b_test, b_pred, average='micro')))
print("weighted: " + str(f1_score(b_test, b_pred, average='weighted')))

macro: 0.5567301761861693
micro: 0.5468667719852554
weighted: 0.5474497438393306


## 2.3 NNMF (TFIDF) 

In [41]:
model = NMF(n_components=50, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [42]:
X_simp = W.dot(H)

In [43]:
clf_nnmf = LogisticRegression(random_state=0, verbose=True).fit(X_simp, b)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.3s finished


In [44]:
clf_nnmf.score(X_simp, b)

0.37784580994727507

In [45]:
W_test = model.transform(X_test)
H_test = model.components_
X_simp_test = W_test.dot(H_test)

In [46]:
clf_nnmf.score(X_simp_test, b_test)

0.35018430753027907

In [47]:
b_pred = clf_nnmf.predict(X_simp_test)
print("macro: " + str(f1_score(b_test, b_pred, average='macro')))
print("micro: " + str(f1_score(b_test, b_pred, average='micro')))
print("weighted: " + str(f1_score(b_test, b_pred, average='weighted')))

macro: 0.3340356646014494
micro: 0.35018430753027907
weighted: 0.33995099242758303


## 2.4 SVD (TFIDF) 

In [48]:
model = TruncatedSVD(n_components=50, random_state=0)
X_SVD = model.fit_transform(X)

In [49]:
clf_svd = LogisticRegression(random_state=0, verbose=True).fit(X_SVD, b)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s finished


In [50]:
clf_svd.score(X_SVD, b)

0.3986442160507326

In [51]:
X_SVD_test = model.transform(X_test)

In [52]:
clf_svd.score(X_SVD_test, b_test)

0.3762506582411796

In [53]:
b_pred = clf_svd.predict(X_SVD_test)
print("macro: " + str(f1_score(b_test, b_pred, average='macro')))
print("micro: " + str(f1_score(b_test, b_pred, average='micro')))
print("weighted: " + str(f1_score(b_test, b_pred, average='weighted')))

macro: 0.3698564115229537
micro: 0.3762506582411796
weighted: 0.3694722599588555
