##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2022 Semester 1

## Assignment 2: Sentiment Classification of Tweets

This is a sample code to assist you with vectorising the 'Train' dataset for your assignment 2.

First we read the CSV datafiles (Train and Test).

In [3]:
import pandas as pd
# nltk
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

train_data = pd.read_csv("Train.csv", sep=',')
test_data = pd.read_csv("Test.csv", sep=',')

AttributeError: partially initialized module 'nltk' has no attribute 'internals' (most likely due to a circular import)

Then we separate the tweet text and the label (sentiment). 

In [None]:
#separating instance and label for Train
X_train_raw = [x[0] for x in train_data[['text']].values]
Y_train = [x[0] for x in train_data[['sentiment']].values]

#check the result
print("Train length:",len(X_train_raw))

#separating instance and label for Test
X_test_raw = [x[0] for x in test_data[['text']].values]

#separating id for Test
X_test_id = [x[0] for x in test_data[['id']].values]

#check the result
print("Test length:",len(X_test_raw))

Train length: 21802
Test length: 6099


Preprocess

In [None]:
X_train_raw = [x[0] for x in train_data[['text']].values]
X_train_raw = [item.lower() for item in X_train_raw]
X_train_raw = [re.sub('((www.[^s]+)|(https?://[^s]+))',' ',item) for item in X_train_raw]
X_train_raw = [re.sub('[^a-z ]+', "", item) for item in X_train_raw]

stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']
STOPWORDS = set(stopwordlist)
X_train_raw = [" ".join([word for word in str(item).split() if word not in STOPWORDS]) for item in X_train_raw]

tokenizer = RegexpTokenizer('\w+')
X_train_raw = [tokenizer.tokenize(item) for item in X_train_raw]

ps = nltk.PorterStemmer()
def stem_string(arr):
    arr = [ps.stem(item) for item in arr]
    return arr
X_train_raw = [stem_string(item) for item in X_train_raw]

lm = nltk.WordNetLemmatizer()
def lemmatize_string(arr):
    arr = [lm.lemmatize(item) for item in arr]
    return arr

X_train_raw = [lemmatize_string(item) for item in X_train_raw]
X_train_raw = [' '.join(item) for item in X_train_raw]

In [None]:
#Let's see one example tweet
print(X_train_raw[1])

anybodi go radio station tomorrow see shawn friend may go but would like make new friendsmeet


### 1. Bag of Words (BoW)
In this approach, we use the **CountVectorizer** library to separate all the words in the Train corpus (dataset). These words are then used as the 'vectors' or 'features' to represent each instance (Tweet) in `Train` and `Test` datasets. 

In [None]:
BoW_vectorizer = CountVectorizer()

#Build the feature set (vocabulary) and vectorise the Train dataset using BoW
X_train_BoW = BoW_vectorizer.fit_transform(X_train_raw)

#Use the feature set (vocabulary) from Train to vectorise the Test dataset 
X_test_BoW = BoW_vectorizer.transform(X_test_raw)

print("Train feature space size (using BoW):",X_train_BoW.shape)
print("Test feature space size (using BoW):",X_test_BoW.shape)

Train feature space size (using BoW): (21802, 31181)
Test feature space size (using BoW): (6099, 31181)


Now each row is a list of tuples with the vector_id (word_id in the vocabulary) and the number of times it repeated in that given instance (tweet).

In [None]:
#Let's see one example tweet using the BoW feature space
print(X_train_BoW[1])

  (0, 1205)	1
  (0, 9908)	2
  (0, 20340)	1
  (0, 25329)	1
  (0, 28127)	1
  (0, 22624)	1
  (0, 23146)	1
  (0, 9220)	1
  (0, 15623)	1
  (0, 3650)	1
  (0, 30586)	1
  (0, 14538)	1
  (0, 15214)	1
  (0, 17371)	1
  (0, 9226)	1


We can save the created vocabulary for the given dataset in a separate file.

In [None]:
output_dict = BoW_vectorizer.vocabulary_
output_pd = pd.DataFrame(list(output_dict.items()),columns = ['word','count'])

output_pd.T.to_csv('BoW-vocab.csv',index=False)

### 2. TFIDF
In this approach, we use the **TfidfVectorizer** library to separate all the words in this corpus (dataset). Same as the BoW approach, these words are then used as the 'vectors' or 'features' to represent each instance (Tweet).

However, in this method for each instance the value associated with each 'vector' (word) is not the number of times the word repeated in that tweet, but the TFIDF value of then 'voctor' (word).

In [None]:
tfidf_vectorizer = TfidfVectorizer()

#Build the feature set (vocabulary) and vectorise the Tarin dataset using TFIDF
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw)

#Use the feature set (vocabulary) from Train to vectorise the Test dataset 
X_test_tfidf = tfidf_vectorizer.transform(X_test_raw)

print("Train feature space size (using TFIDF):",X_train_BoW.shape)
print("Test feature space size (using TFIDF):",X_test_BoW.shape)


Train feature space size (using TFIDF): (21802, 31181)
Test feature space size (using TFIDF): (6099, 31181)


In [None]:
#Let's see one example tweet using the TFIDF feature space
print(X_train_tfidf[1])

  (0, 9226)	0.4471814697109317
  (0, 17371)	0.19289699614243974
  (0, 15214)	0.1942236816102367
  (0, 14538)	0.17174454464915798
  (0, 30586)	0.2091585381468773
  (0, 3650)	0.15495862301487676
  (0, 15623)	0.1407707994292212
  (0, 9220)	0.25553461961314927
  (0, 23146)	0.24986718607333058
  (0, 22624)	0.17483383447481923
  (0, 28127)	0.1487715003874696
  (0, 25329)	0.34940836675605325
  (0, 20340)	0.3110076520928755
  (0, 9908)	0.3220517114870673
  (0, 1205)	0.32413564324231986


Baseline model 0R

In [None]:
#Build 0R

#split training dataset into 3 class, possitive, negative and neutral

train_data = pd.read_csv("Train.csv", sep=',')
positive_set = train_data [(train_data ["sentiment"] == 'positive')]
neutral_set = train_data [(train_data ["sentiment"] == 'neutral')]
negative_set = train_data [(train_data ["sentiment"] == 'negative')]
print(train_data["sentiment"].value_counts())

#find the class with the most instance
max_size = len(positive_set)
max_dataset = positive_set
for data_set in [neutral_set, negative_set]:
    if len(data_set) > max_size:
        max_dataset = data_set
        max_size = len(data_set)

# the model will use the class with the most instance to classify all of the test data
chosen_class = max_dataset.iloc[0]["sentiment"]

# classify test set
test_data = pd.read_csv("Test.csv", sep=',')
test_data['sentiment'] = chosen_class
(test_data.drop(['text'], axis=1)).to_csv('base.csv', index=False)


neutral     12659
positive     5428
negative     3715
Name: sentiment, dtype: int64


Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

BNBmodel = BernoulliNB()
BNBmodel.fit(X_train_tfidf, Y_train)
y_pred = BNBmodel.predict(X_test_tfidf)

data = {'id':X_test_id,'text':X_test_raw,'sentiment':y_pred}
predictions = pd.DataFrame(data=data)
(predictions.drop(['text'], axis=1)).to_csv('bnbmodel.csv', index=False)

SVM

In [None]:
from sklearn.svm import LinearSVC

SVMmodel = LinearSVC()
SVMmodel.fit(X_train_tfidf, Y_train)
y_pred2 = SVMmodel.predict(X_test_tfidf)

data = {'id':X_test_id,'text':X_test_raw,'sentiment':y_pred2}
predictions = pd.DataFrame(data=data)
(predictions.drop(['text'], axis=1)).to_csv('svmmodel.csv', index=False)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier


random_forest_model=RandomForestClassifier(n_estimators=100)
random_forest_model.fit(X_train_tfidf, Y_train)
y_pred3 = random_forest_model.predict(X_test_tfidf)

data = {'id':X_test_id,'text':X_test_raw,'sentiment':y_pred3}
predictions = pd.DataFrame(data=data)
(predictions.drop(['text'], axis=1)).to_csv('rfmodel.csv', index=False)