##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2022 Semester 1

## Assignment 2: Sentiment Classification of Tweets

This is a sample code to assist you with vectorising the 'Train' dataset for your assignment 2.

First we read the CSV datafiles (Train and Test).

In [43]:
import pandas as pd
import re
# nltk
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# train_data = pd.read_csv("Train.csv", sep=',')
# test_data = pd.read_csv("Test.csv", sep=',')

big_data = pd.read_csv("Train.csv", sep=',')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Quang\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Quang\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Preprocess

In [44]:
X_train_raw = [x[0] for x in big_data[['text']].values]
X_train_raw = [item.lower() for item in X_train_raw]
X_train_raw = [re.sub('((www.[^s]+)|(https?://[^s]+))',' ',item) for item in X_train_raw]
# X_train_raw = [re.sub('[0-9]+', "", item) for item in X_train_raw]
X_train_raw = [re.sub('[^a-z ]+', "", item) for item in X_train_raw]

# stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
#              'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
#              'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
#              'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
#              'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
#              'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
#              'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
#              'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
#              'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
#              't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
#              'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
#              'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
#              'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
#              'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
#              "youve", 'your', 'yours', 'yourself', 'yourselves']
# STOPWORDS = set(stopwordlist)
# X_train_raw = [" ".join([word for word in str(item).split() if word not in STOPWORDS]) for item in X_train_raw]

tokenizer = RegexpTokenizer('\w+')
X_train_raw = [tokenizer.tokenize(item) for item in X_train_raw]

ps = nltk.PorterStemmer()
def stem_string(arr):
    arr = [ps.stem(item) for item in arr]
    return arr
X_train_raw = [stem_string(item) for item in X_train_raw]

lm = nltk.WordNetLemmatizer()
def lemmatize_string(arr):
    arr = [lm.lemmatize(item) for item in arr]
    return arr

X_train_raw = [lemmatize_string(item) for item in X_train_raw]
X_train_raw = [' '.join(item) for item in X_train_raw]


Then we separate the tweet text and the label (sentiment). 

In [45]:
#separating instance and label for Train
# X = [x[0] for x in big_data[['text']].values]
y = [x[0] for x in big_data[['sentiment']].values]

#check the result
print("Train length:",len(X_train_raw))

X_train, X_test, y_train, y_test = train_test_split(X_train_raw,y,test_size = 0.05, random_state =26105111)

Train length: 21802


Preprocess

In [46]:
#Let's see one example tweet
print(X_train_raw[1])

is anybodi go to the radio station tomorrow to see shawn me and my friend may go but we would like to make new friendsmeet there


### 1. Bag of Words (BoW)
In this approach, we use the **CountVectorizer** library to separate all the words in the Train corpus (dataset). These words are then used as the 'vectors' or 'features' to represent each instance (Tweet) in `Train` and `Test` datasets. 

In [47]:
BoW_vectorizer = CountVectorizer()

#Build the feature set (vocabulary) and vectorise the Train dataset using BoW
X_train_BoW = BoW_vectorizer.fit_transform(X_train)

#Use the feature set (vocabulary) from Train to vectorise the Test dataset 
X_test_BoW = BoW_vectorizer.transform(X_test)

print("Train feature space size (using BoW):",X_train_BoW.shape)
print("Test feature space size (using BoW):",X_test_BoW.shape)

Train feature space size (using BoW): (20711, 30126)
Test feature space size (using BoW): (1091, 30126)


Now each row is a list of tuples with the vector_id (word_id in the vocabulary) and the number of times it repeated in that given instance (tweet).

In [48]:
#Let's see one example tweet using the BoW feature space
print(X_train_BoW[1])

  (0, 16557)	1
  (0, 10987)	1
  (0, 6668)	1
  (0, 5830)	1
  (0, 17630)	1
  (0, 26911)	1
  (0, 4838)	1
  (0, 1560)	1
  (0, 15194)	1
  (0, 29913)	1
  (0, 16366)	1


We can save the created vocabulary for the given dataset in a separate file.

In [49]:
output_dict = BoW_vectorizer.vocabulary_
output_pd = pd.DataFrame(list(output_dict.items()),columns = ['word','count'])

output_pd.T.to_csv('BoW-vocab.csv',index=False)

### 2. TFIDF
In this approach, we use the **TfidfVectorizer** library to separate all the words in this corpus (dataset). Same as the BoW approach, these words are then used as the 'vectors' or 'features' to represent each instance (Tweet).

However, in this method for each instance the value associated with each 'vector' (word) is not the number of times the word repeated in that tweet, but the TFIDF value of then 'voctor' (word).

In [50]:
tfidf_vectorizer = TfidfVectorizer()

#Build the feature set (vocabulary) and vectorise the Tarin dataset using TFIDF
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

#Use the feature set (vocabulary) from Train to vectorise the Test dataset 
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Train feature space size (using TFIDF):",X_train_BoW.shape)
print("Test feature space size (using TFIDF):",X_test_BoW.shape)


Train feature space size (using TFIDF): (20711, 30126)
Test feature space size (using TFIDF): (1091, 30126)


In [51]:
#Let's see one example tweet using the TFIDF feature space
print(X_train_tfidf[1])

  (0, 16366)	0.5860882228462222
  (0, 29913)	0.17589630634099077
  (0, 15194)	0.22839016001792783
  (0, 1560)	0.19575906631021145
  (0, 4838)	0.25894906703086046
  (0, 26911)	0.2860805860886641
  (0, 17630)	0.15248928110796536
  (0, 5830)	0.2217932059675435
  (0, 6668)	0.34429029424948393
  (0, 10987)	0.32772901731539383
  (0, 16557)	0.29629944920099127


Baseline model 0R

In [52]:
#Build 0R

#split training dataset into 3 class, possitive, negative and neutral

train_data = pd.read_csv("Train.csv", sep=',')
positive_set = train_data [(train_data ["sentiment"] == 'positive')]
neutral_set = train_data [(train_data ["sentiment"] == 'neutral')]
negative_set = train_data [(train_data ["sentiment"] == 'negative')]
print(train_data["sentiment"].value_counts())

#find the class with the most instance
max_size = len(positive_set)
max_dataset = positive_set
for data_set in [neutral_set, negative_set]:
    if len(data_set) > max_size:
        max_dataset = data_set
        max_size = len(data_set)

# the model will use the class with the most instance to classify all of the test data
chosen_class = max_dataset.iloc[0]["sentiment"]

# classify test set
test_data = pd.read_csv("Test.csv", sep=',')
test_data['sentiment'] = chosen_class
(test_data.drop(['text'], axis=1)).to_csv('base.csv', index=False)


neutral     12659
positive     5428
negative     3715
Name: sentiment, dtype: int64


Bernoulli Naive Bayes

In [53]:
from sklearn.naive_bayes import BernoulliNB

BNBmodel = BernoulliNB()
BNBmodel.fit(X_train_tfidf, y_train)
y_pred = BNBmodel.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.61      0.10      0.18       183
     neutral       0.60      0.92      0.73       609
    positive       0.69      0.29      0.41       299

    accuracy                           0.61      1091
   macro avg       0.63      0.44      0.44      1091
weighted avg       0.63      0.61      0.55      1091



SVM

In [54]:
from sklearn.svm import LinearSVC

SVMmodel = LinearSVC()
SVMmodel.fit(X_train_tfidf, y_train)
y_pred2 = SVMmodel.predict(X_test_tfidf)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

    negative       0.55      0.42      0.47       183
     neutral       0.66      0.78      0.72       609
    positive       0.64      0.49      0.55       299

    accuracy                           0.64      1091
   macro avg       0.62      0.56      0.58      1091
weighted avg       0.64      0.64      0.63      1091



Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier


random_forest_model=RandomForestClassifier(n_estimators=100)
random_forest_model.fit(X_train_tfidf, y_train)
y_pred3 = random_forest_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

    negative       0.50      0.04      0.08       183
     neutral       0.60      0.92      0.72       609
    positive       0.67      0.31      0.42       299

    accuracy                           0.60      1091
   macro avg       0.59      0.42      0.41      1091
weighted avg       0.60      0.60      0.53      1091



In [56]:
def model_Evaluate(model):
# Predict values for Test dataset
y_pred = model.predict(X_test)
# Print the evaluation metrics for the dataset.
print(classification_report(y_test, y_pred))
# Compute and plot the Confusion matrix
# cf_matrix = confusion_matrix(y_test, y_pred)
# categories = ['Negative','Positive']
# group_names = ['True Neg','False Pos', 'False Neg','True Pos']
# group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
# labels = [f'{v1}n{v2}' for v1, v2 in zip(group_names,group_percentages)]
# labels = np.asarray(labels).reshape(2,2)
# sns.heatmap(cf_matrix, annot = labels, cmap = 'Blues',fmt = '',
# xticklabels = categories, yticklabels = categories)
# plt.xlabel("Predicted values", fontdict = {'size':14}, labelpad = 10)
# plt.ylabel("Actual values" , fontdict = {'size':14}, labelpad = 10)
# plt.title ("Confusion Matrix", fontdict = {'size':18}, pad = 20)

IndentationError: expected an indented block (Temp/ipykernel_8812/635345592.py, line 3)