## Import Libraries

In [16]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import time
import string
import warnings
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

### 1- clean_covid19au

## Loading dataset

In [17]:
tweets_df2=pd.read_csv("clean_covid19au.csv")
tweets_df2.head(3)

Unnamed: 0.1,Unnamed: 0,date,tweets,tidy_tweets,absolute_tidy_tweets,sentiment
0,0,2020-10-30 02:49:20+00:00,Talking all things Queensland borders at today...,Talking all things Queensland borders at today...,Talking thing Queensland border today presser,neutral
1,1,2020-10-29 03:12:56+00:00,All hand sanitiser sold in Australia should co...,All hand sanitiser sold in Australia should co...,All hand sanitiser sold Australia contain enou...,pos
2,2,2020-10-29 00:47:25+00:00,Saturday night a blokes night will occur as ou...,Saturday night a blokes night will occur as ou...,Saturday night bloke night occur wife go annua...,pos


## <a id='5'>5. Feature Extraction</a>

We need to convert textual representation in the form on numeric features. We have a popular techniques to perform feature extraction:


1. __TF-IDF (Term Frequency - Inverse Document Frequency)__





### <a id='5A'>A. Feature Extraction for 'Key Words'</a>

In [18]:
# TF-IDF features
tfidf_word_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
# TF-IDF feature matrix
tfidf_word_feature = tfidf_word_vectorizer.fit_transform(tweets_df2['absolute_tidy_tweets'].astype('U'))

## <a id='6'>6. Model Building: Sentiment Analysis</a>

#### Map target variables to  {0, 1}

In [19]:
target_variable = tweets_df2['sentiment'].apply(lambda x: -1 if x=='neg' else (1 if x=="pos" else 0))

In [20]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_word_feature, target_variable, test_size=0.3, random_state=272)


## Training a Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
R_model = randomforest.fit(X_train,y_train)
R_predict=R_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [23]:
# confusion matrix


matrix = confusion_matrix(y_test,R_predict, labels=[1, 0,-1])
print("Confusion matrix")
print(matrix)




Confusion matrix
[[274  81  34]
 [ 50 179  12]
 [ 87  50 123]]


In [24]:


# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,R_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,R_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.67      0.70      0.68       389
           0       0.58      0.74      0.65       241
          -1       0.73      0.47      0.57       260

    accuracy                           0.65       890
   macro avg       0.66      0.64      0.64       890
weighted avg       0.66      0.65      0.64       890



The accuracy of the model is =  0.647191011235955


## Training a Logistic Binary Classifier

In [25]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression object
logistic_regression = LogisticRegression(multi_class='multinomial', solver='newton-cg',random_state=0)
# Train model
L_model = logistic_regression.fit(X_train,y_train)
L_predict=L_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [26]:
# confusion matrix
matrix = confusion_matrix(y_test,L_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[338  12  39]
 [134  70  37]
 [123   8 129]]


In [27]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,L_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,L_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.57      0.87      0.69       389
           0       0.78      0.29      0.42       241
          -1       0.63      0.50      0.55       260

    accuracy                           0.60       890
   macro avg       0.66      0.55      0.55       890
weighted avg       0.64      0.60      0.58       890



The accuracy of the model is =  0.6033707865168539


## Training a SVM classifier

In [28]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
clf.fit(X_train, y_train)
svm_predict=clf.predict(X_test)



### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [29]:
# confusion matrix
matrix = confusion_matrix(y_test,svm_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[357  21  11]
 [145  89   7]
 [216   5  39]]


In [30]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,svm_predict,labels=[1,0])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,svm_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.50      0.92      0.64       389
           0       0.77      0.37      0.50       241

   micro avg       0.54      0.71      0.61       630
   macro avg       0.64      0.64      0.57       630
weighted avg       0.60      0.71      0.59       630



The accuracy of the model is =  0.5449438202247191


### 2- clean_lockdown_senti

## Loading dataset

In [31]:
tweets_df2=pd.read_csv("clean_lockdown_senti.csv")
tweets_df2.head(3)

Unnamed: 0.1,Unnamed: 0,date,tweets,tidy_tweets,absolute_tidy_tweets,sentiment
0,0,2020-10-30 23:56:05+00:00,Lockdown works in the short term. But is it ul...,Lockdown works in the short term. But is it ul...,Lockdown work short term But ultimately best s...,pos
1,1,2020-10-30 23:48:39+00:00,Just to finish off Inktober2020 with a smile. ...,Just to finish off Inktober2020 with a smile. ...,Just finish Inktober smile Thank reacted comme...,pos
2,2,2020-10-30 23:32:28+00:00,Lmao Melb defs going into a 3rd and 4th lockdo...,Lmao Melb defs going into a 3rd and 4th lockdo...,Lmao Melb defs going rd th lockdown People ful...,pos


## <a id='5'>5. Feature Extraction</a>

We need to convert textual representation in the form on numeric features. We have a popular techniques to perform feature extraction:


1. __TF-IDF (Term Frequency - Inverse Document Frequency)__





### <a id='5A'>A. Feature Extraction for 'Key Words'</a>

In [32]:
# TF-IDF features
tfidf_word_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
# TF-IDF feature matrix
tfidf_word_feature = tfidf_word_vectorizer.fit_transform(tweets_df2['absolute_tidy_tweets'].astype('U'))

## <a id='6'>6. Model Building: Sentiment Analysis</a>

#### Map target variables to  {0, 1}

In [33]:
target_variable = tweets_df2['sentiment'].apply(lambda x: -1 if x=='neg' else (1 if x=="pos" else 0))

In [34]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_word_feature, target_variable, test_size=0.3, random_state=272)


## Training a Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
R_model = randomforest.fit(X_train,y_train)
R_predict=R_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [37]:
# confusion matrix


matrix = confusion_matrix(y_test,R_predict, labels=[1, 0,-1])
print("Confusion matrix")
print(matrix)




Confusion matrix
[[1534  232  144]
 [ 175  799   64]
 [ 422  186  498]]


In [38]:


# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,R_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,R_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.72      0.80      0.76      1910
           0       0.66      0.77      0.71      1038
          -1       0.71      0.45      0.55      1106

    accuracy                           0.70      4054
   macro avg       0.69      0.67      0.67      4054
weighted avg       0.70      0.70      0.69      4054



The accuracy of the model is =  0.698322644301924


## Training a Logistic Binary Classifier

In [39]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression object
logistic_regression = LogisticRegression(multi_class='multinomial', solver='newton-cg',random_state=0)
# Train model
L_model = logistic_regression.fit(X_train,y_train)
L_predict=L_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [40]:
# confusion matrix
matrix = confusion_matrix(y_test,L_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[1648  105  157]
 [ 263  678   97]
 [ 346  115  645]]


In [41]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,L_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,L_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.73      0.86      0.79      1910
           0       0.76      0.65      0.70      1038
          -1       0.72      0.58      0.64      1106

    accuracy                           0.73      4054
   macro avg       0.73      0.70      0.71      4054
weighted avg       0.73      0.73      0.73      4054



The accuracy of the model is =  0.7328564380858411


## Training a SVM classifier

In [42]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
clf.fit(X_train, y_train)
svm_predict=clf.predict(X_test)



### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [43]:
# confusion matrix
matrix = confusion_matrix(y_test,svm_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[1770   63   77]
 [ 614  374   50]
 [ 758   38  310]]


In [44]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,svm_predict,labels=[1,0])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,svm_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.56      0.93      0.70      1910
           0       0.79      0.36      0.49      1038

   micro avg       0.59      0.73      0.65      2948
   macro avg       0.68      0.64      0.60      2948
weighted avg       0.64      0.73      0.63      2948



The accuracy of the model is =  0.6053280710409472


### 1- clean_mask_senti

## Loading dataset

In [45]:
tweets_df2=pd.read_csv("clean_mask_senti.csv")
tweets_df2.head(3)

Unnamed: 0.1,Unnamed: 0,date,tweets,tidy_tweets,absolute_tidy_tweets,sentiment
0,0,2020-12-31 23:46:53+00:00,The Victorian Health Minister has clarified ru...,The Victorian Health Minister has clarified ru...,The Victorian Health Minister clarified rule a...,pos
1,1,2020-12-31 23:22:47+00:00,@Michael01996367 @elonmusk @Tesla @TeslaGong @...,@Michael01996367 @elonmusk @Tesla @TeslaGong @...,Michael elonmusk Tesla TeslaGong TeslaTom Tesl...,neutral
2,2,2020-12-31 22:46:38+00:00,Mask 😷 wearing deemed to be too much of a burd...,Mask 😷 wearing deemed to be too much of a burd...,Mask wearing deemed much burden ppl NSW accord...,neg


## <a id='5'>5. Feature Extraction</a>

We need to convert textual representation in the form on numeric features. We have a popular techniques to perform feature extraction:


1. __TF-IDF (Term Frequency - Inverse Document Frequency)__





### <a id='5A'>A. Feature Extraction for 'Key Words'</a>

In [46]:
# TF-IDF features
tfidf_word_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
# TF-IDF feature matrix
tfidf_word_feature = tfidf_word_vectorizer.fit_transform(tweets_df2['absolute_tidy_tweets'].astype('U'))

## <a id='6'>6. Model Building: Sentiment Analysis</a>

#### Map target variables to  {0, 1}

In [47]:
target_variable = tweets_df2['sentiment'].apply(lambda x: -1 if x=='neg' else (1 if x=="pos" else 0))

In [48]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_word_feature, target_variable, test_size=0.3, random_state=272)


## Training a Random Forest Classifier

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
R_model = randomforest.fit(X_train,y_train)
R_predict=R_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [51]:
# confusion matrix


matrix = confusion_matrix(y_test,R_predict, labels=[1, 0,-1])
print("Confusion matrix")
print(matrix)




Confusion matrix
[[1483  207   31]
 [ 221  831   16]
 [ 422  163  206]]


In [52]:


# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,R_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,R_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.70      0.86      0.77      1721
           0       0.69      0.78      0.73      1068
          -1       0.81      0.26      0.39       791

    accuracy                           0.70      3580
   macro avg       0.73      0.63      0.63      3580
weighted avg       0.72      0.70      0.68      3580



The accuracy of the model is =  0.7039106145251397


## Training a Logistic Binary Classifier

In [53]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression object
logistic_regression = LogisticRegression(multi_class='multinomial', solver='newton-cg',random_state=0)
# Train model
L_model = logistic_regression.fit(X_train,y_train)
L_predict=L_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [54]:
# confusion matrix
matrix = confusion_matrix(y_test,L_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[1536  113   72]
 [ 268  745   55]
 [ 328  130  333]]


In [55]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,L_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,L_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.72      0.89      0.80      1721
           0       0.75      0.70      0.72      1068
          -1       0.72      0.42      0.53       791

    accuracy                           0.73      3580
   macro avg       0.73      0.67      0.68      3580
weighted avg       0.73      0.73      0.72      3580



The accuracy of the model is =  0.7301675977653631


## Training a SVM classifier

In [56]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
clf.fit(X_train, y_train)
svm_predict=clf.predict(X_test)



### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [57]:
# confusion matrix
matrix = confusion_matrix(y_test,svm_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[1606  109    6]
 [ 654  405    9]
 [ 655   63   73]]


In [58]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,svm_predict,labels=[1,0])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,svm_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.55      0.93      0.69      1721
           0       0.70      0.38      0.49      1068

   micro avg       0.58      0.72      0.64      2789
   macro avg       0.63      0.66      0.59      2789
weighted avg       0.61      0.72      0.62      2789



The accuracy of the model is =  0.582122905027933


### 4- clean_quarantine

## Loading dataset

In [59]:
tweets_df2=pd.read_csv("clean_quarantine.csv")
tweets_df2.head(3)

Unnamed: 0.1,Unnamed: 0,date,tweets,tidy_tweets,absolute_tidy_tweets,sentiment
0,0,2020-10-30 21:22:16+00:00,quarantine?,quarantine?,quarantine,neutral
1,1,2020-10-30 20:10:28+00:00,cristiano ronaldo tested negative covid 19 qua...,cristiano ronaldo tested negative covid 19 qua...,cristiano ronaldo tested negative covid quaran...,neg
2,2,2020-10-30 19:09:21+00:00,evening cravings 💕 hightea quarantine style co...,evening cravings 💕 hightea quarantine style co...,evening craving hightea quarantine style cooky...,neutral


## <a id='5'>5. Feature Extraction</a>

We need to convert textual representation in the form on numeric features. We have a popular techniques to perform feature extraction:


1. __TF-IDF (Term Frequency - Inverse Document Frequency)__





### <a id='5A'>A. Feature Extraction for 'Key Words'</a>

In [60]:
# TF-IDF features
tfidf_word_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
# TF-IDF feature matrix
tfidf_word_feature = tfidf_word_vectorizer.fit_transform(tweets_df2['absolute_tidy_tweets'].astype('U'))

## <a id='6'>6. Model Building: Sentiment Analysis</a>

#### Map target variables to  {0, 1}

In [61]:
target_variable = tweets_df2['sentiment'].apply(lambda x: -1 if x=='neg' else (1 if x=="pos" else 0))

In [62]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_word_feature, target_variable, test_size=0.3, random_state=272)


## Training a Random Forest Classifier

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
R_model = randomforest.fit(X_train,y_train)
R_predict=R_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [65]:
# confusion matrix


matrix = confusion_matrix(y_test,R_predict, labels=[1, 0,-1])
print("Confusion matrix")
print(matrix)




Confusion matrix
[[504 130  62]
 [ 52 343  29]
 [146 110 214]]


In [66]:


# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,R_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,R_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.72      0.72      0.72       696
           0       0.59      0.81      0.68       424
          -1       0.70      0.46      0.55       470

    accuracy                           0.67      1590
   macro avg       0.67      0.66      0.65      1590
weighted avg       0.68      0.67      0.66      1590



The accuracy of the model is =  0.6672955974842767


## Training a Logistic multiclass Classifier

In [67]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression object
logistic_regression = LogisticRegression(multi_class='multinomial', solver='newton-cg',random_state=0)
# Train model
L_model = logistic_regression.fit(X_train,y_train)
L_predict=L_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [68]:
# confusion matrix
matrix = confusion_matrix(y_test,L_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[566  56  74]
 [140 239  45]
 [166  58 246]]


In [69]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,L_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,L_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.65      0.81      0.72       696
           0       0.68      0.56      0.62       424
          -1       0.67      0.52      0.59       470

    accuracy                           0.66      1590
   macro avg       0.67      0.63      0.64      1590
weighted avg       0.66      0.66      0.65      1590



The accuracy of the model is =  0.6610062893081761


## Training a SVM classifier

In [70]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
clf.fit(X_train, y_train)
svm_predict=clf.predict(X_test)



### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [71]:
# confusion matrix
matrix = confusion_matrix(y_test,svm_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[627  47  22]
 [263 142  19]
 [324  39 107]]


In [72]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,svm_predict,labels=[1,0])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,svm_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.52      0.90      0.66       696
           0       0.62      0.33      0.44       424

   micro avg       0.53      0.69      0.60      1120
   macro avg       0.57      0.62      0.55      1120
weighted avg       0.56      0.69      0.57      1120



The accuracy of the model is =  0.5509433962264151


### 5- mask_senti

## Loading dataset

In [73]:
tweets_df2=pd.read_csv("clean_soc_distanc_senti.csv")
tweets_df2.head(3)

Unnamed: 0.1,Unnamed: 0,date,tweets,tidy_tweets,absolute_tidy_tweets,sentiment
0,0,2020-10-30 22:06:12+00:00,Social Distancing w/ @GAROFALI by @PatSupsiri ...,Social Distancing w/ @GAOFALI by @PatSupsiri,Social Distancing w GAOFALI PatSupsiri,neutral
1,1,2020-10-30 21:09:12+00:00,Really impressed by @Jetstar_NZ strongly recom...,eally impressed by @Jetstar_NZ strongly recomm...,eally impressed JetstarNZ strongly recommendin...,pos
2,2,2020-10-30 19:36:57+00:00,@EmilyQMD Great work @EmilyQMD In Australia we...,@EmilyQMD Great work @EmilyQMD In Australia we...,EmilyQMD Great work EmilyQMD In Australia weve...,pos


## <a id='5'>5. Feature Extraction</a>

We need to convert textual representation in the form on numeric features. We have a popular techniques to perform feature extraction:


1. __TF-IDF (Term Frequency - Inverse Document Frequency)__





### <a id='5A'>A. Feature Extraction for 'Key Words'</a>

In [74]:
# TF-IDF features
tfidf_word_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
# TF-IDF feature matrix
tfidf_word_feature = tfidf_word_vectorizer.fit_transform(tweets_df2['absolute_tidy_tweets'].astype('U'))

## <a id='6'>6. Model Building: Sentiment Analysis</a>

#### Map target variables to  {0, 1}

In [75]:
target_variable = tweets_df2['sentiment'].apply(lambda x: -1 if x=='neg' else (1 if x=="pos" else 0))

In [76]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_word_feature, target_variable, test_size=0.3, random_state=272)


## Training a Random Forest Classifier

In [77]:
from sklearn.ensemble import RandomForestClassifier

In [78]:
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
R_model = randomforest.fit(X_train,y_train)
R_predict=R_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [79]:
# confusion matrix


matrix = confusion_matrix(y_test,R_predict, labels=[1, 0,-1])
print("Confusion matrix")
print(matrix)




Confusion matrix
[[808 135  32]
 [140 451  10]
 [240  86 132]]


In [80]:


# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,R_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,R_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.68      0.83      0.75       975
           0       0.67      0.75      0.71       601
          -1       0.76      0.29      0.42       458

    accuracy                           0.68      2034
   macro avg       0.70      0.62      0.62      2034
weighted avg       0.70      0.68      0.66      2034



The accuracy of the model is =  0.683874139626352


## Training a Logistic Binary Classifier

In [81]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression object
logistic_regression = LogisticRegression(multi_class='multinomial', solver='newton-cg',random_state=0)
# Train model
L_model = logistic_regression.fit(X_train,y_train)
L_predict=L_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [82]:
# confusion matrix
matrix = confusion_matrix(y_test,L_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[849  85  41]
 [159 411  31]
 [197  74 187]]


In [83]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,L_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,L_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.70      0.87      0.78       975
           0       0.72      0.68      0.70       601
          -1       0.72      0.41      0.52       458

    accuracy                           0.71      2034
   macro avg       0.72      0.65      0.67      2034
weighted avg       0.71      0.71      0.70      2034



The accuracy of the model is =  0.7114060963618486


## Training a SVM classifier

In [84]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
clf.fit(X_train, y_train)
svm_predict=clf.predict(X_test)



### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [85]:
# confusion matrix
matrix = confusion_matrix(y_test,svm_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[910  61   4]
 [330 269   2]
 [396  22  40]]


In [86]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,svm_predict,labels=[1,0])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,svm_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.56      0.93      0.70       975
           0       0.76      0.45      0.56       601

   micro avg       0.59      0.75      0.66      1576
   macro avg       0.66      0.69      0.63      1576
weighted avg       0.64      0.75      0.65      1576



The accuracy of the model is =  0.5993117010816126


### 6- clean_vaccine_senti

## Loading dataset

In [87]:
tweets_df2=pd.read_csv("clean_vaccine_senti.csv")
tweets_df2.head(3)

Unnamed: 0.1,Unnamed: 0,date,tweets,tidy_tweets,absolute_tidy_tweets,sentiment
0,0,2020-10-30 21:43:08+00:00,@tls_fletcher Hope ➕decent president ➕vaccine ✅,@tls_fletcher Hope ➕decent president ➕vaccine ✅,tlsfletcher Hope decent president vaccine,pos
1,1,2020-10-30 15:43:24+00:00,would you date someone who..?\n\n1 - done \n2 ...,would you date someone who..? 1 - done 2 - lyf...,would date someone done lyfe dont think nahh s...,neg
2,2,2020-10-30 14:00:59+00:00,@CNBC They should offer free covid vaccines to...,@CNBC They should offer free covid vaccines to...,CNBC They offer free covid vaccine Apple Fanboys,pos


## <a id='5'>5. Feature Extraction</a>

We need to convert textual representation in the form on numeric features. We have a popular techniques to perform feature extraction:


1. __TF-IDF (Term Frequency - Inverse Document Frequency)__





### <a id='5A'>A. Feature Extraction for 'Key Words'</a>

In [88]:
# TF-IDF features
tfidf_word_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
# TF-IDF feature matrix
tfidf_word_feature = tfidf_word_vectorizer.fit_transform(tweets_df2['absolute_tidy_tweets'].astype('U'))

## <a id='6'>6. Model Building: Sentiment Analysis</a>

#### Map target variables to  {0, 1}

In [89]:
target_variable = tweets_df2['sentiment'].apply(lambda x: -1 if x=='neg' else (1 if x=="pos" else 0))

In [90]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_word_feature, target_variable, test_size=0.3, random_state=272)


## Training a Random Forest Classifier

In [91]:
from sklearn.ensemble import RandomForestClassifier

In [92]:
randomforest = RandomForestClassifier(random_state=0, n_jobs=-1)
R_model = randomforest.fit(X_train,y_train)
R_predict=R_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [93]:
# confusion matrix


matrix = confusion_matrix(y_test,R_predict, labels=[1, 0,-1])
print("Confusion matrix")
print(matrix)




Confusion matrix
[[183  31  49]
 [ 36  98   9]
 [ 85  30  88]]


In [94]:


# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,R_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,R_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.60      0.70      0.65       263
           0       0.62      0.69      0.65       143
          -1       0.60      0.43      0.50       203

    accuracy                           0.61       609
   macro avg       0.61      0.60      0.60       609
weighted avg       0.61      0.61      0.60       609



The accuracy of the model is =  0.6059113300492611


## Training a Logistic Binary Classifier

In [95]:
from sklearn.linear_model import LogisticRegression

# Create logistic regression object
logistic_regression = LogisticRegression(multi_class='multinomial', solver='newton-cg',random_state=0)
# Train model
L_model = logistic_regression.fit(X_train,y_train)
L_predict=L_model.predict(X_test)

### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [96]:
# confusion matrix
matrix = confusion_matrix(y_test,L_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[219   3  41]
 [105  16  22]
 [111   2  90]]


In [97]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,L_predict,labels=[1,0,-1])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,L_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.50      0.83      0.63       263
           0       0.76      0.11      0.20       143
          -1       0.59      0.44      0.51       203

    accuracy                           0.53       609
   macro avg       0.62      0.46      0.44       609
weighted avg       0.59      0.53      0.49       609



The accuracy of the model is =  0.5336617405582923


## Training a SVM classifier

In [98]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
clf.fit(X_train, y_train)
svm_predict=clf.predict(X_test)



### Testing the model
1-Confusion matrix <br>
2-Accuray <br>
3-F1 score <br>
4-Other matrix<br>

In [99]:
# confusion matrix
matrix = confusion_matrix(y_test,svm_predict, labels=[1,0,-1])
#plot_confusion_matrix(matrix)
print("Confusion matrix")
print(matrix)

Confusion matrix
[[241  13   9]
 [ 95  46   2]
 [167  15  21]]


In [100]:

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,svm_predict,labels=[1,0])
print('\nClassification report : \n',matrix)

# Accuracy
print("\n\nThe accuracy of the model is = ", accuracy_score(y_test,svm_predict))


Classification report : 
               precision    recall  f1-score   support

           1       0.48      0.92      0.63       263
           0       0.62      0.32      0.42       143

   micro avg       0.50      0.71      0.58       406
   macro avg       0.55      0.62      0.53       406
weighted avg       0.53      0.71      0.56       406



The accuracy of the model is =  0.5057471264367817
