## --------------------------Testing on Ground Truth Dataset------------------------ 

In [1]:
import nltk
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sb
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")
import unidecode
#from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.stem import PorterStemmer
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import matplotlib.animation as animation
import operator
#import plotly.express as px
from collections import Counter
%matplotlib inline
import pandas_profiling

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data=pd.read_csv("results.csv")

In [3]:
data.head()

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


In [4]:
data.dropna
data.describe()

Unnamed: 0,text,sentiment
count,3534,3534
unique,3530,3
top,#NAME?,neutral
freq,5,1430


In [5]:
## Convert to lower case
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
data['text'].head()

0     last session of the day http://twitpic.com/67ezh
1    shanghai is also really exciting (precisely --...
2    recession hit veronique branquinho, she has to...
3                                          happy bday!
4               http://twitpic.com/4w75p - i like it!!
Name: text, dtype: object

In [6]:
## Stopwards Removal
data['text'] = data['text'].apply(lambda x : ' '.join([word for word in str(x).split() if not word in set(stopwords.words('english'))]))

In [7]:
## Lemmatization
lemmatizer = WordNetLemmatizer()
data['text'] = data['text'].apply(lambda x : ' '.join([lemmatizer.lemmatize(word) for word in str(x).split()]))

In [8]:
from textblob import TextBlob
data['text'][:5].apply(lambda x: str(TextBlob(x).correct()))

0            last session day http://twitpic.com/67ezh
1    shanghai also really exciting (precisely -- sk...
2    recession hit veronique branquinho, quit compa...
3                                           happy day!
4                 http://twitpic.com/4w75p - like it!!
Name: text, dtype: object

In [9]:
def clean_tweets(tweet):
    tweet = re.sub('http://[A-Za-z0-9./]+','',tweet) 
    # remove special characters 
    tweet = re.sub('[^ a-zA-Z0-9]', '', tweet)
    # remove Numbers
    tweet = re.sub('[0-9]', '', tweet)
    return tweet
data['text'] = data['text'].apply(clean_tweets)

In [10]:
def tokenize(text):
    return word_tokenize(text)
data['tokenized'] = data['text'].apply(tokenize)
data.head()

Unnamed: 0,text,sentiment,tokenized
0,last session day,neutral,"[last, session, day]"
1,shanghai also really exciting precisely skysc...,positive,"[shanghai, also, really, exciting, precisely, ..."
2,recession hit veronique branquinho quit compan...,negative,"[recession, hit, veronique, branquinho, quit, ..."
3,happy bday,positive,"[happy, bday]"
4,like it,positive,"[like, it]"


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# create the transform
cv = TfidfVectorizer(max_features=5000) 
x = cv.fit_transform(data['text'].tolist()).toarray()

In [12]:
data.sentiment[data.sentiment == 'positive'] = 1
data.sentiment[data.sentiment == 'neutral'] = 0
data.sentiment[data.sentiment == 'negative'] = 2
y=data['sentiment'].astype('float')
print(y)

0       0.0
1       1.0
2       2.0
3       1.0
4       1.0
       ... 
3529    2.0
3530    1.0
3531    2.0
3532    1.0
3533    1.0
Name: sentiment, Length: 3534, dtype: float64


In [13]:
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import expon,uniform,randint

#Sklearn imports
from sklearn import linear_model
from sklearn.model_selection import train_test_split,RandomizedSearchCV,cross_val_score,cross_val_predict,validation_curve
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [14]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=0) #Basic train_test_split works here
y_test.shape

(884,)

In [15]:
from sklearn.ensemble import RandomForestClassifier
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, n_jobs=-1)
m.fit(x_train, y_train)
predictions = m.predict(x_test)
print(classification_report(y_test, predictions))
print("Accuracy :",accuracy_score(y_test, predictions),'\n')

              precision    recall  f1-score   support

         0.0       0.55      0.75      0.64       355
         1.0       0.73      0.64      0.68       273
         2.0       0.68      0.43      0.53       256

    accuracy                           0.62       884
   macro avg       0.66      0.61      0.62       884
weighted avg       0.65      0.62      0.62       884

Accuracy : 0.6244343891402715 



In [16]:
# With single train test split
clf = LogisticRegression(solver='liblinear').fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6357466063348416
[[286  35  34]
 [103 165   5]
 [128  17 111]]
              precision    recall  f1-score   support

         0.0       0.55      0.81      0.66       355
         1.0       0.76      0.60      0.67       273
         2.0       0.74      0.43      0.55       256

    accuracy                           0.64       884
   macro avg       0.68      0.61      0.63       884
weighted avg       0.67      0.64      0.63       884



In [17]:
## Multinomial NB with single train test split
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB().fit(x_train, y_train)

y_pred=nb_model.predict(x_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.5984162895927602
[[289  37  29]
 [112 159   2]
 [165  10  81]]
              precision    recall  f1-score   support

         0.0       0.51      0.81      0.63       355
         1.0       0.77      0.58      0.66       273
         2.0       0.72      0.32      0.44       256

    accuracy                           0.60       884
   macro avg       0.67      0.57      0.58       884
weighted avg       0.65      0.60      0.58       884



In [18]:
from xgboost import XGBClassifier
xgb = XGBClassifier().fit(x_train, y_train)

y_pred=xgb.predict(x_test)

print("Accuracy\n",accuracy_score(y_test, y_pred))
print("\n\nConfusion Matrix\n",confusion_matrix(y_test, y_pred))
print("\n\nClassification Report\n",classification_report(y_test, y_pred))

Accuracy
 0.6278280542986425


COnfusion Matrix
 [[261  43  51]
 [ 87 176  10]
 [120  18 118]]


Classification Report
               precision    recall  f1-score   support

         0.0       0.56      0.74      0.63       355
         1.0       0.74      0.64      0.69       273
         2.0       0.66      0.46      0.54       256

    accuracy                           0.63       884
   macro avg       0.65      0.61      0.62       884
weighted avg       0.64      0.63      0.62       884

