In [1]:
import pandas as pd
import re
import pickle
import numpy as np

# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# nltk
from nltk.stem import WordNetLemmatizer

# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from nltk.corpus import stopwords

In [44]:
columns=["target", "id", "date", "flag", "user", "text"]
dataset=pd.read_csv('twitter_data.csv',encoding="ISO-8859-1",names=columns)

In [5]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


This is the sentiment140 dataset.<br>
It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 2 = neutral, 4 = positive) and they can be used to detect sentiment .
It contains the following 6 fields:<br>
1.target<br>
2.id<br>
3.date<br>
4.flag<br>
5.user<br>
6.text<br>
we need only target and text column so we removing all other columns

In [6]:
dataset.drop(["id","date","flag","user"],axis=1).head(2)

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...


In [7]:
dataset["target"].value_counts()

0    800000
4    800000
Name: target, dtype: int64

target 0 :- negative <br>
target 4 :- positive

## preprocessing of data

In [8]:
text=list(dataset["text"])

In [10]:
#dictionary of all emojis with their meaning
emoji={':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [11]:
stopword=stopwords.words('english')

In [12]:

def preprocess(text):
    clean_text=[]
    word_lemmatizer=WordNetLemmatizer()
    for tweet in text:
        tweet=tweet.lower()
        tweet=re.sub(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)",' URL',tweet)
        for e in emoji.keys():
            tweet.replace(e,"emoji"+emoji[e])
        tweet=re.sub('@[^\s]+',' user',tweet)
        tweet=re.sub("[^a-zA-Z0-9]"," ",tweet)
        tweet = re.sub(r"(.)\1\1+",r"\1\1",tweet)
        tweetwords = ''
        for word in tweet.split():
            if len(word)>1:
                word = word_lemmatizer.lemmatize(word)
                tweetwords += (word+' ')

        clean_text.append(tweetwords)
    return clean_text

In [13]:
import time
t = time.time()
processedtext=preprocess(text)
print(f'Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

Text Preprocessing complete.
Time Taken: 159 seconds


## splitting dataset

In [14]:
x=processedtext
y=list(dataset["target"])

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

## Bag of Words

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
bow=CountVectorizer()
bow_x_train=bow.fit_transform(x_train)
bow_x_test=bow.transform(x_test)

In [18]:
def Evaluate(model):
    y_pred = model.predict(bow_x_test)
    print(classification_report(y_test, y_pred))
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix")
    print(matrix)

## applying tfidf Vectorizer

In [19]:
tfidf=TfidfVectorizer()
tfidf_x_train=tfidf.fit_transform(x_train)

In [20]:
tfidf_x_test=tfidf.transform(x_test)

In [21]:
def Evaluate(model):
    y_pred = model.predict(tfidf_x_test)
    print(classification_report(y_test, y_pred))
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix")
    print(matrix)

## Bernoulli Naive Bayes using tfidf

In [22]:
BNBmodel = BernoulliNB(alpha = 2)
BNBmodel.fit(tfidf_x_train,y_train)

BernoulliNB(alpha=2)

In [23]:
Evaluate(BNBmodel)

              precision    recall  f1-score   support

           0       0.78      0.78      0.78    200446
           4       0.78      0.78      0.78    199554

    accuracy                           0.78    400000
   macro avg       0.78      0.78      0.78    400000
weighted avg       0.78      0.78      0.78    400000

Confusion matrix
[[155617  44829]
 [ 43440 156114]]


## Bernoulli NB with BOW

In [24]:
BNBmodel.fit(bow_x_train,y_train)

BernoulliNB(alpha=2)

In [25]:
Evaluate(BNBmodel)

              precision    recall  f1-score   support

           0       0.78      0.78      0.78    200446
           4       0.78      0.78      0.78    199554

    accuracy                           0.78    400000
   macro avg       0.78      0.78      0.78    400000
weighted avg       0.78      0.78      0.78    400000

Confusion matrix
[[155617  44829]
 [ 43440 156114]]


## Linear SVC model with Tfidf

In [26]:
SVC_model=LinearSVC()
SVC_model.fit(tfidf_x_train,y_train)

LinearSVC()

In [27]:
Evaluate(SVC_model)

              precision    recall  f1-score   support

           0       0.80      0.79      0.79    200446
           4       0.79      0.80      0.80    199554

    accuracy                           0.79    400000
   macro avg       0.79      0.79      0.79    400000
weighted avg       0.79      0.79      0.79    400000

Confusion matrix
[[157405  43041]
 [ 39433 160121]]


## Linear SVC with BOW

In [28]:
SVC_model.fit(bow_x_train,y_train)

LinearSVC()

In [29]:
Evaluate(SVC_model)

              precision    recall  f1-score   support

           0       0.80      0.71      0.75    200446
           4       0.74      0.82      0.78    199554

    accuracy                           0.76    400000
   macro avg       0.77      0.76      0.76    400000
weighted avg       0.77      0.76      0.76    400000

Confusion matrix
[[141509  58937]
 [ 35232 164322]]


## Logistic Regression with tfidf 

In [30]:
LR_model = LogisticRegression(C=2,max_iter=1000,n_jobs=-1)
LR_model.fit(tfidf_x_train, y_train)

LogisticRegression(C=2, max_iter=1000, n_jobs=-1)

In [31]:
Evaluate(LR_model)

              precision    recall  f1-score   support

           0       0.81      0.79      0.80    200446
           4       0.79      0.81      0.80    199554

    accuracy                           0.80    400000
   macro avg       0.80      0.80      0.80    400000
weighted avg       0.80      0.80      0.80    400000

Confusion matrix
[[158475  41971]
 [ 38243 161311]]


## Logistic Regression with BOW

In [32]:
LR_model.fit(bow_x_train,y_train)

LogisticRegression(C=2, max_iter=1000, n_jobs=-1)

In [33]:
Evaluate(LR_model)

              precision    recall  f1-score   support

           0       0.81      0.72      0.76    200446
           4       0.74      0.83      0.79    199554

    accuracy                           0.77    400000
   macro avg       0.78      0.77      0.77    400000
weighted avg       0.78      0.77      0.77    400000

Confusion matrix
[[143522  56924]
 [ 33372 166182]]


## Saving models

In [36]:
file = open('vectoriser.pickle','wb')
pickle.dump(tfidf, file)
file.close()

file = open('Sentiment-Linear SVC.pickle','wb')
pickle.dump(LR_model, file)
file.close()

file = open('Sentiment-LR.pickle','wb')
pickle.dump(SVC_model, file)
file.close()

## using trained model 

In [37]:
def load_model():
    file = open('vectoriser.pickle', 'rb')
    vectoriser = pickle.load(file)
    file.close()
    file = open('Sentiment-LR.pickle', 'rb')
    LRmodel = pickle.load(file)
    file.close()
    return vectoriser, LRmodel


In [38]:
def predict(tfidf, model, text):
    textdata=tfidf.transform(preprocess(text))
    sentiment=model.predict(textdata)
    data = []
    for text, pred in zip(text, sentiment):
        data.append((text,pred))
    df = pd.DataFrame(data, columns = ['text','result'])
    df = df.replace([0,4], ["Negative","Positive"])
    return df

In [39]:
if __name__=="__main__":
    text = ["I hate twitter",
            "May the Force be with you.",
            "I feel so good"]
    
    df = predict(tfidf, LR_model, text)
    print(df.head())

                         text    result
0              I hate twitter  Negative
1  May the Force be with you.  Positive
2              I feel so good  Positive


In [42]:
from prettytable import PrettyTable
myTable = PrettyTable(["model used", "accuracy(Tfidf)", "accuracy(bow)"])

myTable.add_row(["Bernoulli NB", "0.78", "0.78"])
myTable.add_row(["Linear SVC", "0.79", "0.76"])
myTable.add_row(["Logistic Regression", "0.80", "0.77"])

print(myTable)
  

+---------------------+-----------------+---------------+
|      model used     | accuracy(Tfidf) | accuracy(bow) |
+---------------------+-----------------+---------------+
|     Bernoulli NB    |       0.78      |      0.78     |
|      Linear SVC     |       0.79      |      0.76     |
| Logistic Regression |       0.80      |      0.77     |
+---------------------+-----------------+---------------+
