## 1.Installation and Imports

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import re
from collections import Counter
import pickle
import os
import numpy as np

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.pipeline import Pipeline

  import pandas.util.testing as tm


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
from google.colab import drive
drive.mount('/content/drive')

## 2.Data Preperation

>### i. Reading the data

In [0]:
df_path="/content/drive/My Drive/ML_data/TwitterSentiment/tweets.csv"
model_path="/content/drive/My Drive/ML_data/TwitterSentiment/sentiment.sav"

In [0]:
cols = ["target", "ids", "date", "flag", "user", "text"]
enc = "ISO-8859-1"
df = pd.read_csv(df_path,encoding=enc,names=cols)
df.head()

>### ii. Normalizing the tweets

In [0]:
#tweet_1=df.loc[1]["tweet"]
def normalize_tweet(tweet):
  
  #convert the tweet to lower case
  tweet.lower()

  #convert all urls to sting "URL"
  tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)

  #correct all multiple white spaces and punctuations to a single white space/punctuation
  tweet = re.sub('\.{2,}', ' ', tweet)
  tweet = re.sub('[\s]+', ' ', tweet)
  tweet = re.sub('\!{2,}', '!', tweet)

  #convert "#topic" to just "topic"
  tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

  #Extracting words(tokens) from the tweet
  twt_tknz=TweetTokenizer(strip_handles=True)
  tkns=twt_tknz.tokenize(tweet)

  #Removing stop words
  stop_words = set(stopwords.words('english'))
  word_list=[tkn for tkn in tkns if tkn not in stop_words]

  #Using Rule Based Stemmer to find word stems
  stemmer=PorterStemmer()
  stems=[stemmer.stem(word) for word in word_list]

  #Creating a sentence from the stems
  norm_tweet=" ".join(stems)

  return norm_tweet

df["norm_tweet"]=df["text"].apply(normalize_tweet)
df.head()

Unnamed: 0,target,ids,date,flag,user,text,norm_tweet
828240,4,1556986747,Sun Apr 19 01:23:05 PDT 2009,NO_QUERY,harajukuroxy,@AndyTaylorSonic sounds lovely,sound love
459644,0,2072226833,Sun Jun 07 20:37:38 PDT 2009,NO_QUERY,naomilayne13,why me?,?
149251,0,1883378662,Fri May 22 08:27:28 PDT 2009,NO_QUERY,DaMaHug,@GinaATL Generally phones and water don't get ...,gener phone water get well togeth . My phone l...
183195,0,1967361744,Fri May 29 19:26:27 PDT 2009,NO_QUERY,jaleesie,@basantam Oh goshhhh i forgot to watch it! stu...,Oh goshhhh forgot watch ! stuep
1464784,4,2064280408,Sun Jun 07 05:48:39 PDT 2009,NO_QUERY,LaurieleeWaul,@x3mrspattinson Im Reading Twilight now you s...,Im read twilight proud !


>### iii. Splitting into train, validation and test dataset

In [0]:
X=df["norm_tweet"]
y=df["target"] #Negative(0) Positive(4)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=26)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.1,random_state=91)

## 3. The Model

>### i. Creation

In [0]:
if(os.path.exists(model_path)):
  #load 
  pipeline = pickle.load(open(model_path, 'rb'))
else:
  #create-train-save
  pipeline=Pipeline(steps=[
                         ("tfIdf",TfidfVectorizer()),
                         ("NB",MultinomialNB())                         
  ])
  pipeline.fit(X_train,y_train)
  pickle.dump(pipeline, open(model_path, 'wb'))

>### ii. Evaluation

In [0]:
y_pred=pipeline.predict(X_test)
cm=confusion_matrix(y_test,y_pred)
cm

array([[377, 114],
       [114, 395]])

In [0]:
accuracy_score(y_test,y_pred)

0.772

>### iii. Prediction

In [0]:
tweets=["I am a good boy","I hate him","llllllluuuuuuuuudfreee"]
norm_tweets=[normalize_tweet(aa) for aa in tweets]
pipeline.predict_proba(norm_tweets)[:,1]

array([0.66485895, 0.11983759, 0.50013966])