## 1.Installation and Imports

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import re
from collections import Counter
import pickle
import os
import numpy as np

import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import TweetTokenizer,word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
from google.colab import drive
drive.mount('/content/drive')

## 2.Data Preperation

>### i. Reading the data

In [0]:
df_path="/content/drive/My Drive/ML_data/TwitterQuestion/interrogative.csv"
model_path="/content/drive/My Drive/ML_data/TwitterQuestion/interrogative.sav"

In [0]:
df = pd.read_csv(df_path)
df.head()

Unnamed: 0,user,text,target
0,10-19-20sUser7,now im left with this gay name,0
1,10-19-20sUser115,ah well,0
2,10-19-20sUser21,26/ m/ ky women that are nice please pm me,0
3,10-19-20sUser115,there ya go 10-19-20sUser7,0
4,10-19-20sUser59,whats everyone up to?,1


>### ii. Normalizing the tweets

In [0]:
#tweet_1=df.loc[1]["tweet"]
def normalize_tweet(tweet):
  
  #convert the tweet to lower case
  tweet.lower()

  #convert all urls to sting "URL"
  tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)

  #correct all multiple white spaces and punctuations to a single white space/punctuation
  tweet = re.sub('\.{2,}', ' ', tweet)
  tweet = re.sub('[\s]+', ' ', tweet)
  tweet = re.sub('\!{2,}', '!', tweet)

  #convert "#topic" to just "topic"
  tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

  #Extracting words(tokens) from the tweet
  twt_tknz=TweetTokenizer(strip_handles=True)
  tkns=twt_tknz.tokenize(tweet)
  #tkns=word_tokenize(tweet)

  #Removing stop words
  stop_words = set(stopwords.words('english'))
  word_list=[tkn for tkn in tkns if tkn not in stop_words]

  #Using Rule Based Stemmer to find word stems
  stemmer=PorterStemmer()
  stems=[stemmer.stem(word) for word in word_list]

  #Creating a sentence from the stems
  norm_tweet=" ".join(stems)

  return norm_tweet

df["norm_tweet"]=df["text"].apply(normalize_tweet)
df.head()

Unnamed: 0,user,text,target,norm_tweet
0,10-19-20sUser7,now im left with this gay name,0,im left gay name
1,10-19-20sUser115,ah well,0,ah well
2,10-19-20sUser21,26/ m/ ky women that are nice please pm me,0,26 / / ky women nice pleas pm
3,10-19-20sUser115,there ya go 10-19-20sUser7,0,ya go 10-19- 20suser7
4,10-19-20sUser59,whats everyone up to?,1,what everyon ?


>### iii. Splitting into train, validation and test dataset

In [0]:
X=df["norm_tweet"]
y=df["target"] #Statement(0) Question(1)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=26)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.1,random_state=91)

## 3. The Model

>### i. Creation

In [0]:
if(os.path.exists(model_path)):
  #load 
  pipeline = pickle.load(open(model_path, 'rb'))
else:
  #create-train-save
  pipeline=Pipeline(steps=[
                         ("tfIdf",TfidfVectorizer()),
                         ("NB",MultinomialNB())                         
  ])
  pipeline.fit(X_train,y_train)
  pickle.dump(pipeline, open(model_path, 'wb'))



>### ii. Evaluation

In [0]:
y_pred=pipeline.predict(X_test)
cm=confusion_matrix(y_test,y_pred)
cm

array([[306,   3],
       [ 99,  25]])

In [0]:
y_pred_prob=pipeline.predict_proba(X_test)
y_pred=list()
for prb in y_pred_prob:
    if(prb[1]>0.25):
        y_pred.append(1)
    else:
        y_pred.append(0)

In [0]:
accuracy_score(y_test.values,y_pred)

0.6882217090069284

In [0]:
cm=confusion_matrix(y_test,y_pred)
cm

array([[219,  90],
       [ 45,  79]])

>### iii. Prediction

In [0]:
tweets=["I am a good boy","is this a job ?","Let us play a game"]
norm_tweets=[normalize_tweet(aa) for aa in tweets]
pipeline.predict_proba(norm_tweets)[:,1]

array([0.06738288, 0.30278762, 0.05669662])