# Model Prepration

### Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd

### Importing Cleaned data

In [2]:
df = pd.read_csv('final_cleaned_data.csv')
df.sample(5)

Unnamed: 0,new_clean_tweet,category
26505,sad modijiyou complet oblivi pain suffer commo...,-1.0
21586,counter shahid social front environ protect mi...,1.0
84307,yup loo elect buzz zing gone mani vika thingi ...,1.0
38681,noth desper congress comeback power save caree...,1.0
68434,actual funni thing first time voter didnt know...,1.0


In [3]:
df.shape

(162897, 2)

In [4]:
df.isna().sum()

new_clean_tweet    2
category           0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.isna().sum()

new_clean_tweet    0
category           0
dtype: int64

In [7]:
df.shape

(162895, 2)

### Separating out train-test sets

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [92]:
# X ad Y splitting
X = df['new_clean_tweet']
Y = df['category'].values

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=43, stratify=Y)

In [99]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((122171,), (40724,), (122171,), (40724,))

## Multinomial NB

### 1. Training model using TfidfVectorizer

In [100]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])
nb_model = pipeline.fit(X_train, y_train)

In [101]:
y_pred = nb_model.predict(X_test)

In [102]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print('*'*50)
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.90      0.12      0.21      8877
         0.0       0.83      0.33      0.47     13786
         1.0       0.51      0.97      0.67     18061

    accuracy                           0.57     40724
   macro avg       0.75      0.47      0.45     40724
weighted avg       0.71      0.57      0.50     40724

**************************************************
[[ 1042   464  7371]
 [   82  4494  9210]
 [   36   445 17580]]


### 2. Training model using CountVectorizer

In [112]:
npipeline = Pipeline([('vect', CountVectorizer()),
                  ('model',MultinomialNB()), ])

In [113]:
count_nb_model = npipeline.fit(X_train, y_train)

In [114]:
# Make predictions
y_pred = count_nb_model.predict(X_test)

In [115]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print('*'*50)
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.70      0.56      0.62      8877
         0.0       0.81      0.56      0.66     13786
         1.0       0.66      0.87      0.75     18061

    accuracy                           0.70     40724
   macro avg       0.72      0.66      0.68     40724
weighted avg       0.71      0.70      0.69     40724

**************************************************
[[ 4951   785  3141]
 [  911  7748  5127]
 [ 1260  1090 15711]]


## SVM

In [123]:
from sklearn import svm
from sklearn.svm import LinearSVC

In [125]:
pipeline = Pipeline([('vect', CountVectorizer()),
                  ('model',LinearSVC()), ])

In [126]:
svm_model = pipeline.fit(X_train, y_train)



In [127]:
# Make predictions
y_pred = svm_model.predict(X_test)

In [128]:
print(classification_report(y_test, y_pred))
print('*'*50)
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.79      0.74      0.76      8877
         0.0       0.83      0.89      0.86     13786
         1.0       0.87      0.85      0.86     18061

    accuracy                           0.84     40724
   macro avg       0.83      0.83      0.83     40724
weighted avg       0.84      0.84      0.84     40724

**************************************************
[[ 6595   993  1289]
 [  588 12252   946]
 [ 1184  1579 15298]]


## Logistic Regression

In [129]:
from sklearn.linear_model import LogisticRegression

In [130]:
pipeline = Pipeline([('vect', CountVectorizer()),
                  ('model',LogisticRegression(random_state=0))])

In [131]:
lr_model = pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [132]:
# Make predictions
y_pred = lr_model.predict(X_test)

In [133]:
print(classification_report(y_test, y_pred))
print('*'*50)
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.81      0.75      0.78      8877
         0.0       0.83      0.91      0.87     13786
         1.0       0.89      0.85      0.87     18061

    accuracy                           0.85     40724
   macro avg       0.84      0.84      0.84     40724
weighted avg       0.85      0.85      0.85     40724

**************************************************
[[ 6629  1012  1236]
 [  439 12591   756]
 [ 1159  1509 15393]]


## Random forest

In [134]:
from sklearn.ensemble import RandomForestClassifier

In [145]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('clf', RandomForestClassifier(n_estimators=250, max_depth=5, random_state=0))])

In [146]:
rf_model = pipeline.fit(X_train, y_train)

In [147]:
# Make predictions
y_pred = rf_model.predict(X_test)

In [148]:
print(classification_report(y_test, y_pred))
print('*'*50)
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00      8877
         0.0       0.00      0.00      0.00     13786
         1.0       0.44      1.00      0.61     18061

    accuracy                           0.44     40724
   macro avg       0.15      0.33      0.20     40724
weighted avg       0.20      0.44      0.27     40724

**************************************************
[[    0     0  8877]
 [    0     0 13786]
 [    0     0 18061]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Conclusion

- After Spot Checking each Classification algorithm we can confirm that SVC classifier is performing best on this dataset becasue of goof F1 score for each category.
- Now we will Tune the hyperparameters of SVC classifier to get the best model accuracy for each class.

## Buliding final model to get predection using SVC 

In [149]:
import nltk
import re
import pickle
from emot.emo_unicode import UNICODE_EMOJI
import os
import csv
from nltk.stem.snowball import SnowballStemmer
from nltk import sent_tokenize, word_tokenize
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd
import string

from wordcloud import WordCloud
import matplotlib.pyplot as plt
st_wrds = stopwords.words('english')
stemmer = SnowballStemmer('english')
lemma = WordNetLemmatizer()

In [150]:
def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(": ","").split()))
    return text

#create a function to clean the tweets
def clean_text(text):
    text = convert_emojis(text) #calling the convert_emojis function to convert all the emojis to text at first place.
    text = str(text)
    text = text.lower() #converting every words to lowercase
    text = re.sub(r'@[A-Za-z0-9_-]+', '', text) #substituing the @user_handle with empty string
    text = re.sub(r'#', '', text) #Remove the '#' symbol
    text = re.sub(r'RT[\s]+', '', text) #Removing retweets RT
    text = re.sub(r'https?:\/\/\S+', '', text) #removing the hyperlink
    text = re.sub(r'[0-9]+', '', text) #Removing numbers from the text
    
    words = word_tokenize(text) #Splitting each words in a sentence
    
    #use regular expressions to select for the punctuation characters and use the sub() function to replace them with nothing.
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove punctuation from each word
    text = [re_punc.sub('', w) for w in words]
    
    #removing stop words
    text = [word for word in text if not word in st_wrds]
    
    #removing text with length 1
    text = [word for word in text if len(word) > 1 ]
      
    text = ' '.join([elem for elem in text]) #Converting the whole text into a string
    text = re.sub(' +', ' ', text) #Removing extra spaces from the text
    
    return text


def stem_lem_words(text):
    sent = []
    for word in text.split():
        #Applying Stemming then Applying lemmatization
        i = lemma.lemmatize(stemmer.stem(word))
        sent.append(i)
        
    text = ' '.join([elem for elem in sent]) #Converting the whole text into a string
    text = re.sub(' +', ' ', text) 
    return text
    
    print(result)
    
    
def preprocess(text):
    text = clean_text(text) # First preprocessing the data
    text = stem_lem_words(text) #Finally stemming and lemmatizing the words
     
    return text

### Function to predict the label of input text

In [172]:
def tweet_model(text):
    text = preprocess(text)
    text = [text]
    output = svm_model.predict(text)
    if output == 0:
        print('Neutral')
    elif output == 1:
        print('Positive')
    elif output == -1:
        print('Negative')

#### Example 1

In [173]:
tweet_model('I will vote for modi 😊')

Neutral


In [174]:
tweet_model('Very good work done by BJP here in Lucknow')

Positive


In [178]:
tweet_model('very bad behaviour by other party 😡 @narendramodi #stopvoilence')

Negative
