## Importing the libraries

In [20]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore", category= FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Importing the Dataset

In [23]:
data=pd.read_csv(r"/content/drive/MyDrive/saved model/sentiment.tsv", sep= '\t')
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [24]:
data=data.drop(columns=['id'])
data=data.rename(columns={'sentiment':'label', 'review':'body_text'})
data=data[0:3500]
data.head()

Unnamed: 0,label,body_text
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


## Cleaning the Data

In [25]:
def remove_pattern(input_txt,pattern):
    r= re.findall(pattern, input_txt)
    for i in r:
        input_txt= re.sub(i, '', input_txt)
    return input_txt

## Remove twitter handles (@user)

In [26]:
data['tidy_tweet']=np.vectorize(remove_pattern)(data['body_text'],"@[\w]*")
data.head()

Unnamed: 0,label,body_text,tidy_tweet
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...","\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious 8...


## Remove special Characters, numbers, punctuations

In [27]:
data['tidy_tweet']=data['tidy_tweet'].str.replace("[^a-zA-Z#]"," ")

In [28]:
data.head()

Unnamed: 0,label,body_text,tidy_tweet
0,1,With all this stuff going down at the moment w...,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War of the Worlds by Timothy Hi...
2,0,The film starts with a manager (Nicholas Bell)...,The film starts with a manager Nicholas Bell ...
3,0,It must be assumed that those who praised this...,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy and wondrously unpretentious ...


## Tokenize tweet

In [29]:
tokenize_tweet= data['tidy_tweet'].apply(lambda x:x.split())
tokenize_tweet.head()

0    [With, all, this, stuff, going, down, at, the,...
1    [The, Classic, War, of, the, Worlds, by, Timot...
2    [The, film, starts, with, a, manager, Nicholas...
3    [It, must, be, assumed, that, those, who, prai...
4    [Superbly, trashy, and, wondrously, unpretenti...
Name: tidy_tweet, dtype: object

In [30]:
from nltk.stem.porter import*
stemmer= PorterStemmer()
tokenized_tweet= tokenize_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
tokenized_tweet.head()

0    [with, all, thi, stuff, go, down, at, the, mom...
1    [the, classic, war, of, the, world, by, timoth...
2    [the, film, start, with, a, manag, nichola, be...
3    [it, must, be, assum, that, those, who, prais,...
4    [superbl, trashi, and, wondrous, unpretenti, s...
Name: tidy_tweet, dtype: object

## Joining the tokenized word in the same data

In [31]:
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i]= ' '.join(tokenized_tweet[i])
data['tidy_tweet']=tokenized_tweet
data.head()

Unnamed: 0,label,body_text,tidy_tweet
0,1,With all this stuff going down at the moment w...,with all thi stuff go down at the moment with ...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the world by timothi hine i...
2,0,The film starts with a manager (Nicholas Bell)...,the film start with a manag nichola bell give ...
3,0,It must be assumed that those who praised this...,it must be assum that those who prais thi film...
4,1,Superbly trashy and wondrously unpretentious 8...,superbl trashi and wondrous unpretenti s explo...


## Adding other column for lenght of the tweet and punctuation

In [32]:
def count_punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-(text).count(" ")),3)*100

In [33]:
data['body_len']= data['body_text'].apply(lambda x:len(x)- x.count(" "))
data['punct%']= data['body_text'].apply(lambda x:count_punct(x))
data.head()

Unnamed: 0,label,body_text,tidy_tweet,body_len,punct%
0,1,With all this stuff going down at the moment w...,with all thi stuff go down at the moment with ...,1870,3.6
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the world by timothi hine i...,789,5.3
2,0,The film starts with a manager (Nicholas Bell)...,the film start with a manag nichola bell give ...,2072,3.4
3,0,It must be assumed that those who praised this...,it must be assum that those who prais thi film...,1867,5.0
4,1,Superbly trashy and wondrously unpretentious 8...,superbl trashi and wondrous unpretenti s explo...,1865,3.5


## Feature  selection and engineering

### 1) Count Vectorizer
### 2) Tdidf Vectoriser

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer= CountVectorizer()
bow= bow_vectorizer.fit_transform(data['tidy_tweet'])
X_count_feat= pd.concat([data['body_len'], data['punct%'], pd.DataFrame(bow.toarray())], axis=1)
X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,22581,22582,22583,22584,22585,22586,22587,22588,22589,22590
0,1870,3.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,789,5.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2072,3.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1867,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1865,3.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
X=X_count_feat
y=data['label']

In [38]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(C=1)
clf=clf.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [41]:
message= "Hello this is a very good mom"
data= [message]
vect= (bow_vectorizer.transform(data))
vect= pd.DataFrame(vect.toarray())
body_len=pd.DataFrame([len(data) - data.count(" ")])
punct= pd.DataFrame([count_punct(data)])
total_data= pd.concat([body_len, punct,vect], axis=1)
total_data

Unnamed: 0,0,0.1,0.2,1,2,3,4,5,6,7,...,22581,22582,22583,22584,22585,22586,22587,22588,22589,22590
0,1,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
my_prediction= clf.predict(total_data)
my_prediction

array([1])

In [43]:
import pickle
filename = '/content/drive/MyDrive/saved model/finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [44]:
import pickle
filename = '/content/drive/MyDrive/saved model/vectorizer.sav'
pickle.dump(bow_vectorizer, open(filename, 'wb'))