# Key Notes

**A - What are the things we do in text preprocessing?**


1.   Convert text to lowercase
2.   Remove numbers, special characters and punctuation (full stops, commas, etc.)
3.   Tokenisation
4.   Stemming and lemmatisation.




**B - Then we convert the text into vectors (numbers); these numeric representations are called embeddings.**


**C - Split these vectors into train and test sets, train the models on the training set and feed the test data into the trained models.**

**D - Calculate the cost to evaluate performance of the model.**

**Data Set Link**

[IMDB DataSet](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/download?datasetVersionNumber=1)

# Code

**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import nltk
import math
import re
from bs4 import BeautifulSoup
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import warnings
warnings.filterwarnings('ignore')

Importing Dataset

In [None]:
# Load the IMDB reviews CSV into a DataFrame.
# NOTE: the path is hard-coded to a Colab/Google Drive location; adjust it when running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Dataset.csv')

In [None]:
# Encode the sentiment labels as the strings "1" (positive) and "0" (negative),
# modifying the frame in place.
df.replace({"positive": "1", "negative": "0"}, inplace=True)

In [None]:
# Work on the first 5000 rows only.
# .copy() makes the subset an independent frame, so the later
# `df['review'] = ...` assignments do not write into a slice of the full data.
df = df.iloc[:5000, :].copy()

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Count how many rows carry each sentiment label (class balance check).
df['sentiment'].value_counts()

In [None]:
# (rows, columns) of the frame.
df.shape

## Text Preprocessing :


In [None]:
#from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.toktok import ToktokTokenizer

In [None]:
#from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
import nltk
# Download the NLTK resources used below: the stopword lists and WordNet
# (WordNet backs the WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Shared tokenizer instance; cleanText uses it to split a review into tokens.
tokenizer = ToktokTokenizer()

In [None]:
# English stopwords held in a set so membership tests in cleanText are cheap.
en_stopwords = set(stopwords.words('english'))

In [None]:
# Lemmatizer used by cleanText. Despite the name `ps`, this is a
# WordNetLemmatizer, not a Porter stemmer.
ps = WordNetLemmatizer()

In [None]:
# Strip HTML markup from a piece of text
def strip_html(text):
    """Parse *text* with the "html.parser" backend and return only its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Return *text* with every ``[...]`` span (brackets included) removed.

    A span runs from a ``[`` to the next ``]``; unmatched brackets are left
    untouched.
    """
    # Raw string: "\[" and "\]" are regex escapes, not string escapes, and a
    # non-raw literal triggers an invalid-escape warning on recent Pythons.
    return re.sub(r'\[[^]]*\]', '', text)

# Remove noise (HTML markup, then bracketed spans) from a piece of text
def denoise_text(text):
    """Apply strip_html followed by remove_between_square_brackets to *text*."""
    return remove_between_square_brackets(strip_html(text))

#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Return *text* keeping only ASCII letters, whitespace and optionally digits.

    Args:
        text: Input string.
        remove_digits: When true (the default) digits are removed as well;
            when false the digits 0-9 are kept.
    """
    # The character range must be A-Z: "A-z" also spans the ASCII characters
    # between "Z" and "a" ([ \ ] ^ _ `), which would then survive the filter.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Clean the review column in a single pass: denoise_text strips the HTML and
# the bracketed spans, then remove_special_characters drops what is left of
# the punctuation. Running the HTML/bracket steps again afterwards would find
# nothing to remove, so each step is applied exactly once.
df['review'] = df['review'].apply(denoise_text).apply(remove_special_characters)



In [None]:
def cleanText(text):
    """Normalize one review into a space-separated string of lemmas.

    The text is lower-cased, tokenized with the module-level ``tokenizer``,
    filtered against ``en_stopwords`` and lemmatized with ``ps``.

    Args:
        text: Raw review string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Tokenize the lower-cased text and trim surrounding whitespace.
    stripped = (token.strip() for token in tokenizer.tokenize(text.lower()))

    # Drop stopwords, and also tokens that are empty after stripping so the
    # joined result never contains doubled spaces.
    kept = [token for token in stripped if token and token not in en_stopwords]

    # Lemmatize (`ps` is a WordNetLemmatizer, so this is not stemming).
    return " ".join(ps.lemmatize(token) for token in kept)

In [None]:
# Normalize every review (lower-case, tokenize, drop stopwords, lemmatize).
df['review'] = df['review'].map(cleanText)

In [None]:
# Inspect the cleaned review column.
df['review']

# Vectorization

In [None]:
# Denoted as bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Both vectorizers build 1- to 3-gram features.
# max_df must be the float 1.0 (a proportion: keep terms present in up to
# 100% of the documents). The int 1 is an absolute count and would discard
# every term that occurs in more than one document.
# min_df=1 keeps every term seen in at least one document; the int 0 is not
# an accepted value in current scikit-learn releases.

#Count Vectorizer
cv = CountVectorizer(min_df=1, max_df=1.0, binary=False, ngram_range=(1, 3))

#Tfidf vectorizer
tv = TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1, 3))

In [None]:
# Target: every column from position 1 onward, kept as a DataFrame.
# NOTE(review): assumes the frame is laid out as (review, sentiment) — confirm.
y = df.iloc[:,1:]

In [None]:
# Features: every column except the last one, kept as a DataFrame.
# NOTE(review): assumes the frame is laid out as (review, sentiment) — confirm.
x = df.iloc[:,:-1]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x.values,
    y.values,
    test_size=0.40,
    random_state=5,
)

In [None]:
# Learn each vocabulary from the training reviews only and vectorize them.
# .ravel() flattens x_train to the 1-D sequence of strings the vectorizers expect.

#Count Vectorizer
cv_x_train = cv.fit_transform(x_train.ravel())

#transformed train reviews
tv_x_train =tv.fit_transform(x_train.ravel())


In [None]:
# Vectorize the test reviews with the vocabularies already fitted on the
# training set (transform only, no re-fitting).

#Count Vectorizer
cv_x_test = cv.transform(x_test.ravel())

#transformed test reviews
tv_x_test=tv.transform(x_test.ravel())


In [None]:
# Report (n_documents, n_features) of the TF-IDF train and test matrices.
print('Tfidf_train:',tv_x_train.shape)
print('Tfidf_test:',tv_x_test.shape)

In [None]:
from sklearn.preprocessing import LabelBinarizer

#labeling the sentiment data
lb=LabelBinarizer()

#transformed sentiment data
# Fit the label mapping on the training labels only and reuse it for the test
# labels, so both splits are guaranteed to share the same encoding.
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [None]:
#y_train = y_train.values.ravel() # To convert y in 1-D Vector
#y_test = y_test.values.ravel()

In [None]:
# Show the n-gram vocabulary learned by the count vectorizer.
print(cv.get_feature_names_out())

# Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial Naive Bayes estimator with default hyper-parameters.
nb = MultinomialNB()

In [None]:
from sklearn.base import clone

# Train the bag-of-words model on its own unfitted copy of `nb`.
# `fit` returns the estimator itself, so fitting `nb` directly would make
# cv_nb an alias of `nb`, and the later TF-IDF fit of `nb` would silently
# overwrite this model.
cv_nb = clone(nb).fit(cv_x_train, y_train)

In [None]:
# Train on the TF-IDF features. `fit` returns the estimator itself, so
# tv_nb and nb refer to the same fitted object after this line.
tv_nb = nb.fit(tv_x_train,y_train)

In [None]:
# Predicted test labels from the count-vector features.
cv_y_pred = cv_nb.predict(cv_x_test)

In [None]:
# Predicted test labels from the TF-IDF features.
tv_y_pred = tv_nb.predict(tv_x_test)

In [None]:
# Calculate the cost: negative sum of the log predicted probabilities on the
# test set, with each model scored on the feature matrix it was trained on
# (count vectors for cv_nb, TF-IDF for tv_nb).
# NOTE(review): this sums over every class column of predict_proba, not only
# the probability of the true class — confirm this is the intended cost.
cv_cost = -np.sum(np.log(cv_nb.predict_proba(cv_x_test)))
tv_cost = -np.sum(np.log(tv_nb.predict_proba(tv_x_test)))

In [None]:
# Display both costs side by side (count vectors, TF-IDF).
cv_cost, tv_cost

### Check Accuracy for Multinomial Naive Bayes

In [None]:
# Fraction of test reviews classified correctly by each Naive Bayes model.

#Accuracy score for bag of words
cv_nb_score = accuracy_score(y_test, cv_y_pred)
print("CountVectorization score for MNB :",cv_nb_score)

#Accuracy score for tfidf features
tv_nb_score = accuracy_score(y_test, tv_y_pred)
print("TFID Vectorization score for MNB :",tv_nb_score)

### Testing Model by real time data

In [None]:
# Hand-written sample reviews used to try the trained models on new text.
check = ['This movie was really bad', 'Wow what a movie', 'I think I prefer watching some other movie']

In [None]:
# Run the sample reviews through the same cleaning used for the training data.
new_check = [cleanText(sample) for sample in check]

In [None]:
# Vectorize the cleaned samples with the vocabularies fitted on the training set.
cv_check_test = cv.transform(new_check)
tv_check_test = tv.transform(new_check)

In [None]:
# Predict each feature set with the model trained on that representation:
# count vectors go to cv_nb, TF-IDF vectors go to tv_nb.
cv_new_y_pred = cv_nb.predict(cv_check_test)
tv_new_y_pred = tv_nb.predict(tv_check_test)

In [None]:
# Display the predictions for the sample reviews (count vectors, TF-IDF).
cv_new_y_pred, tv_new_y_pred

### Classification report for mnb -

In [None]:
#Classification report for bag of words
# target_names are matched to the labels in sorted order (0, then 1).
# Label 0 encodes "negative" and label 1 "positive", so Negative comes first.
mnb_cv_report = classification_report(y_test, cv_y_pred, target_names=['Negative', 'Positive'])
print(mnb_cv_report)


In [None]:
#Classification report for tfidf features
# target_names are matched to the labels in sorted order (0, then 1).
# Label 0 encodes "negative" and label 1 "positive", so Negative comes first.
mnb_tv_report = classification_report(y_test, tv_y_pred, target_names=['Negative', 'Positive'])
print(mnb_tv_report)

### Confusion matrix for mnb

In [None]:
#confusion matrix for bag of words
# labels=[1,0] orders the rows (true) and columns (predicted) with class 1
# first, then class 0.
cm_cv = confusion_matrix(y_test, cv_y_pred,labels=[1,0])
print(cm_cv)

In [None]:
#confusion matrix for tfidf features
# labels=[1,0] orders the rows (true) and columns (predicted) with class 1
# first, then class 0.
cm_tfidf = confusion_matrix(y_test, tv_y_pred,labels=[1,0])
print(cm_tfidf)

# WordCloud for mnb

#### Word cloud for positive review words

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Show the text of the training review at row 1.
x_train[1][0]

In [None]:
#word cloud for positive review words
# NOTE(review): x_train[4][0] is a single training review that is presumed to
# be positive — confirm against y_train[4].
plt.figure(figsize=(10,10))
positive_text=x_train[4][0]
WC=WordCloud(width=1000,height=500,max_words=500,min_font_size=5)
positive_words=WC.generate(positive_text)
plt.imshow(positive_words,interpolation='bilinear')
# Call show(); a bare `plt.show` only references the function and renders nothing.
plt.show()

### Word cloud for negative review words

In [None]:
#Word cloud for negative review words
# NOTE(review): x_train[2][0] is a single training review that is presumed to
# be negative — confirm against y_train[2].
plt.figure(figsize=(10,10))
negative_text=x_train[2][0]
WC=WordCloud(width=1000,height=500,max_words=500,min_font_size=5)
negative_words=WC.generate(negative_text)
plt.imshow(negative_words,interpolation='bilinear')
# Call show(); a bare `plt.show` only references the function and renders nothing.
plt.show()

# Support vector Classifier

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with an RBF kernel (a classifier, despite the
# variable name `regressor`).
regressor = SVC(kernel = 'rbf')

In [None]:
from sklearn.base import clone

# Train the bag-of-words SVC on its own unfitted copy of `regressor`.
# `fit` returns the estimator itself, so fitting `regressor` directly would
# make cv_svr an alias that the later TF-IDF fit silently overwrites.
cv_svr = clone(regressor).fit(cv_x_train, y_train)

In [None]:
# Train on the TF-IDF features. `fit` returns the estimator itself, so
# tv_svr and regressor refer to the same fitted object after this line.
tv_svr = regressor.fit(tv_x_train,y_train)

In [None]:
# Predicted test labels for the count-vector and the TF-IDF features.
svr_cv_y_pred = cv_svr.predict(cv_x_test)
svr_tv_y_pred = tv_svr.predict(tv_x_test)

In [None]:
# Inspect the predictions made from the count-vector features.
svr_cv_y_pred

### Check Accuracy for Support Vector Classifier

In [None]:
# Fraction of test reviews classified correctly by each SVC model.
# The printed labels name the SVC, the model actually being scored here.

#Accuracy score for bag of words
cv_svr_score = accuracy_score(y_test, svr_cv_y_pred)
print("CountVectorization score for SVC :",cv_svr_score)

#Accuracy score for tfidf features
tv_svr_score = accuracy_score(y_test, svr_tv_y_pred)
print("TFID Vectorization score for SVC :",tv_svr_score)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier

In [None]:
# Configure the logistic-regression estimator (it is not trained on this line):
# L2 penalty, C=1, up to 500 solver iterations, fixed random_state.
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

In [None]:
from sklearn.base import clone

#Fitting the model for Bag of words
# Fit an unfitted copy of `lr`: `fit` returns the estimator itself, so fitting
# `lr` twice would leave lr_bow and lr_tfidf pointing at the same object,
# trained on TF-IDF only.
lr_bow = clone(lr).fit(cv_x_train, y_train)
print(lr_bow)

#Fitting the model for tfidf features
lr_tfidf = lr.fit(tv_x_train, y_train)
print(lr_tfidf)

In [None]:
# Predicted test labels for the count-vector and the TF-IDF features.
lr_cv_y_pred = lr_bow.predict(cv_x_test)
lr_tv_y_pred = lr_tfidf.predict(tv_x_test)

In [None]:
# Inspect the predictions made from the TF-IDF features.
lr_tv_y_pred

### Check Accuracy for Logistic Regression


In [None]:
# Fraction of test reviews classified correctly by each logistic-regression
# model. The printed labels name the model actually being scored here.

#Accuracy score for bag of words
cv_lr_score = accuracy_score(y_test, lr_cv_y_pred)
print("CountVectorization score for Logistic Regression :",cv_lr_score)

#Accuracy score for tfidf features
tv_lr_score = accuracy_score(y_test, lr_tv_y_pred)
print("TFID Vectorization score for Logistic Regression :",tv_lr_score)

We can observe that both the logistic regression and the multinomial naive Bayes models perform well compared to the support vector classifier.