In [99]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
# import spacy
import re
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC
# from textblob import TextBlob
# from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split # Import the necessary function


In [100]:
imdb_data = pd.read_csv('IMDB Dataset.csv')

# Original row count
original_row_count = len(imdb_data)

# Filter out rows with irrelevant or neutral sentiments
filtered_data = imdb_data[~imdb_data["sentiment"].isin(["Irrelevant", "Neutral"])]

# Filtered row count
filtered_row_count = len(filtered_data)

# Calculate and display the number of rows removed
rows_removed = original_row_count - filtered_row_count
print(f"Rows removed: {rows_removed}")

# Assign sentiments and reviews from the filtered data
sentiments = filtered_data["sentiment"]
reviews = filtered_data["review"]

Rows removed: 31308


In [101]:
# Download NLTK resources
nltk.download('stopwords')
stop = set(stopwords.words('english'))
ps = PorterStemmer()
stopword_list = nltk.corpus.stopwords.words('english')
tokenizer = ToktokTokenizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Define functions
def remove_html(text):
    """Remove HTML tags from the text."""
    if not isinstance(text, str):
        return ""
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_brackets(text):
    """Remove content inside brackets from the text."""
    return re.sub(r'\[[^]]*\]', '', text)

def remove_special_characters(text):
    """Remove special characters and punctuation."""
    if not isinstance(text, str):
        return ""
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def tokenize_text(text):
    """Tokenize the text into words."""
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

def remove_stopwords(text, is_lower_case=False):
    """Remove stopwords from the text."""
    tokens = tokenize_text(text)
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    return ' '.join(filtered_tokens)

def stem_text(text):
    """Apply stemming to the text."""
    tokens = tokenize_text(text)
    return ' '.join([ps.stem(word) for word in tokens])

# Preprocess reviews
def preprocess_review(text):
    """Apply all preprocessing steps to the text."""
    text = remove_html(text)
    text = remove_brackets(text)
    text = remove_special_characters(text)
    text = remove_stopwords(text)
    text = stem_text(text)
    return text

# Apply preprocessing to the reviews
reviews = reviews.fillna("")  # Replace NaN values with an empty string
reviews = reviews.apply(preprocess_review)

  soup = BeautifulSoup(text, 'html.parser')


In [None]:
# Split data into training and testing sets
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(
    reviews, sentiments, test_size=0.1, random_state=42, shuffle=True
)

In [97]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0.0,max_df=1.0,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(test_reviews)

In [98]:
print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (84036, 7908551)
BOW_cv_test: (9338, 7908551)


In [None]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0.0,max_df=1.0,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (74699, 7114771)
Tfidf_test: (18675, 7114771)


In [77]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(sentiments)
print(sentiment_data.shape)

(93374, 1)


In [88]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments)
print(lr_tfidf)

KeyboardInterrupt: 

In [None]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

['negative' 'positive' 'positive' ... 'positive' 'positive' 'negative']
['negative' 'positive' 'positive' ... 'positive' 'positive' 'positive']


In [80]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.8620615796519411
lr_tfidf_score : 0.8957965194109773


In [None]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)



              precision    recall  f1-score   support

    Positive       0.87      0.86      0.86      9529
    Negative       0.85      0.87      0.86      9146

    accuracy                           0.86     18675
   macro avg       0.86      0.86      0.86     18675
weighted avg       0.86      0.86      0.86     18675

              precision    recall  f1-score   support

    Positive       0.89      0.91      0.90      9529
    Negative       0.90      0.88      0.89      9146

    accuracy                           0.90     18675
   macro avg       0.90      0.90      0.90     18675
weighted avg       0.90      0.90      0.90     18675



In [None]:
#training the linear svm
svm=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_reviews,train_sentiments)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments)
print(svm_tfidf)

SGDClassifier(max_iter=500, random_state=42)
SGDClassifier(max_iter=500, random_state=42)


In [None]:
import pickle

# Save the Logistic Regression model trained on Bag of Words
with open('SVM_bow.pkl', 'wb') as file:
    pickle.dump(lr_bow, file)

# Save the Logistic Regression model trained on TF-IDF features
with open('SVM_tfidf.pkl', 'wb') as file:
    pickle.dump(lr_tfidf, file)

# Save the CountVectorizer used for Bag of Words
with open('count_vectorizer.pkl', 'wb') as file:
    pickle.dump(cv, file)

# Save the TfidfVectorizer used for TF-IDF features
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tv, file)

# Save the LabelBinarizer used for sentiment labels
with open('label_binarizer.pkl', 'wb') as file:
    pickle.dump(lb, file)

print("\nModels and vectorizers have been saved successfully.")



Models and vectorizers have been saved successfully.


In [None]:
# Load models and vectorizers
with open('SVM_bow.pkl', 'rb') as file:
    lr_bow = pickle.load(file)

with open('SVM_tfidf.pkl', 'rb') as file:
    lr_tfidf = pickle.load(file)

with open('count_vectorizer.pkl', 'rb') as file:
    cv = pickle.load(file)

with open('tfidf_vectorizer.pkl', 'rb') as file:
    tv = pickle.load(file)

with open('label_binarizer.pkl', 'rb') as file:
    lb = pickle.load(file)

# # Example usage
def preprocess_input(text):
    # Apply the same preprocessing steps
    text = remove_html(text)
    text = remove_brackets(text)
    text = remove_special_characters(text)
    text = remove_stopwords(text)
    text = stem_text(text)
    return text

# Sample input
sample_review = "This movie was NOT GOOD!"
preprocessed_review = preprocess_input(sample_review)

# Transform input using vectorizers
bow_features = cv.transform([preprocessed_review])
tfidf_features = tv.transform([preprocessed_review])

# Predict sentiment using models
bow_prediction = lr_bow.predict(bow_features)
tfidf_prediction = lr_tfidf.predict(tfidf_features)

print("Prediction (BOW):", bow_prediction)
print("Prediction (TF-IDF):", tfidf_prediction)


Prediction (BOW): ['positive']
Prediction (TF-IDF): ['positive']
