In [4]:
# Load Libraries and Install Missing Dependencies
!pip install pandas scikit-learn nltk

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import nltk

nltk.download('stopwords')


Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/somilvishwakarma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load and Explore the Dataset
# NOTE: machine-specific absolute path -- point DATA_PATH at your own copy of the CSV.
DATA_PATH = '/home/somilvishwakarma/Desktop/College/Sem 1/Foundation of DS/Project/IMDB Dataset.csv'
df = pd.read_csv(DATA_PATH)

# Display dataset info
# df.info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the summary.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
# Define Processing Functions
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text(text, stop_words=None):
    """Normalise a raw review into a space-separated string of lowercase tokens.

    Steps: strip HTML tags, replace every non-alphabetic character with a
    space, lowercase, split on whitespace and drop stop words.

    Parameters
    ----------
    text : str
        Raw review text; may contain HTML markup such as ``<br />``.
    stop_words : collection of str, optional
        Lowercase words to discard. When omitted, NLTK's English stop-word
        list is used (loaded once and reused across calls).

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if none remain).
    """
    if stop_words is None:
        # Loading the corpus on every call is expensive when this function is
        # applied to tens of thousands of rows, so reuse a cached set.
        stop_words = _english_stop_words()

    # Replace HTML tags with a space (not ''), otherwise the words on either
    # side of a tag such as "<br />" would be glued into a single token.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase and split
    tokens = text.lower().split()
    # Remove stopwords
    return ' '.join(word for word in tokens if word not in stop_words)


In [7]:
# Run every raw review through clean_text and keep the result next to the original
df['cleaned_review'] = df['review'].map(clean_text)
df.head(5)


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


In [8]:
# Split the dataset
X = df['cleaned_review']
y = df['sentiment'].map({'positive': 1, 'negative': 0})  # Encode labels

# Series.map leaves NaN for any value that is not a key of the mapping, so a
# missing or misspelt label would otherwise slip through silently and only
# surface later as an obscure error during model fitting. Fail fast instead.
if y.isna().any():
    raise ValueError(
        "Missing or unexpected sentiment labels: "
        f"{df.loc[y.isna(), 'sentiment'].unique().tolist()}"
    )

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")


Train size: 40000, Test size: 10000


In [9]:
# Text Vectorisation
# The vectorizer (capped at 5000 features) is fitted on the training reviews
# only; the test reviews are then transformed with that already-fitted state.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

print(f"Train shape: {X_train_tfidf.shape}, Test shape: {X_test_tfidf.shape}")


Train shape: (40000, 5000), Test shape: (10000, 5000)


In [10]:
# Model Training
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier on the TF-IDF training matrix
# (fit() returns the estimator itself, so it can be bound directly)
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out test split
y_pred = model.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


Accuracy: 0.8915
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [11]:
# Save the fitted preprocessing (vectorizer) and model, then load them back
import pickle

# Persist both fitted objects to the current working directory
with open('vectorizer.pkl', 'wb') as vectorizer_out:
    pickle.dump(vectorizer, vectorizer_out)
with open('model.pkl', 'wb') as model_out:
    pickle.dump(model, model_out)

# Restore them from disk under separate names
with open('vectorizer.pkl', 'rb') as vectorizer_in:
    loaded_vectorizer = pickle.load(vectorizer_in)
with open('model.pkl', 'rb') as model_in:
    loaded_model = pickle.load(model_in)


In [15]:
# Final Testing
def predict_sentiment(review, vectorizer, model):
    """Classify a single raw review as 'Positive' or 'Negative'.

    The review is cleaned with clean_text, turned into features with
    ``vectorizer.transform`` and passed to ``model.predict``. A predicted
    label of 1 is reported as 'Positive'; anything else as 'Negative'.
    """
    features = vectorizer.transform([clean_text(review)])
    label = model.predict(features)[0]
    if label == 1:
        return 'Positive'
    return 'Negative'

# Smoke-test the full pipeline on two hand-written reviews
for sample_review in ("The movie was bad!", "It was a horrible experience."):
    print(predict_sentiment(sample_review, vectorizer, model))


Negative
Negative
