<a href="https://colab.research.google.com/github/roshi45/Movie_review_SentimentAnalysis/blob/main/Movie_Review1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [None]:
# Fetch the NLTK corpora needed below: the stopword lists and WordNet
# (used by WordNetLemmatizer).
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Module-level helpers shared by preprocess_text: the English stopwords as a
# set (for fast membership tests) and a single reusable lemmatizer instance.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

In [None]:
# Text preprocessing function
def preprocess_text(text):
    """Normalize a raw review string before TF-IDF vectorization.

    The text is lowercased, HTML tags are stripped, punctuation and digits are
    removed, English stopwords are dropped, and the remaining tokens are
    lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; may be an empty string.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags. Substitute a space rather than nothing so that the
    # words on either side of a tag ("great.<br /><br />The") are not fused
    # into a single bogus token ("greatthe").
    text = re.sub(r'<.*?>', ' ', text)
    # Remove punctuation and digits. re.escape is required because
    # string.punctuation contains '\', ']', '^' and '-', which are special
    # inside a character class; unescaped, the backslash was consumed as an
    # escape for ']' and never removed from the text.
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize on whitespace, drop stopwords, lemmatize what remains
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    # Rejoin the processed words
    return ' '.join(words)

In [None]:
# Load the IMDB reviews CSV. The path points into a Colab Google Drive mount,
# so Drive has to be mounted before this cell runs.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/BDA/Movie Review/IMDB Dataset.csv"
)

In [None]:
# Clean every review in place (overwrites the raw text in df['review']).
df['review'] = df['review'].map(preprocess_text)

In [None]:
# Binary target: 1 for 'positive', 0 for every other sentiment value.
df['label'] = (df['sentiment'] == 'positive').astype('int64')

In [None]:
# Features are the cleaned review text; the target is the 0/1 label.
X, y = df['review'], df['label']

In [None]:
# Hold out 20% of the reviews for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Turn the review text into TF-IDF features, capped at 5000 terms.
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)

**SVM**

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [None]:
# Linear SVM on the TF-IDF features. random_state is pinned to the same seed
# used for the train/test split and the random forest so that the reported
# accuracy is reproducible across runs (LinearSVC's solver is randomized).
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)

In [None]:
# Predict labels for the held-out reviews with the fitted SVM.
y_pred_svm = svm_model.predict(X=X_test_tfidf)

In [None]:
# Fraction of held-out reviews the SVM labelled correctly.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f"SVM Model Accuracy: {accuracy_svm}")

SVM Model Accuracy: 0.8794


**RANDOM FOREST**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Random forest of 100 trees on the TF-IDF features, seeded for repeatability.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict labels for the held-out reviews with the fitted random forest.
y_pred_rf = rf_model.predict(X=X_test_tfidf)

In [None]:
# Fraction of held-out reviews the random forest labelled correctly.
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf}")

Random Forest Model Accuracy: 0.8482


**LOGISTIC REGRESSION**

In [None]:
# Logistic regression with default settings; this is the model that is
# later used for the example prediction and saved to disk.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict labels for the held-out reviews with the logistic regression model.
y_pred = model.predict(X=X_test_tfidf)


In [None]:
# Fraction of held-out reviews the logistic regression labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8849


In [None]:
example_review = ["This movie was amazing!"]
# Apply the same cleaning that was applied to the training reviews before
# vectorizing; otherwise the vectorizer receives un-lemmatized, differently
# tokenized text that does not match the vocabulary it was fitted on.
example_clean = [preprocess_text(review) for review in example_review]
example_tfidf = tfidf.transform(example_clean)
prediction = model.predict(example_tfidf)
print(f"Predicted Sentiment: {'positive' if prediction[0] == 1 else 'negative'}")


Predicted Sentiment: positive


In [None]:
# Persist the classifier together with the fitted vectorizer: both are
# needed to score new text later.
joblib.dump(value=model, filename='logistic_regression_model.pkl')
joblib.dump(value=tfidf, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']