# S E N T I M E N T   A N A L Y S I S      O N          M O V I E   R E V I E W S
# --------------------------------------------------------------------

## This script implements the full workflow for a sentiment analysis project,
## now upgraded to use a real dataset and interactive user input.


### ***--- Core Libraries ---***

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

### Suppress warnings for cleaner output

In [2]:
warnings.filterwarnings('ignore')

### ***--- 1. Data Loading & Preparation ---***

In [3]:
print("--- Data Loading & Preparation ---")
try:
    # Load the dataset. Make sure 'IMDB Dataset.csv' is in the same directory.
    df = pd.read_csv('IMDB Dataset.csv')
    print("IMDB Dataset loaded successfully.")
    # For performance, we'll work with a smaller sample of the data.
    # Remove this line to train on the full 50k reviews (will take longer).
    df = df.sample(n=10000, random_state=42)
    print(f"Using a sample of {len(df)} reviews for training.")

except FileNotFoundError:
    print("Error: 'IMDB Dataset.csv' not found.")
    print("Please download the dataset and place it in the same directory as this script.")
    exit()

print("Dataset Head:")
print(df.head())
print("\n" + "="*50 + "\n")

--- Data Loading & Preparation ---
IMDB Dataset loaded successfully.
Using a sample of 10000 reviews for training.
Dataset Head:
                                                  review sentiment
33553  I really liked this Summerslam due to the look...  positive
9427   Not many television shows appeal to quite as m...  positive
199    The film quickly gets to a major chase scene w...  negative
12447  Jane Austen would definitely approve of this o...  positive
39489  Expectations were somewhat high for me when I ...  negative




### ***--- 2. Data Cleaning (Preprocessing) ---***

In [4]:
print("--- Applying Preprocessing ---")

def preprocess_text(text):
    """
    Cleans and preprocesses a single piece of text.
    - Removes HTML tags
    - Removes noise (punctuation, special characters)
    - Normalizes text (converts to lowercase)
    """
    # Remove HTML tags
    text = re.sub(r'<br\s*/?>', ' ', text) # Specifically handle <br> tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the cleaning function to the 'review' column
df['cleaned_review'] = df['review'].apply(preprocess_text)
print("Preprocessing complete. 'cleaned_review' column added.")
print(df[['cleaned_review', 'sentiment']].head())
print("\n" + "="*50 + "\n")

--- Applying Preprocessing ---
Preprocessing complete. 'cleaned_review' column added.
                                          cleaned_review sentiment
33553  i really liked this summerslam due to the look...  positive
9427   not many television shows appeal to quite as m...  positive
199    the film quickly gets to a major chase scene w...  negative
12447  jane austen would definitely approve of this o...  positive
39489  expectations were somewhat high for me when i ...  negative




### ***--- 3. Feature Extraction (TF-IDF) & Data Splitting ---***


In [5]:
print("--- Feature Extraction & Data Splitting ---")

# Define features (X) and target (y)
X = df['cleaned_review']
y = df['sentiment']

# Initialize the TF-IDF Vectorizer
# - `stop_words='english'`: Removes common English words.
# - `max_features=5000`: Considers the top 5000 words to balance performance and accuracy.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the vectorizer on the training data and transform it
print("Fitting TF-IDF Vectorizer and transforming training data...")
X_train_tfidf = vectorizer.fit_transform(X_train)

# Only transform the test data using the already-fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)
print(f"Data split into {X_train_tfidf.shape[0]} training samples and {X_test_tfidf.shape[0]} testing samples.")
print("\n" + "="*50 + "\n")

--- Feature Extraction & Data Splitting ---
Fitting TF-IDF Vectorizer and transforming training data...
Data split into 8000 training samples and 2000 testing samples.




### ***--- 4. Model Training ---***


In [6]:
print("--- Model Training ---")

# We will use Logistic Regression as it's a strong and efficient baseline.
print("Training Logistic Regression model...")
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_tfidf, y_train)
print("Model training complete.")
print("\n" + "="*50 + "\n")

--- Model Training ---
Training Logistic Regression model...
Model training complete.




### ***--- 5. Model Evaluation ---***


In [7]:
print("--- Model Evaluation ---")
print("Evaluating model on the test set...")
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("\n" + "="*50 + "\n")

--- Model Evaluation ---
Evaluating model on the test set...
Accuracy: 0.864

Classification Report:
               precision    recall  f1-score   support

    negative       0.88      0.84      0.86       992
    positive       0.85      0.89      0.87      1008

    accuracy                           0.86      2000
   macro avg       0.87      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000





### ***--- 6. Interactive Sentiment Analysis ---***


In [None]:
print("--- Interactive Sentiment Analyzer ---")

def analyze_user_review():
    """
    Takes user input for a movie and review, and predicts the sentiment.
    """
    movie_name = input("Enter the movie name you want to review: ")
    user_review = input(f"Enter your review for '{movie_name}': ")

    if not user_review.strip():
        print("\nReview is empty. Please enter some text.")
        return

    # 1. Preprocess the user's review
    cleaned_review = preprocess_text(user_review)

    # 2. Vectorize the cleaned review using the same fitted vectorizer
    review_tfidf = vectorizer.transform([cleaned_review])

    # 3. Predict the sentiment using the trained model
    prediction = model.predict(review_tfidf)
    prediction_proba = model.predict_proba(review_tfidf)

    # 4. Display the result
    sentiment = prediction[0]
    confidence = prediction_proba[0].max() * 100

    print("\n" + "-"*20)
    print("  A N A L Y S I S   R E S U L T")
    print("-"*20)
    print(f"Movie: {movie_name}")
    print(f"Predicted Sentiment: {sentiment.upper()}")
    print(f"Confidence: {confidence:.2f}%")
    print("-"*20)

# Start the interactive loop
while True:
    analyze_user_review()
    another = input("\nDo you want to analyze another review? (yes/no): ").lower()
    if another != 'yes':
        print("Thank you for using the Sentiment Analyzer!")
        break
    print("\n")

--- Interactive Sentiment Analyzer ---


Enter the movie name you want to review:  Uggram
Enter your review for 'Uggram':  Gangster look film



--------------------
  A N A L Y S I S   R E S U L T
--------------------
Movie: Uggram
Predicted Sentiment: POSITIVE
Confidence: 58.58%
--------------------



Do you want to analyze another review? (yes/no):  yes




