In [None]:
%pip install wordcloud

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from wordcloud import WordCloud
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import pickle
import re

# Task
Create a Python program that takes a product review as input, preprocesses it using the previously saved `countVectorizer.pkl` and `scaler.pkl`, and predicts whether the review is positive or negative using the saved `model_xgb.pkl`.

## Load the saved models and vectorizer

### Subtask:
Load the saved `countVectorizer.pkl`, `scaler.pkl`, and `model_xgb.pkl` files from disk into Python objects.


**Reasoning**:
Load the saved CountVectorizer, scaler, and XGBoost model using pickle.



In [None]:
# Install required packages
!pip install flask-ngrok pyngrok -q

import re
import pickle
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from google.colab import files
import nltk

# Download NLTK stopwords
nltk.download('stopwords', quiet=True)

STOPWORDS = set(stopwords.words("english"))

# Step 1: Upload your model files
print("=" * 60)
print("UPLOAD YOUR MODEL FILES")
print("=" * 60)

print("\n1. Upload model_xgb.pkl:")
uploaded = files.upload()
model_filename = list(uploaded.keys())[0]
with open(model_filename, 'rb') as f:
    predictor = pickle.load(f)
print("✓ Model loaded!")

print("\n2. Upload scaler.pkl:")
uploaded = files.upload()
scaler_filename = list(uploaded.keys())[0]
with open(scaler_filename, 'rb') as f:
    scaler = pickle.load(f)
print("✓ Scaler loaded!")

print("\n3. Upload countVectorizer.pkl:")
uploaded = files.upload()
cv_filename = list(uploaded.keys())[0]
with open(cv_filename, 'rb') as f:
    cv = pickle.load(f)
print("✓ CountVectorizer loaded!")

print("\n" + "=" * 60)
print("ALL MODELS LOADED SUCCESSFULLY!")
print("=" * 60)


# Step 2: Define prediction function
def single_prediction(predictor, scaler, cv, text_input):
    """
    Predicts sentiment for a single text input
    """
    corpus = []
    stemmer = PorterStemmer()

    # Preprocess the text
    review = re.sub("[^a-zA-Z]", " ", text_input)
    review = review.lower().split()
    review = [stemmer.stem(word) for word in review if word not in STOPWORDS]
    review = " ".join(review)
    corpus.append(review)

    # Transform and predict
    X_prediction = cv.transform(corpus).toarray()
    X_prediction_scl = scaler.transform(X_prediction)
    y_predictions = predictor.predict_proba(X_prediction_scl)
    y_predictions = y_predictions.argmax(axis=1)[0]

    # Get probabilities
    proba = predictor.predict_proba(X_prediction_scl)[0]

    sentiment = "Positive" if y_predictions == 1 else "Negative"
    confidence = max(proba) * 100

    return sentiment, confidence, proba


# Step 3: Interactive prediction loop
print("\n" + "=" * 60)
print("PRODUCT REVIEW SENTIMENT ANALYZER")
print("=" * 60)
print("\nType your product review and press Enter to analyze.")
print("Type 'quit' or 'exit' to stop.")
print("Type 'bulk' to analyze a CSV file.\n")

while True:
    review = input("Enter your review: ").strip()

    if review.lower() in ['quit', 'exit', 'q']:
        print("\n✓ Thank you for using the sentiment analyzer!")
        break

    if review.lower() == 'bulk':
        print("\nUpload your CSV file (must have a 'Sentence' column):")
        uploaded = files.upload()
        csv_filename = list(uploaded.keys())[0]

        try:
            data = pd.read_csv(csv_filename)

            if 'Sentence' not in data.columns:
                print("✗ Error: CSV must have a 'Sentence' column")
                continue

            print(f"\n📊 Processing {len(data)} reviews...")

            corpus = []
            stemmer = PorterStemmer()

            for i in range(data.shape[0]):
                review_text = re.sub("[^a-zA-Z]", " ", str(data.iloc[i]["Sentence"]))
                review_text = review_text.lower().split()
                review_text = [stemmer.stem(word) for word in review_text if word not in STOPWORDS]
                review_text = " ".join(review_text)
                corpus.append(review_text)

            X_prediction = cv.transform(corpus).toarray()
            X_prediction_scl = scaler.transform(X_prediction)
            y_predictions = predictor.predict_proba(X_prediction_scl)
            y_predictions = y_predictions.argmax(axis=1)

            # Map predictions
            data["Predicted sentiment"] = ["Positive" if pred == 1 else "Negative" for pred in y_predictions]

            # Save results
            output_filename = "predictions_output.csv"
            data.to_csv(output_filename, index=False)

            # Show summary
            sentiment_counts = data["Predicted sentiment"].value_counts()
            print("\n✓ Prediction complete!")
            print(f"\nSummary:")
            for sentiment, count in sentiment_counts.items():
                percentage = (count / len(data)) * 100
                print(f"  {sentiment}: {count} ({percentage:.1f}%)")

            print(f"\n📥 Downloading results as '{output_filename}'...")
            files.download(output_filename)

            print("\nFirst 5 predictions:")
            print(data[["Sentence", "Predicted sentiment"]].head().to_string(index=False))
            print()

        except Exception as e:
            print(f"\n✗ Error processing file: {e}\n")

        continue

    if not review:
        print("⚠ Please enter a valid review.\n")
        continue

    # Get prediction
    try:
        sentiment, confidence, proba = single_prediction(predictor, scaler, cv, review)

        print(f"\n{'='*40}")
        print(f"✓ Sentiment: {sentiment}")
        print(f"  Confidence: {confidence:.2f}%")
        print(f"  [Negative: {proba[0]*100:.1f}% | Positive: {proba[1]*100:.1f}%]")
        print(f"{'='*40}\n")

    except Exception as e:
        print(f"\n✗ Error: {e}")
        print("Please check your input and try again.\n")

UPLOAD YOUR MODEL FILES

1. Upload model_xgb.pkl:


Saving model_xgb.pkl to model_xgb (4).pkl
✓ Model loaded!

2. Upload scaler.pkl:


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

  predictor = pickle.load(f)


Saving scaler.pkl to scaler (2).pkl
✓ Scaler loaded!

3. Upload countVectorizer.pkl:


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Saving countVectorizer.pkl to countVectorizer (4).pkl
✓ CountVectorizer loaded!

ALL MODELS LOADED SUCCESSFULLY!

PRODUCT REVIEW SENTIMENT ANALYZER

Type your product review and press Enter to analyze.
Type 'quit' or 'exit' to stop.
Type 'bulk' to analyze a CSV file.

Enter your review: does not work all the time

✓ Sentiment: Negative
  Confidence: 53.96%
  [Negative: 54.0% | Positive: 46.0%]

Enter your review: does not work properly

✓ Sentiment: Positive
  Confidence: 83.19%
  [Negative: 16.8% | Positive: 83.2%]

Enter your review: bad

✓ Sentiment: Positive
  Confidence: 82.37%
  [Negative: 17.6% | Positive: 82.4%]

Enter your review: garbage

✓ Sentiment: Positive
  Confidence: 86.76%
  [Negative: 13.2% | Positive: 86.8%]

Enter your review: very bad product

✓ Sentiment: Positive
  Confidence: 84.23%
  [Negative: 15.8% | Positive: 84.2%]

Enter your review: Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as

KeyboardInterrupt: Interrupted by user