# Movie Review Sentiment Analysis and Rating Prediction

In this homework, you will:
1. Load IMDB movie reviews dataset using Hugging Face datasets
2. Perform sentiment analysis
3. Build a ML model to predict movie ratings


In [1]:
# TODO: Install required packages
%pip install pandas numpy scikit-learn transformers torch datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# TODO: Import required libraries
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Add any other libraries you need

## Part 1: Load Dataset

Load the IMDB dataset using Hugging Face datasets library
from datasets import load_dataset


In [4]:
# TODO: Load the IMDB dataset
# Hint: Use load_dataset('imdb')
# Convert to pandas DataFrame for easier manipulation


ds = load_dataset("stanfordnlp/imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
train_df = pd.DataFrame(ds['train'])
test_df = pd.DataFrame(ds['test'])
unsupervised_df = pd.DataFrame(ds['unsupervised'])
print(train_df.head())
print(test_df.head())
print(unsupervised_df.head())
print(ds.column_names)

                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
                                                text  label
0  I love sci-fi and am willing to put up with a ...      0
1  Worth the entertainment value of a rental, esp...      0
2  its a totally average film with a few semi-alr...      0
3  STAR RATING: ***** Saturday Night **** Friday ...      0
4  First off let me say, If you haven't enjoyed a...      0
                                                text  label
0  This is just a precious little diamond. The pl...     -1
1  When I say this is my favourite film of all ti...     -1
2  I saw this movie because I am a huge fan of th...     -1
3  Being that the only foreign films I u

## Part 2: Data Preprocessing

Clean and prepare the text data

In [6]:
# TODO: Create a function to clean text
# 1. Remove HTML tags
# 2. Remove special characters
# 3. Convert to lowercase
# Hint: Use regular expressions

import re


# Function to clean text
def clean_text(text):
    """
    Cleans input text by:
    1. Removing HTML tags
    2. Removing special characters
    3. Converting to lowercase
    """
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text


train_df['cleaned_text'] = train_df['text'].apply(clean_text)
test_df['cleaned_text'] = test_df['text'].apply(clean_text)
unsupervised_df['cleaned_text'] = unsupervised_df['text'].apply(clean_text)

print(train_df[['text', 'cleaned_text']].head())

                                                text  \
0  I rented I AM CURIOUS-YELLOW from my video sto...   
1  "I Am Curious: Yellow" is a risible and preten...   
2  If only to avoid making this type of film in t...   
3  This film was probably inspired by Godard's Ma...   
4  Oh, brother...after hearing about this ridicul...   

                                        cleaned_text  
0  i rented i am curiousyellow from my video stor...  
1  i am curious yellow is a risible and pretentio...  
2  if only to avoid making this type of film in t...  
3  this film was probably inspired by godards mas...  
4  oh brotherafter hearing about this ridiculous ...  


## Part 3: Advanced Sentiment Analysis

Go beyond binary classification - use a pre-trained model to get continuous sentiment scores

In [7]:
# TODO: Implement advanced sentiment analysis
# 1. Load a pre-trained model (hint: try 'distilbert-base-uncased-finetuned-sst-2-english')
# 2. Create a function to get continuous sentiment scores
# 3. Apply it to your cleaned text data
# Note: Original dataset has binary labels, but we want continuous scores!



from transformers import pipeline

# Step 1: Load the pre-trained sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Step 2: Create a function to get continuous sentiment scores
def get_sentiment_score(text):
    """
    Predict the sentiment score of the input text.
    Returns a continuous score where:
    - Positive sentiment is the probability of being positive (between 0 and 1)
    """
    try:
        result = sentiment_analyzer(text)
        print(result)
        # Extract the score for positive sentiment
        score = result[0]['score'] if result[0]['label'] == 'POSITIVE' else 1 - result[0]['score']
        return score
    except Exception as e:
        print(f"Error processing text: {text}, Error: {e}")
        return None


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [8]:

# Step 3: Apply it to the cleaned text data
train_df['sentiment_score'] = train_df['cleaned_text'].apply(get_sentiment_score)
test_df['sentiment_score'] = test_df['cleaned_text'].apply(get_sentiment_score)
unsupervised_df['sentiment_score'] = unsupervised_df['cleaned_text'].apply(get_sentiment_score)

# Display the results
#print(train_df[['cleaned_text', 'sentiment_score']].head())
#print(test_df[['cleaned_text', 'sentiment_score']].head())
#print(unsupervised_df[['cleaned_text', 'sentiment_score']].head()


[{'label': 'NEGATIVE', 'score': 0.5946515798568726}]
[{'label': 'NEGATIVE', 'score': 0.9990304708480835}]
[{'label': 'NEGATIVE', 'score': 0.9992595314979553}]
[{'label': 'NEGATIVE', 'score': 0.9588378667831421}]
[{'label': 'NEGATIVE', 'score': 0.9992403984069824}]
[{'label': 'NEGATIVE', 'score': 0.9982150793075562}]
[{'label': 'NEGATIVE', 'score': 0.9995020627975464}]


Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[{'label': 'POSITIVE', 'score': 0.998703122138977}]
[{'label': 'POSITIVE', 'score': 0.9996254444122314}]
[{'label': 'POSITIVE', 'score': 0.9975330829620361}]
[{'label': 'POSITIVE', 'score': 0.9995325803756714}]
[{'label': 'POSITIVE', 'score': 0.977408766746521}]
[{'label': 'POSITIVE', 'score': 0.8552988171577454}]
[{'label': 'POSITIVE', 'score': 0.9754020571708679}]
[{'label': 'POSITIVE', 'score': 0.9904299378395081}]
[{'label': 'POSITIVE', 'score': 0.9968764781951904}]
[{'label': 'POSITIVE', 'score': 0.979852020740509}]
[{'label': 'NEGATIVE', 'score': 0.8988737463951111}]
[{'label': 'POSITIVE', 'score': 0.529977560043335}]
[{'label': 'POSITIVE', 'score': 0.9523510336875916}]
[{'label': 'POSITIVE', 'score': 0.7310872077941895}]
[{'label': 'POSITIVE', 'score': 0.9751507043838501}]
[{'label': 'NEGATIVE', 'score': 0.5131222009658813}]
[{'label': 'POSITIVE', 'score': 0.96720951795578}]
[{'label': 'POSITIVE', 'score': 0.999840

## Part 4: Feature Engineering

Create rich features for your model

In [None]:
# TODO: Create features
# 1. Use your continuous sentiment scores
# 2. Calculate text statistics:
#    - Length
#    - Word count
#    - Average word length
#    - Sentence count
# 3. Any other features you think might help!

In [None]:
import numpy as np

# Step 1: Calculate Text Statistics
def calculate_text_features(df, text_column):
    """
    Calculate additional text-based features for the given DataFrame.
    Features:
    - Length of the text (number of characters)
    - Word count (number of words)
    - Average word length
    - Sentence count
    """
    # Length of the text
    df['text_length'] = df[text_column].apply(len)

    # Word count
    df['word_count'] = df[text_column].apply(lambda x: len(x.split()))

    # Average word length
    df['avg_word_length'] = df['text_length'] / df['word_count']

    # Sentence count (split by '.','!','?')
    df['sentence_count'] = df[text_column].apply(lambda x: len([s for s in x.split('.') if s.strip()]) +
                                                 len([s for s in x.split('!') if s.strip()]) +
                                                 len([s for s in x.split('?') if s.strip()]) - 2)

    return df

# Step 2: Add Sentiment Scores and Apply Feature Calculation
def add_features_to_dataframes(train_df, test_df, unsupervised_df, text_column='cleaned_text'):
    """
    Add features (sentiment scores and text statistics) to the DataFrames.
    """
    # Adding text statistics
    train_df = calculate_text_features(train_df, text_column)
    test_df = calculate_text_features(test_df, text_column)
    unsupervised_df = calculate_text_features(unsupervised_df, text_column)

    # Return updated DataFrames
    return train_df, test_df, unsupervised_df

# Apply to all datasets
train_df, test_df, unsupervised_df = add_features_to_dataframes(train_df, test_df, unsupervised_df)

# Display the updated DataFrame
print(train_df.head())
print( test_df.head())
print( unsupervised_df.head())



KeyError: 'sentiment_score'

## Part 5: Multi-Class Rating Prediction

Instead of binary classification, predict a 5-star rating!

In [25]:
# TODO: Create target variable
# Convert binary labels to 5-star ratings using your features
# Hint: Use sentiment scores and other features to estimate star rating


def map_to_star_rating(sentiment_score):
    """
    Convert sentiment scores to a 5-star rating system:
    - 0.0 to 0.2 -> 1 star
    - 0.2 to 0.4 -> 2 stars
    - 0.4 to 0.6 -> 3 stars
    - 0.6 to 0.8 -> 4 stars
    - 0.8 to 1.0 -> 5 stars
    """
    if sentiment_score <= 0.2:
        return 1
    elif sentiment_score <= 0.4:
        return 2
    elif sentiment_score <= 0.6:
        return 3
    elif sentiment_score <= 0.8:
        return 4
    else:
        return 5

# Apply the function to your DataFrames
train_df['star_rating'] = train_df['sentiment_score'].apply(map_to_star_rating)
test_df['star_rating'] = test_df['sentiment_score'].apply(map_to_star_rating)
unsupervised_df['star_rating'] = unsupervised_df['sentiment_score'].apply(map_to_star_rating)

# Display the resulting DataFrame with the new target variable
print(train_df[['cleaned_text', 'sentiment_score', 'star_rating']].head())


                                        cleaned_text  sentence_count  \
0  i rented i am curiousyellow from my video stor...               1   
1  i am curious yellow is a risible and pretentio...               1   
2  if only to avoid making this type of film in t...               1   
3  this film was probably inspired by godards mas...               1   
4  oh brotherafter hearing about this ridiculous ...               1   

   star_rating  
0            5  
1            5  
2            5  
3            5  
4            5  


In [29]:
print(train_df.columns)


Index(['text', 'label', 'cleaned_text', 'text_length', 'word_count',
       'avg_word_length', 'sentence_count', 'star_rating'],
      dtype='object')


In [None]:
# TODO: Build and train your model
# 1. Split data into train and test sets
# 2. Choose a model suitable for multi-class classification
# 3. Train the model
# 4. Make predictions
# 5. Evaluate performance


In [28]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# Step 1: Split data into train and test sets
# Assuming 'star_rating' is the target column
X = train_df[['sentiment_score', 'text_length', 'word_count', 'avg_word_length', 'sentence_count']].values
y = train_df['star_rating'].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Choose a model suitable for multi-class classification
# Using Random Forest as an example
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Step 3: Train the model
rf_model.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = rf_model.predict(X_test)

# Step 5: Evaluate performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))


KeyError: "['sentiment_score'] not in index"

## Part 6: Analysis

Analyze your results and suggest improvements

In [None]:
# TODO: Create visualizations and analyze:
# 1. Confusion matrix for multi-class predictions
# 2. Feature importance
# 3. Error analysis
# 4. Suggest improvements

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Visualize confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Extract feature importances
importances = rf_model.feature_importances_
features = ['sentiment_score', 'text_length', 'word_count', 'avg_word_length', 'sentence_count']

# Visualize feature importance
plt.figure(figsize=(8, 6))
sns.barplot(x=importances, y=features, palette="viridis")
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Feature Importance')
plt.show()

In [None]:
# Create a DataFrame for errors
error_analysis_df = X_test.copy()
error_analysis_df['true_labels'] = y_test
error_analysis_df['predicted_labels'] = y_pred

# Filter misclassified examples
misclassified = error_analysis_df[error_analysis_df['true_labels'] != error_analysis_df['predicted_labels']]

# Show some misclassified examples
print("Misclassified Examples:")
print(misclassified.head(10))

# Analyze confusion trends
confusion_df = pd.DataFrame(cm, index=np.unique(y_test), columns=np.unique(y_test))
print("Confusion Trends:")
print(confusion_df)


In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)