# 🎬 Movie-Level Sentiment Pipeline (TextBlob) — Colab Ready

**What this notebook does (one-click "Run all"):**
1. Upload your `IMDB Dataset.csv` file (should contain `review` and `sentiment` columns).
2. Assign **15 real movie titles** randomly to reviews.
3. Clean text and compute **TextBlob** polarity for each review.
4. Aggregate to **movie-level** average sentiment and review counts.
5. Save `movie_level_sentiment.csv`, `top_positive_movies.png`, `top_negative_movies.png`, and `sentiment_summary.txt`.
6. Bundle outputs into `movie_outputs.zip` for download.

> Open this file in **Google Colab**, choose **Runtime → Run all**, and upload your CSV when the upload dialog appears.


In [None]:
# Install required packages (Colab will run this cell)
!pip install -q textblob pandas numpy matplotlib seaborn beautifulsoup4
# Download corpora needed by TextBlob
python -m textblob.download_corpora -q


In [None]:
# Step 1: Upload dataset
from google.colab import files
import pandas as pd
import io

print('Please upload your IMDB CSV file (e.g., IMDB Dataset.csv) when the dialog appears.')
uploaded = files.upload()

# Nothing uploaded (e.g. the dialog was dismissed): stop with a clear message
# instead of an IndexError from indexing an empty key list.
if not uploaded:
    raise ValueError('No file was uploaded. Re-run this cell and select your IMDB CSV file.')

# If several files were selected, only the first one is used.
fname = next(iter(uploaded))
print('✅ Uploaded file:', fname)

# `uploaded` maps each file name to its raw bytes; wrap them so pandas can read them.
df = pd.read_csv(io.BytesIO(uploaded[fname]))
print('\nColumns found in your file:', list(df.columns)[:20])

# The cleaning and aggregation steps read the `review` column, so fail here
# with an actionable message rather than with a KeyError two cells later.
if 'review' not in df.columns:
    raise ValueError(
        "The uploaded file has no 'review' column; found: " + ', '.join(map(str, df.columns))
    )

df.head()


In [None]:
# Step 2: Assign real movie titles (15 famous movies)
import numpy as np
np.random.seed(42)  # fixed seed so the random assignment below is repeatable

movie_list = [
    "Inception", "Joker", "Avatar", "Titanic", "Interstellar", "Avengers: Endgame",
    "Parasite", "La La Land", "Frozen", "Gladiator", "Dune", "Barbie",
    "Oppenheimer", "The Lion King", "The Dark Knight"
]

# If movie_title already exists, keep it. Otherwise each review gets a title drawn
# at random (with replacement) from movie_list, so the titles are placeholder
# group labels and are unrelated to what the review actually discusses.
if 'movie_title' not in df.columns:
    df['movie_title'] = np.random.choice(movie_list, size=len(df))
    print('Assigned', len(movie_list), 'movie titles randomly across the dataset.')
else:
    print('movie_title column found — using existing titles.')

df[['movie_title']].head()


In [None]:
# Step 3: Clean text (remove HTML, non-letters)
import re
from bs4 import BeautifulSoup

def clean_text(text):
    """Return a lower-cased, letters-only version of ``text``.

    The value is converted with ``str()``, parsed with BeautifulSoup's
    ``html.parser`` to drop markup, every character outside ``a-z``/``A-Z`` is
    replaced by a single space, and the result is lower-cased and stripped of
    leading/trailing whitespace. Interior runs of spaces are left as they are.
    """
    without_markup = BeautifulSoup(str(text), "html.parser").get_text()
    letters_only = re.sub('[^a-zA-Z]', ' ', without_markup)
    return letters_only.lower().strip()

# Clean every review and keep the result next to the raw text for comparison.
cleaned_reviews = df['review'].map(clean_text)
df['clean_review'] = cleaned_reviews
df.loc[:, ['review', 'clean_review']].head()


In [None]:
# Step 4: Compute TextBlob polarity
from textblob import TextBlob
import tqdm

def polarity(text):
    """Return the TextBlob sentiment polarity of ``text`` as a float.

    Parameters
    ----------
    text : str
        Review text to score.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity``, or ``0.0`` (neutral) when the
        input is not a string, is blank, or TextBlob fails to score it.
    """
    # Non-string or blank input has nothing to score; answer neutral directly
    # instead of relying on an exception from TextBlob.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    try:
        return float(TextBlob(text).sentiment.polarity)
    except Exception as exc:
        # Keep the best-effort neutral fallback so one bad row cannot abort the
        # whole run, but surface the failure: silently returning 0.0 would let a
        # broken TextBlob setup produce an all-neutral result unnoticed.
        import warnings
        warnings.warn(
            f"TextBlob could not score a review ({exc!r}); using neutral polarity 0.0",
            RuntimeWarning,
        )
        return 0.0

# tqdm.pandas() registers `progress_apply` on pandas objects, which behaves like
# `apply` but reports progress while the reviews are being scored.
from tqdm import tqdm
tqdm.pandas()

review_polarity = df['clean_review'].progress_apply(polarity)
df['tb_polarity'] = review_polarity
df.loc[:, ['movie_title', 'tb_polarity']].head()


In [None]:
# Step 5: Aggregate to movie level
# One pass over the groups yields both the mean polarity and the number of
# non-null reviews per title; reset_index turns the title back into a column.
per_movie = (
    df.groupby('movie_title')
      .agg(avg_sentiment=('tb_polarity', 'mean'), review_count=('review', 'count'))
      .reset_index()
)

# Label each movie from its average polarity: above 0.2 is Positive,
# below -0.2 is Negative, anything else is Neutral.
per_movie['sentiment_label'] = per_movie['avg_sentiment'].map(
    lambda score: 'Positive' if score > 0.2 else ('Negative' if score < -0.2 else 'Neutral')
)

# Rank from most to least positive with a fresh 0..n-1 index.
movie_sentiment = per_movie.sort_values('avg_sentiment', ascending=False, ignore_index=True)
movie_sentiment.head(15)


In [None]:
# Step 6: Save CSV
# index=False leaves the positional row index out of the file, so the CSV
# contains only the movie-level columns.
movie_sentiment.to_csv('movie_level_sentiment.csv', index=False)
print('✅ Saved movie_level_sentiment.csv (one row per movie).')


In [None]:
# Step 7: Generate visualizations
import matplotlib.pyplot as plt
import seaborn as sns


def _save_sentiment_barplot(data, title, filename):
    """Draw a horizontal bar chart of ``avg_sentiment`` per ``movie_title`` and save it.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot; must contain ``avg_sentiment`` and ``movie_title`` columns.
    title : str
        Figure title.
    filename : str
        Path of the image file to write.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='avg_sentiment', y='movie_title', data=data, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Average Sentiment (polarity)')
    ax.set_ylabel('Movie Title')
    fig.tight_layout()
    fig.savefig(filename)
    # Close the figure once it is on disk so it is neither rendered inline nor kept in memory.
    plt.close(fig)


# Take up to 10 movies from each end of the ranking. head(10) is an upper bound:
# with fewer than 10 movies both frames simply contain every movie.
top_positive = movie_sentiment.sort_values('avg_sentiment', ascending=False).head(10)
top_negative = movie_sentiment.sort_values('avg_sentiment', ascending=True).head(10)

_save_sentiment_barplot(
    top_positive,
    'Top 10 Most Positive Movies (TextBlob polarity)',
    'top_positive_movies.png',
)
_save_sentiment_barplot(
    top_negative,
    'Top 10 Most Negative Movies (TextBlob polarity)',
    'top_negative_movies.png',
)

print('✅ Saved top_positive_movies.png and top_negative_movies.png')


In [None]:
# Step 8: Create a summary report (text file)
# First row of each ranking; not used below, kept as notebook-level names for inspection.
top_pos = top_positive.iloc[0]
top_neg = top_negative.iloc[0]


def _format_ranked(ranked, limit=3):
    """Return numbered report lines ("1. ...", "2. ...") for the first ``limit`` rows of ``ranked``.

    ``ranked`` must contain ``movie_title``, ``avg_sentiment`` and ``review_count`` columns.
    """
    # Number rows by position with enumerate(). The frame's index labels cannot be
    # used as ranks: after an ascending sort they count down from the last label,
    # so the negative list would be numbered from the bottom instead of 1, 2, 3.
    return [
        f"{rank}. {row['movie_title']} — avg_sentiment: {row['avg_sentiment']:.4f} "
        f"(reviews: {int(row['review_count'])})\n"
        for rank, (_, row) in enumerate(ranked.head(limit).iterrows(), start=1)
    ]


summary_lines = [
    "Movie Sentiment Summary\n",
    "-----------------------\n",
    f"Total movies analyzed: {len(movie_sentiment)}\n",
    f"Total reviews processed: {len(df)}\n",
    "\nTop 3 Positive Movies:\n",
]
summary_lines.extend(_format_ranked(top_positive))

summary_lines.append('\nTop 3 Negative Movies:\n')
summary_lines.extend(_format_ranked(top_negative))

summary_lines.append('\nOverall sentiment range:\n')
summary_lines.append(f"Max avg_sentiment: {movie_sentiment['avg_sentiment'].max():.4f}\n")
summary_lines.append(f"Min avg_sentiment: {movie_sentiment['avg_sentiment'].min():.4f}\n")

# The report contains a non-ASCII dash, so write it with an explicit encoding
# rather than whatever the platform default happens to be.
with open('sentiment_summary.txt', 'w', encoding='utf-8') as f:
    f.writelines(summary_lines)

print('✅ Saved sentiment_summary.txt')


In [None]:
# Step 9: Zip all outputs for easy download
import shutil  # not used in this cell; retained in case other cells rely on the name
import zipfile
from google.colab import files

files_to_zip = ['movie_level_sentiment.csv','top_positive_movies.png','top_negative_movies.png','sentiment_summary.txt']

# Archive only the four pipeline outputs. Zipping the whole working directory
# would also pull in the uploaded dataset and any other files present, and that
# archive would be overwritten by this one anyway.
with zipfile.ZipFile('movie_outputs.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zf:
    # A dedicated loop name avoids overwriting the notebook-level `fname`
    # that holds the uploaded file's name.
    for output_name in files_to_zip:
        zf.write(output_name)

print('✅ Created movie_outputs.zip containing:', files_to_zip)
files.download('movie_outputs.zip')


## Notes & Next Steps

- If your original dataset already includes a `movie_title` column, the title-assignment cell detects it and keeps the existing titles, so no change is needed. Other columns such as `genre` are not used by this pipeline.
- Feel free to adjust the `movie_list` to any movies you prefer.
- For production-grade sentiment (and multi-language support), consider using transformer models (e.g., `distilbert-base-uncased-finetuned-sst-2-english`) — but these require more compute.

---  
Open the notebook in Colab, choose **Runtime → Run all**, and upload your CSV when prompted. The ZIP will be downloaded automatically when the pipeline finishes.
