Movie recommendation system

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [None]:
#User ratings dataframe
ratings=pd.read_csv("./ml-100k/u.data",sep="\t",names=["user_id","movie_id","rating","timestamp"])
ratings.head()

In [None]:
#Movies dataframe
columns=["movie_id","title","release_date","video_release_date","imdb_url","unknown",
         "Action","Adventure","Animation","Children\'s","Comedy","Crime","Documentary",
         "Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi",
         "Thriller","War","Western"]
movies=pd.read_csv("./ml-100k/u.item",sep="|",names=columns,encoding='latin1')
print(movies.head())

In [None]:
#Users dataframe
users=pd.read_csv("./ml-100k/u.user",sep="|",names=["user_id","age","gender","occupation","zip_code"])
print(users.head())

In [None]:
print(ratings.info())

In [None]:
print(movies.info())

In [None]:
print(users.info())

In [None]:
ratings.isnull().sum()

In [None]:
users.isnull().sum()

In [None]:
movies.isnull().sum()

In [None]:
print(movies.loc[movies["release_date"].isnull()])

In [None]:
print(movies[movies["title"]=="unknown"])
movies=movies[movies["title"]!="unknown"]
print("Removed movie with unknown title, genre and release date")

In [None]:
movies.drop(columns=["video_release_date"], inplace=True)   #remove video release date as all are null

In [None]:
print(movies.isnull().sum())

In [None]:
movies["imdb_url"]=movies["imdb_url"].fillna(value="none")   #Fill the missing url with a value none
movies.isnull().sum()

In [None]:
movies.info()

In [None]:
movies["release_date"]=pd.to_datetime(movies["release_date"])
print(movies["release_date"].head())

In [None]:
movies.info()

In [None]:
ratings[~ratings["movie_id"].isin(movies["movie_id"])]  #ratings for the movie that was removed

In [None]:
ratings=ratings[ratings["movie_id"].isin(movies["movie_id"])]
ratings.loc[ratings["movie_id"]==267]   #Empty row means successful deletion of ratings for the deleted movie

In [None]:
ratings.drop('timestamp',axis=1,inplace=True)
ratings.head()

Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

In [None]:
# Ratings distribution
ratings['rating'].value_counts().sort_index().plot(kind='bar')
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()


In [None]:
#Ratings per user
ratings_per_user = ratings.groupby('user_id').size().sort_index()
ratings_per_user.plot(kind='hist', bins=50)
plt.title('Ratings per User')
plt.xlabel('Number of Ratings')
plt.show()

In [None]:
# Ratings per movie
ratings_per_movie = ratings.groupby('movie_id').size()
ratings_per_movie.plot(kind='hist', bins=50)
plt.title('Ratings per Movie')
plt.xlabel('Number of Ratings')
plt.show()

In [None]:
# Basic statistics of ratings dataframe
print(ratings.isnull().sum())
print("*"*50)
print(ratings.info())
print("*"*50)
print(ratings.describe())

In [None]:
print("Exploring the movies dataframe")
genre_columns=movies.columns[4:]
print("Genre columns => ",genre_columns)

In [None]:
# Sum of movies in each genre (A movie can belong to several genre)
genre_counts = movies[genre_columns].sum().sort_values().plot(kind="bar")

In [None]:
# Load the ratings file
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

ratings.head()

In [None]:
ratings_per_movie = ratings.groupby('movie_id').size()

plt.figure(figsize=(10,6))
ratings_per_movie.plot(kind='hist', bins=50, color='skyblue', edgecolor='black')
plt.title('Ratings per Movie')
plt.xlabel('Number of Ratings')
plt.ylabel('Number of Movies')
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None, usecols=[0,1], names=['movie_id', 'title'])

# Join with ratings count
ratings_count = ratings.groupby('movie_id').size().reset_index(name='count')
top_movies = ratings_count.sort_values(by='count', ascending=False).head(10)

# Merge to get titles
top_movies = pd.merge(top_movies, movies, on='movie_id')
print(top_movies[['title', 'count']])


In [None]:
# Load ratings data
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [None]:
# Load movie data (only movie_id and title)
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, 1],
    names=['movie_id', 'title']
)

In [None]:
# Merge ratings with movie titles
merged_df = pd.merge(ratings, movies, on='movie_id')

# Show first few rows
merged_df.head()

In [None]:
top_rated = merged_df['title'].value_counts().head(10)
print(top_rated)

In [None]:
top_10 = merged_df['title'].value_counts().head(10)

In [None]:
plt.figure(figsize=(10, 6))
top_10.plot(kind='barh', color='skyblue', edgecolor='black')
plt.xlabel('Number of Ratings')
plt.title('Top 10 Most Rated Movies')
plt.gca().invert_yaxis()  # Most rated at top
plt.tight_layout()
plt.show()

In [None]:
#Feature Engineering – Ratings per Month

# 1. Load ratings data
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp']
)

# 2. Convert timestamp to datetime
ratings['datetime'] = pd.to_datetime(ratings['timestamp'], unit='s')

# 3. Extract year-month (e.g., '2020-05')
ratings['year_month'] = ratings['datetime'].dt.to_period('M')

# 4. Count ratings per movie each month
monthly_counts = ratings.groupby(['movie_id', 'year_month']).size().reset_index(name='ratings_count')

# 5. Pivot to wide format (each movie, month as column)
monthly_pivot = monthly_counts.pivot(
    index='movie_id',
    columns='year_month',
    values='ratings_count'
).fillna(0)

# 6. (Optional) Add a feature: total ratings across all months
monthly_pivot['total_ratings'] = monthly_pivot.sum(axis=1)

# 7. Merge this back with movie metadata or your main DataFrame
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0,1],
    names=['movie_id', 'title']
)

feat = pd.merge(monthly_pivot.reset_index(), movies, on='movie_id')
print(feat.head())


In [None]:
#Line Plot of Movie Ratings Over Time
# (Assuming 'monthly_counts' DataFrame from earlier)
# Example: plot ratings count over time for a single movie_id
movie_id = 50  # change as needed
ts = monthly_counts[monthly_counts.movie_id == movie_id]
ts.set_index('year_month', inplace=True)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ts.index.to_timestamp(), ts['ratings_count'], marker='o')

ax.set_title(f'Monthly Ratings Count for Movie {movie_id}')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Ratings')

# Format x-axis ticks every 3 months
locator = mdates.MonthLocator(interval=3)
formatter = mdates.DateFormatter('%Y-%m')
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
fig.autofmt_xdate()

ax.grid(True)
plt.show()


In [None]:
#Heatmap of Ratings by Movie and Time
# Pivot into matrix: movies x months
pivot = monthly_counts.pivot(
    index='movie_id',
    columns='year_month',
    values='ratings_count'
).fillna(0)

# (Optionally select top-10 most active movies for clarity)
top_movies = pivot.sum(axis=1).sort_values(ascending=False).head(10).index
pivot_top = pivot.loc[top_movies]

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_top, cmap='YlGnBu', cbar_kws={'label': 'Ratings Count'})
plt.title('Ratings Count per Month (Top 10 Movies)')
plt.xlabel('Year-Month')
plt.ylabel('Movie ID')
plt.show()


In [None]:
#Detecting Abnormal Movie Rating Patterns Using Isolation Forest.
from sklearn.ensemble import IsolationForest

# Build a feature df
features = merged_df.groupby('movie_id').agg({
    'rating': ['count', 'mean']
}).reset_index()
features.columns = ['movie_id', 'rating_count', 'avg_rating']

model = IsolationForest(contamination=0.02, random_state=42)
model.fit(features[['rating_count', 'avg_rating']])

features['anomaly'] = model.predict(features[['rating_count', 'avg_rating']])
outliers_iso = features[features['anomaly'] == -1]
print("IsolationForest outliers:", outliers_iso)

# Plot outliers
plt.figure(figsize=(8,6))
plt.scatter(features['rating_count'], features['avg_rating'], c=(features['anomaly']==-1), cmap='coolwarm', alpha=0.7)
plt.xlabel('Rating Count')
plt.ylabel('Average Rating')
plt.title('Outliers via Isolation Forest')
plt.show()


In [None]:
### Simple Item-Based Collaborative Filtering (Slope One)
from collections import defaultdict

# Load ratings
df = pd.read_csv('ml-100k/u.data', sep='\t',
                 names=['user', 'item', 'rating', 'ts'])

# Build user→item ratings dict
user_ratings = defaultdict(dict)
for row in df.itertuples():
    user_ratings[row.user][row.item] = row.rating

# Compute deviations and counts
dev, count = defaultdict(lambda: defaultdict(int)), defaultdict(lambda: defaultdict(int))
for u, items in user_ratings.items():
    for i, ri in items.items():
        for j, rj in items.items():
            if i != j:
                dev[i][j] += ri - rj
                count[i][j] += 1

# Average deviations
for i in dev:
    for j in dev[i]:
        dev[i][j] /= count[i][j]

# Predict function
def slope_one_predict(user, item):
    num, den = 0.0, 0
    for j, ruj in user_ratings[user].items():
        if item in dev[j]:
            num += (dev[j][item] + ruj) * count[j][item]
            den += count[j][item]
    return num / den if den else None

# Example: Predict ratings for a sample user
test_user = df.user.unique()[0]
preds = {item: slope_one_predict(test_user, item)
         for item in df.item.unique()
         if item not in user_ratings[test_user]}
top_preds = sorted(preds.items(), key=lambda x: x[1] or 0, reverse=True)[:10]

print("Top 10 recommendations (item: predicted rating):", top_preds)



In [66]:
### Matrix Factorization with Truncated SVD (NumPy & Scikit‑learn)
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load data
df = pd.read_csv('ml-100k/u.data', sep='\t',
                 names=['user', 'item', 'rating', 'ts'])

# Split train/test
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Pivot to matrix
train_mat = train.pivot(index='user', columns='item', values='rating').fillna(0)
test_mat = test.pivot(index='user', columns='item', values='rating').fillna(0)
train_mat.head()


item,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1673,1676,1678,1679,1680,1681
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.0,4.0,0.0,3.0,0.0,4.0,0.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# Factorize with truncated SVD
U, Sigma, VT = randomized_svd(train_mat.values, n_components=20, random_state=42)
Sigma_mat = np.diag(Sigma)

# Reconstruct train prediction matrix
pred_train = np.dot(np.dot(U, Sigma_mat), VT)
train_mat.head()
test_mat.head()
train_mat.shape
test_mat.shape

(940, 1411)

In [None]:
# Perform truncated SVD to reconstruct user–item ratings matrix (low-rank approximation)

# from sklearn.decomposition import TruncatedSVD
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# Load data
df = pd.read_csv('ml-100k/u.data', sep='\t',
                 names=['user', 'item', 'rating', 'ts'])

# Split
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Collect all users/items
users = sorted(set(df['user']))
items = sorted(set(df['item']))

# Pivot training matrix
train_mat = train.pivot_table(index='user', columns='item',
                              values='rating', fill_value=0).reindex(
    index=users, columns=items, fill_value=0
)

# Apply TruncatedSVD
tsvd = TruncatedSVD(n_components=20, random_state=42)
train_reduced = tsvd.fit_transform(train_mat)

# Reconstruct full matrix
pred_matrix = np.dot(train_reduced, tsvd.components_)

# Wrap into DataFrame
pred_df = pd.DataFrame(pred_matrix, index=users, columns=items)

# Evaluate on test
preds, truths = [], []
for u, i, true_val in test[['user', 'item', 'rating']].itertuples(index=False):
    preds.append(pred_df.at[u, i])
    truths.append(true_val)

rmse = np.sqrt(mean_squared_error(truths, preds))
mae = mean_absolute_error(truths, preds)
print(f"SVD RMSE = {rmse:.3f}, MAE = {mae:.3f}")
