Author: Thupakula.Subhash
Date: Novembwr 2025

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import numpy as np
import pandas as pd

In [19]:
netflix_dataset = pd.read_csv('/content/drive/MyDrive/combined_data_1.txt.zip',header=None,names=['Cust_id','Ratings'],usecols=[0,1])

In [20]:
netflix_dataset

Unnamed: 0,Cust_id,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
...,...,...
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0


In [21]:
# Get a summary of the dataset's characteristics
movie_count = netflix_dataset['Ratings'].isnull().sum()
print(f"Number of movies in the dataset: {movie_count}")

customer_count = netflix_dataset['Cust_id'].nunique() - movie_count
print(f"Number of unique customers: {customer_count}")

rating_count = netflix_dataset['Cust_id'].count() - movie_count
print(f"Total number of ratings: {rating_count}")

Number of movies in the dataset: 4499
Number of unique customers: 470758
Total number of ratings: 24053764


In [22]:
movie_id = None
movie = []

for customer in netflix_dataset['Cust_id']:
  if ":" in customer:
    movie_id = int(customer.replace(":",""))
  movie.append(movie_id)

In [23]:
netflix_dataset['movie_id'] = movie

In [24]:
netflix_dataset

Unnamed: 0,Cust_id,Ratings,movie_id
0,1:,,1
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [25]:
netflix_dataset.dropna(inplace = True)

In [26]:
netflix_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24053764 entries, 1 to 24058262
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   Cust_id   object 
 1   Ratings   float64
 2   movie_id  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 734.1+ MB


In [27]:
netflix_dataset['Cust_id'] = netflix_dataset['Cust_id'].astype(int)

In [28]:
netflix_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24053764 entries, 1 to 24058262
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   Cust_id   int64  
 1   Ratings   float64
 2   movie_id  int64  
dtypes: float64(1), int64(2)
memory usage: 734.1 MB


In [29]:
#Based on movie review count
movie_review_count = netflix_dataset['movie_id'].value_counts()

In [30]:
movie_review_count

Unnamed: 0_level_0,count
movie_id,Unnamed: 1_level_1
1905,193941
2152,162597
3860,160454
4432,156183
571,154832
...,...
4294,44
915,43
3656,42
4338,39


In [31]:
bench_mark = round(movie_review_count.quantile(0.6))
print(f"Benchmark for the movie review counts : {bench_mark}")

Benchmark for the movie review counts : 908


In [32]:
#grouping the movie ids which has reviews less than benchmark
drop_movie_index = movie_review_count[movie_review_count<bench_mark].index

In [33]:
drop_movie_index

Index([1598, 1733, 1647, 4099, 1616, 1446,  263, 4259,  160, 1988,
       ...
       1858, 4035, 3693, 2805,  820, 4294,  915, 3656, 4338, 4362],
      dtype='int64', name='movie_id', length=2699)

In [34]:
#Based on customer review count
cust_review_count = netflix_dataset['Cust_id'].value_counts()

In [35]:
cust_review_count

Unnamed: 0_level_0,count
Cust_id,Unnamed: 1_level_1
305344,4467
387418,4422
2439493,4195
1664010,4019
2118461,3769
...,...
1300341,1
2550360,1
11848,1
930788,1


In [36]:
bench_mark_cust = round(cust_review_count.quantile(0.6))

In [37]:
#grouping the customer ids who has reviewed less than benchmark
drop_cust_index = cust_review_count[cust_review_count<bench_mark_cust].index
drop_cust_index

Index([2194851,  600295, 1739398, 1157368,  532108, 2157249,  256134,  640441,
       1272324, 1346990,
       ...
       1969065,  899932,  611596, 2147176,  811650, 1300341, 2550360,   11848,
        930788,  594210],
      dtype='int64', name='Cust_id', length=282042)

In [38]:
#Removing movie ids which has less reviews than benchmark from the dataset
netflix_dataset = netflix_dataset[~netflix_dataset['movie_id'].isin(drop_movie_index)]

In [39]:
#Removing customer ids who has reviewed less than benchmark from the dataset
netflix_dataset = netflix_dataset[~netflix_dataset['Cust_id'].isin(drop_cust_index)]

In [40]:
#final dataset for model building
netflix_dataset

Unnamed: 0,Cust_id,Ratings,movie_id
696,712664,5.0,3
697,1331154,4.0,3
698,2632461,3.0,3
699,44937,5.0,3
700,656399,4.0,3
...,...,...,...
24056842,1055714,5.0,4496
24056843,2643029,4.0,4496
24056844,267802,4.0,4496
24056845,1559566,3.0,4496


In [41]:
# The surprise library requires NumPy < 2.0. We will install the correct version.
# You must restart the runtime after this step for it to take effect.
!pip install "numpy<2"



In [42]:
!pip install scikit-surprise



In [43]:

# After restarting the runtime, we can import the necessary modules.
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

In [44]:
# Prepare the data for the Surprise library
reader = Reader()
data = Dataset.load_from_df(netflix_dataset[['movie_id','Cust_id','Ratings']][:100000],reader)

In [45]:
# Use the SVD algorithm
model = SVD()

In [46]:
# Perform 3-fold cross-validation to evaluate the model
cross_validate(model, data, measures=['RMSE'], cv=3)

{'test_rmse': array([1.01800537, 1.01693713, 1.02199666]),
 'fit_time': (1.7773478031158447, 1.6623337268829346, 4.009521484375),
 'test_time': (0.3449525833129883, 0.271167516708374, 0.7005815505981445)}

In [47]:
# First, load the movie titles data to map IDs to names.
movie_title = pd.read_csv('/content/drive/MyDrive/movie_titles.csv',encoding = 'ISO-8859-1',header=None,names=['Movie_id','Year','Name'],usecols=[0,1,2])

In [48]:
# Select an example user and find all movies they have rated.
user_rating = netflix_dataset[netflix_dataset['Cust_id']==1331154]

In [49]:
user_rating

Unnamed: 0,Cust_id,Ratings,movie_id
697,1331154,4.0,3
5178,1331154,4.0,8
31460,1331154,3.0,18
92840,1331154,4.0,30
224761,1331154,3.0,44
...,...,...,...
23439584,1331154,4.0,4389
23546489,1331154,2.0,4402
23649431,1331154,4.0,4432
23844441,1331154,3.0,4472


In [50]:
# Create a DataFrame of movies that this user has NOT rated.
user_1331154 = movie_title.copy()
user_1331154 = user_1331154[~user_1331154['Movie_id'].isin(drop_movie_index)]

In [51]:
# Predict the rating for each unrated movie
est = []
for x in user_1331154['Movie_id']:
  temp = model.predict(1331154,x).est
  est.append(temp)

In [52]:
user_1331154['Estimated'] = est

In [53]:
user_1331154

Unnamed: 0,Movie_id,Year,Name,Estimated
2,3,1997.0,Character,3.584172
4,5,2004.0,The Rise and Fall of ECW,3.584172
5,6,1997.0,Sick,3.584172
7,8,2004.0,What the #$*! Do We Know!?,3.584172
15,16,1996.0,Screamers,3.584172
...,...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.584172
17766,17767,2004.0,Fidel Castro: American Experience,3.584172
17767,17768,2000.0,Epoch,3.584172
17768,17769,2003.0,The Company,3.584172


In [54]:
user_1331154 = user_1331154.sort_values('Estimated',ascending=False)

In [55]:
user_1331154.head()

Unnamed: 0,Movie_id,Year,Name,Estimated
8814,8815,1992.0,Chaplin: The Movie,3.858519
13510,13511,1993.0,Much Ado About Nothing,3.818987
13874,13875,1982.0,Gilbert and Sullivan: The Mikado,3.797536
11042,11043,1970.0,Mary Tyler Moore: Season 1,3.785441
3997,3998,1988.0,Joseph Campbell and The Power of Myth,3.769712


In [56]:
from surprise import dump

# Save the trained model to a file
dump.dump('recommendation_model.pkl', algo=model)