In [1]:
# Collaborative Filtering- makes recommendations based on the idea that users who agreed in the past will agree in the future.

# It can be user-based (finding similar users) or item-based (finding similar items). This notebook uses item-based collaborative filtering to find movies similar to a target movie based on user ratings.

#The goal of this notebook is to create a collaborative filtering-based recommender system using movie rating data. 
import pandas
user_item_details = pandas.read_csv("https://modcom.co.ke/datasets/file.csv")
user_item_details.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949


In [2]:
movie_details = pandas.read_csv("https://modcom.co.ke/datasets/Movie_Id_Titles.csv")
movie_details.head(2)

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)


In [3]:
# merge the two datasets
data = pandas.merge(user_item_details, movie_details, on = 'item_id')
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,0,172,5,881250949,"Empire Strikes Back, The (1980)"
2,0,133,1,881250949,Gone with the Wind (1939)
3,196,242,3,881250949,Kolya (1996)
4,186,302,3,891717742,L.A. Confidential (1997)


In [4]:
# Find average rating per movie.Displays the top movies with the highest average ratings.
# Calculate the average rating for each movie
#This step is crucial for identifying top-rated movies and understanding user preferences.
data.groupby('title')['rating'].mean().sort_values(ascending=False).head(5)

title
They Made Me a Criminal (1939)                5.0
Marlene Dietrich: Shadow and Light (1996)     5.0
Saint of Fort Washington, The (1993)          5.0
Someone Else's America (1995)                 5.0
Star Kid (1997)                               5.0
Name: rating, dtype: float64

In [5]:
# How many times was it rated
# Find average rating per movies
data.groupby('title')['rating'].count().sort_values(ascending=False).tail(5)

title
Great Day in Harlem, A (1994)         1
Other Voices, Other Rooms (1997)      1
Good Morning (1971)                   1
Girls Town (1996)                     1
Á köldum klaka (Cold Fever) (1994)    1
Name: rating, dtype: int64

In [6]:
#We want to show the  number of ratings for each movie.We show the first 5
#The goal of the code is to create a DataFrame that shows the average rating for each movie along with the number of ratings each movie has received. 
#We are creating a new table that consists of title, average rating and the number of ratings(how many times was it rated).  
mean_ratings = pandas.DataFrame(data.groupby('title')['rating'].mean())
mean_ratings['num_of_ratings'] = pandas.DataFrame(data.groupby('title')['rating'].count())
mean_ratings.head(5)


Unnamed: 0_level_0,rating,num_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41


In [7]:
# Find which movie did each user rate, and what was the rating  -PIVOT TABLE
#A pivot table is a powerful data summarization tool that is used in data processing to organize, group, and aggregate data.
pivot   = data.pivot_table(index = 'user_id', columns = 'title', values = 'rating')
pivot.head(5)

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Lets correlate movies using the pivot
#Finding movie correlations in the context of a recommender system helps us identify how similar the rating patterns of different movies are. 
selected_movie = pivot['Godfather, The (1972)']

# find similar movies to this one based ratings

similar = pivot.corrwith(selected_movie)
similar_df  = pandas.DataFrame(similar, columns = ['Correlations'])
similar_df.sort_values('Correlations', ascending = False).head(50)
#A correlation value of 1 means there is a perfect positive correlation between the ratings of two movies.
#A correlation value of 0 means there is no correlation between the ratings of two movies. The ratings of one movie do not predict the ratings of the other movie. This suggests no particular relationship in terms of user preferences.
#A correlation value of -1 means there is a perfect negative correlation between the ratings of two movies. When users rate one movie highly, they rate the other movie poorly, and vice versa.


  c /= stddev[:, None]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[None, :]


Unnamed: 0_level_0,Correlations
title,Unnamed: 1_level_1
Dark City (1998),1.0
8 Seconds (1994),1.0
Talking About Sex (1994),1.0
"Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)",1.0
"Bye Bye, Love (1995)",1.0
"Locusts, The (1997)",1.0
Kicked in the Head (1997),1.0
"Outlaw, The (1943)",1.0
"Beans of Egypt, Maine, The (1994)",1.0
"Last Time I Saw Paris, The (1954)",1.0


In [9]:
# Show by movies that only had over 200 ratings
#Adding another column
similar_df = similar_df.join(mean_ratings['num_of_ratings'])
similar_df.head(5)

Unnamed: 0_level_0,Correlations,num_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),0.612372,9
1-900 (1994),-0.471405,5
101 Dalmatians (1996),0.084183,109
12 Angry Men (1957),0.034258,125
187 (1997),0.467335,41


In [10]:
# Take only the one with over 200 ratings
similar_df[similar_df['num_of_ratings'] > 200].sort_values('Correlations', ascending = False).head(20)

Unnamed: 0_level_0,Correlations,num_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",1.0,413
"Godfather: Part II, The (1974)",0.683862,209
GoodFellas (1990),0.421477,226
"People vs. Larry Flynt, The (1996)",0.393439,215
Apocalypse Now (1979),0.374378,221
Dead Man Walking (1995),0.360525,299
Psycho (1960),0.336903,239
Field of Dreams (1989),0.309903,212
2001: A Space Odyssey (1968),0.305717,259
Amadeus (1984),0.300868,276
