In [3]:
# Collaborative Filtering- makes recommendations based on the idea that users who agreed in the past will agree in the future.

# It can be user-based (finding similar users) or item-based (finding similar items). This notebook uses item-based collaborative filtering to find movies similar to a target movie based on user ratings.

#The goal of this notebook is to create a collaborative filtering-based recommender system using movie rating data. 
import pandas
user=pandas.read_csv('users.csv')
user


Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742
...,...,...,...,...
99998,880,476,3,880175444
99999,716,204,5,879795543
100000,276,1090,1,874795795
100001,13,225,2,882399156


In [4]:
user.isnull().sum()

user_id      0
item_id      0
rating       0
timestamp    0
dtype: int64

In [5]:
movies=pandas.read_csv('Movie_Id_Titles.csv')
movies

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [6]:
movies.isnull().sum()

item_id    0
title      0
dtype: int64

In [7]:
#Merge the two dataframe/tables
data=pandas.merge(user,movies, on='item_id')
data

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,0,172,5,881250949,"Empire Strikes Back, The (1980)"
2,0,133,1,881250949,Gone with the Wind (1939)
3,196,242,3,881250949,Kolya (1996)
4,186,302,3,891717742,L.A. Confidential (1997)
...,...,...,...,...,...
99998,880,476,3,880175444,"First Wives Club, The (1996)"
99999,716,204,5,879795543,Back to the Future (1985)
100000,276,1090,1,874795795,Sliver (1993)
100001,13,225,2,882399156,101 Dalmatians (1996)


In [18]:
# Group by movie title and count the number of ratings for each movie
num_of_ratings = data.groupby('title')['rating'].count()
# Display the number of ratings for each movie
print(num_of_ratings.head())


title
'Til There Was You (1997)      9
1-900 (1994)                   5
101 Dalmatians (1996)        109
12 Angry Men (1957)          125
187 (1997)                    41
Name: rating, dtype: int64


In [9]:
# Find average rating per movie.Displays the top movies with the highest average ratings.
# Calculate the average rating for each movie
#This step is crucial for identifying top-rated movies and understanding user preferences.
data.groupby('title')['rating'].mean().sort_values(ascending=False).head(20)
#top 10

title
They Made Me a Criminal (1939)                            5.000000
Marlene Dietrich: Shadow and Light (1996)                 5.000000
Saint of Fort Washington, The (1993)                      5.000000
Someone Else's America (1995)                             5.000000
Star Kid (1997)                                           5.000000
Great Day in Harlem, A (1994)                             5.000000
Aiqing wansui (1994)                                      5.000000
Santa with Muscles (1996)                                 5.000000
Prefontaine (1997)                                        5.000000
Entertaining Angels: The Dorothy Day Story (1996)         5.000000
Pather Panchali (1955)                                    4.625000
Some Mother's Son (1996)                                  4.500000
Maya Lin: A Strong Clear Vision (1994)                    4.500000
Anna (1996)                                               4.500000
Everest (1998)                                          

In [10]:
data.groupby('title')['rating'].mean().sort_values(ascending=False).tail(20)
#bottom 10

title
Bird of Prey (1996)                                   1.0
Office Killer (1997)                                  1.0
Lashou shentan (1992)                                 1.0
August (1996)                                         1.0
Venice/Venice (1992)                                  1.0
Death in the Garden (Mort en ce jardin, La) (1956)    1.0
Careful (1992)                                        1.0
Tigrero: A Film That Was Never Made (1994)            1.0
Butterfly Kiss (1995)                                 1.0
Low Life, The (1994)                                  1.0
To Cross the Rubicon (1991)                           1.0
Modern Affair, A (1995)                               1.0
Boys in Venice (1996)                                 1.0
Hedd Wyn (1992)                                       1.0
Wend Kuuni (God's Gift) (1982)                        1.0
Eye of Vichy, The (Oeil de Vichy, L') (1993)          1.0
King of New York (1990)                               1.0
Touki Bo

In [11]:
#Show the number of ratings per movie
#The goal of the code is to create a DataFrame that shows the average rating for each movie along with the number of ratings each movie has received.
#We are creating a new table that consists of title, average rationg and the number of ratings(how many times was it rated) 
no_of_rating=pandas.DataFrame(data.groupby('title')['rating'].mean())
no_of_rating['NumberOfRatings']=pandas.DataFrame(data.groupby('title')['rating'].count())
no_of_rating.head(29)


Unnamed: 0_level_0,rating,NumberOfRatings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41
2 Days in the Valley (1996),3.225806,93
"20,000 Leagues Under the Sea (1954)",3.5,72
2001: A Space Odyssey (1968),3.969112,259
3 Ninjas: High Noon At Mega Mountain (1998),1.0,5
"39 Steps, The (1935)",4.050847,59


In [12]:
#A pivot table is a powerful data summarization tool that is used in data processing to organize, group, and aggregate data.
#Which movie did each user rate and what was their rating5?
each_user_rating=data.pivot_table(index='user_id', columns='title' , values='rating')
each_user_rating.head(20)

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,
6,,,,4.0,,,,5.0,,,...,,,,4.0,,,,,,
7,,,,4.0,,,5.0,5.0,,4.0,...,,,,5.0,3.0,,3.0,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,4.0,...,,,,,,,,,,


In [13]:
#Lets correlate the movies to one to find similar movies based on the ratings
#Finding movie correlations in the context of a recommender system helps us identify how similar the rating patterns of different movies are. 
selected_movie=each_user_rating['Godfather, The (1972)']
similar=each_user_rating.corrwith(selected_movie)
same_movies=pandas.DataFrame(similar,columns=['correlations'])
same_movies.sort_values('correlations',ascending=False).head(50)
#A correlation value of 1 means there is a perfect positive correlation between the ratings of two movies.
#A correlation value of 0 means there is no correlation between the ratings of two movies. The ratings of one movie do not predict the ratings of the other movie. This suggests no particular relationship in terms of user preferences.
#A correlation value of -1 means there is a perfect negative correlation between the ratings of two movies. When users rate one movie highly, they rate the other movie poorly, and vice versa.


  c /= stddev[:, None]
  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[None, :]


Unnamed: 0_level_0,correlations
title,Unnamed: 1_level_1
Dark City (1998),1.0
8 Seconds (1994),1.0
Talking About Sex (1994),1.0
"Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)",1.0
"Bye Bye, Love (1995)",1.0
"Locusts, The (1997)",1.0
Kicked in the Head (1997),1.0
"Outlaw, The (1943)",1.0
"Beans of Egypt, Maine, The (1994)",1.0
"Last Time I Saw Paris, The (1954)",1.0


In [14]:
#Lets add another column to show the number of ratings per movie in the dataframe above
same_movies=same_movies.join(no_of_rating['NumberOfRatings'])
same_movies.head(20)

Unnamed: 0_level_0,correlations,NumberOfRatings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),0.612372,9
1-900 (1994),-0.471405,5
101 Dalmatians (1996),0.084183,109
12 Angry Men (1957),0.034258,125
187 (1997),0.467335,41
2 Days in the Valley (1996),0.229871,93
"20,000 Leagues Under the Sea (1954)",0.095229,72
2001: A Space Odyssey (1968),0.305717,259
3 Ninjas: High Noon At Mega Mountain (1998),,5
"39 Steps, The (1935)",0.006306,59


In [20]:
#We group those with 200 ratings and above
same_movies[same_movies['NumberOfRatings']>200].sort_values('correlations',ascending=False).head(10)



Unnamed: 0_level_0,correlations,NumberOfRatings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",1.0,413
"Godfather: Part II, The (1974)",0.683862,209
GoodFellas (1990),0.421477,226
"People vs. Larry Flynt, The (1996)",0.393439,215
Apocalypse Now (1979),0.374378,221
Dead Man Walking (1995),0.360525,299
Psycho (1960),0.336903,239
Field of Dreams (1989),0.309903,212
2001: A Space Odyssey (1968),0.305717,259
Amadeus (1984),0.300868,276
