> #### Performing Data Preparation

In [7]:
import pandas as pd
pd.set_option("display.max_columns", 20)
pd.pandas.set_option('display.width', 300)

In [11]:
#movie = pd.open("C:/Users/ashua/Desktop/Movie/movies.csv")
movie = pd.read_csv("C:/Users/ashua/Desktop/Movie/movies.csv")
movie.shape

(62423, 3)

In [12]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
rating = pd.read_csv("C:/Users/ashua/Desktop/Movie/ratings.csv")
rating.shape

(25000095, 4)

In [16]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


Concatenating the 2 datasets

In [17]:
df = movie.merge(rating, how="left", on="movieId")
df.shape

(25003471, 6)

In [18]:
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,3.5,1141416000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1439472000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,3.0,1573944000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,858625900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,890492500.0


In [19]:
df["title"].nunique()

62325

In [20]:
df["title"].value_counts().head()

Forrest Gump (1994)                 81491
Shawshank Redemption, The (1994)    81482
Pulp Fiction (1994)                 79672
Silence of the Lambs, The (1991)    74127
Matrix, The (1999)                  72674
Name: title, dtype: int64

In [21]:
comment_counts = pd.DataFrame(df["title"].value_counts())
rare_movies = comment_counts[comment_counts["title"] <= 1000].index
common_movies = df[~df["title"].isin(rare_movies)] 
common_movies.shape

(22138587, 6)

In [22]:
common_movies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,3.5,1141416000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1439472000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,3.0,1573944000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,858625900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,890492500.0


In [23]:
common_movies["title"].nunique()

3790

In [24]:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
user_movie_df.shape

(162539, 3790)

In [25]:
user_movie_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,2.0,...,,4.0,3.5,,,,,3.5,,
4.0,,,,,,,,,,,...,,,4.0,4.5,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,


In [26]:
user_movie_df.columns
len(user_movie_df.columns)

3790

In [27]:
common_movies["title"].nunique()

3790

> #### Determining the movies watched by the user to be recommended.

Selecting a random user

In [28]:
random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).values)
random_user_df = user_movie_df[user_movie_df.index == random_user]
random_user_df.shape

(1, 3790)

In [29]:
#Let's look at the movies watched by the user we have chosen:

movies_watched = random_user_df.columns[random_user_df.notna().any()].to_list()

In [30]:
len(movies_watched)

45

In [31]:
user_movie_df.loc[user_movie_df.index == random_user, user_movie_df.columns =="Jurassic Park (1993)"]

title,Jurassic Park (1993)
userId,Unnamed: 1_level_1
120549.0,


> #### Access data and IDs of other users watching the same movies.

In [32]:
pd.set_option("display.max_columns", 5)
movies_watched_df = user_movie_df[movies_watched]
movies_watched_df.shape

(162539, 45)

In [33]:
movies_watched_df.head()

title,"6th Day, The (2000)",About Schmidt (2002),...,What Dreams May Come (1998),White Christmas (1954)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,,,...,,
2.0,,,...,,
3.0,3.5,4.0,...,,
4.0,,,...,,
5.0,,,...,,


In [34]:
user_movie_count = movies_watched_df.T.notnull().sum()

In [35]:
user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ["userId", "movie_count"]
user_movie_count.shape

(162539, 2)

In [36]:
user_movie_count.head()

Unnamed: 0,userId,movie_count
0,1.0,4
1,2.0,16
2,3.0,25
3,4.0,8
4,5.0,4


In [37]:
user_movie_count[user_movie_count["movie_count"] > 20].sort_values("movie_count", ascending=False)
user_movie_count[user_movie_count["movie_count"] == len(movies_watched)].count()

userId         6
movie_count    6
dtype: int64

> #### Identify the users who are most similar to the user to be suggested.

In [38]:
perc = len(movies_watched) * 60 / 100
perc

27.0

In [39]:
#person ids who watched 60% of the same movie as the user
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
users_same_movies.count()

2812

In [40]:
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies.index)],random_user_df[movies_watched]])
final_df.shape

(2813, 45)

In [41]:
final_df.T.corr()


userId,170.0,225.0,...,162513.0,120549.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
170.0,,,...,,
225.0,,,...,,
425.0,,,...,,
430.0,,,...,,
547.0,,,...,,-0.183756
...,...,...,...,...,...
162268.0,,,...,0.411765,0.326989
162384.0,,,...,,
162492.0,,,...,-1.000000,-0.218218
162513.0,,,...,1.000000,0.095050


In [42]:
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ["user_id_1", "user_id_2"]
corr_df = corr_df.reset_index()

In [43]:
# Let's look at those that have a 65% correlation with the user:
top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.65)][["user_id_2", "corr"]].reset_index(drop=True)

# Let's take a look at the ones with the least correlation with #user:
top_users = top_users.sort_values(by="corr", ascending=False)

top_users.rename(columns={"user_id_2":"userId"}, inplace=True)
top_users.shape

(16, 2)

In [44]:
top_users.head()

Unnamed: 0,userId,corr
15,132648.0,0.925057
14,72853.0,0.924678
13,114331.0,0.904875
12,51922.0,0.881917
11,71574.0,0.801784


In [45]:
# rating = pd.read_csv("rating.csv")
top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how="inner")
top_users_ratings = top_users_ratings[top_users_ratings["userId"] != random_user]
top_users_ratings["userId"].unique()

array([132648.,  72853., 114331.,  51922.,  71574.,  46447., 100361.,
        63768.,  89897.,  14398., 136631.,  18202.,  79159.,  36615.,
       121470.,  16550.])

> #### Calculating Weighted Average Recommendation Score and keeping the first 5 movies.

In [46]:
top_users_ratings["weighted_rating"] = top_users_ratings["corr"] * top_users_ratings["rating"]
top_users_ratings.head()

Unnamed: 0,userId,corr,movieId,rating,weighted_rating
0,132648.0,0.925057,1,4.5,4.162757
1,132648.0,0.925057,6,3.5,3.2377
2,132648.0,0.925057,16,4.5,4.162757
3,132648.0,0.925057,17,4.5,4.162757
4,132648.0,0.925057,36,4.5,4.162757


In [47]:
top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"}) #singularization by movies

Unnamed: 0_level_0,weighted_rating
movieId,Unnamed: 1_level_1
1,2.615046
2,1.580279
3,2.800266
6,2.754260
7,0.881917
...,...
189333,3.167062
191695,3.619499
191997,3.619499
194448,3.167062


In [48]:
recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()
recommendation_df.head()

Unnamed: 0,movieId,weighted_rating
0,1,2.615046
1,2,1.580279
2,3,2.800266
3,6,2.75426
4,7,0.881917


In [49]:
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3.5].sort_values("weighted_rating", ascending=False)

> #### Making an Item-based suggestion based on the name of the movie that the user has watched with the highest score.

> 5 recommendations user-based. <br>
5 suggestions item-based. <br>
Make 10 suggestions.

In [50]:
movies_to_be_recommend.merge(movie[["movieId", "title"]])["title"].head()

0                               Labyrinth (1986)
1                      Enemy of the State (1998)
2    My Life as a Dog (Mitt liv som hund) (1985)
3                            Citizen Kane (1941)
4                                   Dumbo (1941)
Name: title, dtype: object

In [51]:
'''
user = 28941
movie_id = rating[(rating["userId"] == user) & (rating["rating"] == 5.0)].sort_values(by="timestamp", ascending = False)["movieId"][0:1].values[0]
'''

'\nuser = 28941\nmovie_id = rating[(rating["userId"] == user) & (rating["rating"] == 5.0)].sort_values(by="timestamp", ascending = False)["movieId"][0:1].values[0]\n'

In [52]:
'''
movie_name = movie[movie["movieId"] == movie_id]["title"].values[0]
movie_name = user_movie_df[movie_name]
movies_from_item_based = user_movie_df.corrwith(movie_name).sort_values(ascending=False)
movies_from_item_based[1:6].index
'''

'\nmovie_name = movie[movie["movieId"] == movie_id]["title"].values[0]\nmovie_name = user_movie_df[movie_name]\nmovies_from_item_based = user_movie_df.corrwith(movie_name).sort_values(ascending=False)\nmovies_from_item_based[1:6].index\n'