# Content-based filtering


In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date
from datetime import timedelta

In [2]:
movies = pd.read_csv("movies.csv")  # reading movie data set

In [3]:
rating_raw = pd.read_csv("ratings.csv") # reading ratings data set

In [4]:
rating_raw.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# creating a new column Date using the unix 'timestamp' column
rating = rating_raw.copy()
rating["Date"] = [datetime.fromtimestamp(convert).date() for convert in rating["timestamp"]]

In [6]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,Date
0,1,1,4.0,964982703,2000-07-31
1,1,3,4.0,964981247,2000-07-30
2,1,6,4.0,964982224,2000-07-31
3,1,47,5.0,964983815,2000-07-31
4,1,50,5.0,964982931,2000-07-31


**Creating a new column 'Days_Since_Seen' using the 'Date' column to determine number of days since the movie is watched**

In [7]:
recent_date = rating["Date"].max() + timedelta(days=1)  # Adding one day to the max day to find the difference
rating['Days_Since_Seen'] = rating['Date'].apply(lambda x: (recent_date - x).days)  # Creating a new column to check when they have watched that movie

In [8]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,Date,Days_Since_Seen
0,1,1,4.0,964982703,2000-07-31,6630
1,1,3,4.0,964981247,2000-07-30,6631
2,1,6,4.0,964982224,2000-07-31,6630
3,1,47,5.0,964983815,2000-07-31,6630
4,1,50,5.0,964982931,2000-07-31,6630


In [9]:
rating.describe()

Unnamed: 0,userId,movieId,rating,timestamp,Days_Since_Seen
count,100836.0,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0,3841.52515
std,182.618491,35530.987199,1.042529,216261000.0,2503.057027
min,1.0,1.0,0.5,828124600.0,1.0
25%,177.0,1199.0,3.0,1019124000.0,1179.0
50%,325.0,2991.0,3.5,1186087000.0,4071.0
75%,477.0,8122.0,4.0,1435994000.0,6004.0
max,610.0,193609.0,5.0,1537799000.0,8214.0


# Data Prepocessing

In [10]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


**Splitting the 'title' column and Idetifying the unique genres by using the concept of sets.**

In [11]:
genre_set = set()     # creating empty set to store unique genres
for i in movies["genres"]:   #
    split = i.split("|")
    genre_set.update(split)
print(genre_set)

{'Musical', 'War', 'Animation', 'Drama', 'Romance', 'Sci-Fi', 'Film-Noir', 'Western', 'Documentary', 'Action', 'Crime', 'Adventure', 'Comedy', 'IMAX', 'Fantasy', 'Children', 'Thriller', 'Mystery', '(no genres listed)', 'Horror'}


**Creating the columns for all the genres**

In [12]:
for genre in genre_set:
    movies[genre] = 0
movies.head()

Unnamed: 0,movieId,title,genres,Musical,War,Animation,Drama,Romance,Sci-Fi,Film-Noir,...,Crime,Adventure,Comedy,IMAX,Fantasy,Children,Thriller,Mystery,(no genres listed),Horror
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**If particular genre is present column value is assigned to 1**

In [13]:
for index, row in movies.iterrows():
    for current_genre in genre_set:
        if current_genre in row['genres']:
            movies.loc[index,current_genre] = 1

In [14]:
movies_enc = movies.iloc[:,3:]

In [15]:
movies_enc.head()

Unnamed: 0,Musical,War,Animation,Drama,Romance,Sci-Fi,Film-Noir,Western,Documentary,Action,Crime,Adventure,Comedy,IMAX,Fantasy,Children,Thriller,Mystery,(no genres listed),Horror
0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


## Creating Movie - Movie Similarity Matrix

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
cosine_mov = pd.DataFrame(cosine_similarity(movies_enc))  # By using cosine similarity created a matrix

In [18]:
cosine_mov.index = movies["title"]
cosine_mov.columns = movies["title"]

In [19]:
cosine_mov.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,0.774597,0.316228,0.258199,0.447214,0.0,0.316228,0.632456,0.0,0.258199,...,0.447214,0.316228,0.316228,0.447214,0.0,0.67082,0.774597,0.0,0.316228,0.447214
Jumanji (1995),0.774597,1.0,0.0,0.0,0.0,0.0,0.0,0.816497,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.288675,0.333333,0.0,0.0,0.0
Grumpier Old Men (1995),0.316228,0.0,1.0,0.816497,0.707107,0.0,1.0,0.0,0.0,0.0,...,0.353553,0.0,0.5,0.0,0.0,0.353553,0.408248,0.0,0.0,0.707107
Waiting to Exhale (1995),0.258199,0.0,0.816497,1.0,0.57735,0.0,0.816497,0.0,0.0,0.0,...,0.288675,0.408248,0.816497,0.0,0.0,0.288675,0.333333,0.57735,0.0,0.57735
Father of the Bride Part II (1995),0.447214,0.0,0.707107,0.57735,1.0,0.0,0.707107,0.0,0.0,0.0,...,0.5,0.0,0.707107,0.0,0.0,0.5,0.57735,0.0,0.0,1.0


## Creating a movie list with movie ID and title

In [20]:
movies_list = movies.iloc[:,0:2]

In [21]:
movies_list.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


**Creating a new column by idetifying the year from the title of the movie using split operation**

In [22]:
# creating a new column with the year of release
movies_list["Year"] = movies_list['title'].apply(lambda x: x.split()[-1][1:-1])

In [23]:
movies_list["Year"].unique()

array(['1995', '1994', '1996', '1976', '1992', '1967', '1993', '1964',
       '1977', '1965', '1982', '1990', '1991', '1989', '1937', '1940',
       '1969', '1981', '1973', '1970', '1955', '1959', '1968', '1988',
       '1997', '1972', '1943', '1952', '1951', '1957', '1961', '1958',
       '1954', '1934', '1944', '1960', '1963', '1942', '1941', '1953',
       '1939', '1950', '1946', '1945', '1938', '1947', '1935', '1936',
       '1956', '1949', '1932', '1975', '1974', '1971', '1979', '1987',
       '1986', '1980', '1978', '1985', '1966', '1962', '1983', '1984',
       '1948', '1933', '1931', '1922', '1998', '1929', '1930', '1927',
       '1928', '1999', '2000', '1926', '1919', '1921', '1925', '1923',
       '2001', '2002', '2003', '1920', '1915', '1924', '2004', '1916',
       '1917', '2005', '2006', '1902', '', '1903', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '1908', 'n', 'oa', 'atso', 'nimal', 'aterso', 'oonligh',
 

In [24]:
movies_list.loc[(movies_list["Year"] ==  ''),]

Unnamed: 0,movieId,title,Year
6059,40697,Babylon 5,
9448,167570,The OA,
9525,171891,Generation Iron 2,


**Identified few movies for which released year is not mentioned. Googled and determined the release years of those movies. Will replace it in the year column**


- Ready Player One : 2018
- Hyena Road : 2015
- The Adventures of Sherlock Holmes and Doctor : 1980
- Nocturnal Animals : 2016
- Paterson : 2016
- Moonlight	: 2016
- Cosmos : 2015
- Maria Bamford: Old Baby : 2017
- Death Note: Desu nôto (2006–2007) : 2006
- Black Mirror : 2011
- Babylon 5 : 1993
- The OA : 2016
- Generation Iron 2 : 2017

In [25]:
# Replacing the year for the missed columns
movies_list.at[9031,'Year'] = '2018'
movies_list.at[9091,'Year'] = '2015'
movies_list.at[9138,'Year'] = '1980'
movies_list.at[9179,'Year'] = '2016'
movies_list.at[9259,'Year'] = '2016'
movies_list.at[9367,'Year'] = '2016'
movies_list.at[9514,'Year'] = '2015'
movies_list.at[9515,'Year'] = '2017'
movies_list.at[9518,'Year'] = '2006'
movies_list.at[9611,'Year'] = '2011'
movies_list.at[6059,'Year'] = '1993'
movies_list.at[9448,'Year'] = '2016'
movies_list.at[9525,'Year'] = '2017'

**Obtained unique release years after updates**

In [26]:
movies_list["Year"].unique()

array(['1995', '1994', '1996', '1976', '1992', '1967', '1993', '1964',
       '1977', '1965', '1982', '1990', '1991', '1989', '1937', '1940',
       '1969', '1981', '1973', '1970', '1955', '1959', '1968', '1988',
       '1997', '1972', '1943', '1952', '1951', '1957', '1961', '1958',
       '1954', '1934', '1944', '1960', '1963', '1942', '1941', '1953',
       '1939', '1950', '1946', '1945', '1938', '1947', '1935', '1936',
       '1956', '1949', '1932', '1975', '1974', '1971', '1979', '1987',
       '1986', '1980', '1978', '1985', '1966', '1962', '1983', '1984',
       '1948', '1933', '1931', '1922', '1998', '1929', '1930', '1927',
       '1928', '1999', '2000', '1926', '1919', '1921', '1925', '1923',
       '2001', '2002', '2003', '1920', '1915', '1924', '2004', '1916',
       '1917', '2005', '2006', '1902', '1903', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '1908'], dtype=object)

In [27]:
movies_list.dtypes

movieId     int64
title      object
Year       object
dtype: object

In [28]:
# Converting Year from object data type to int64 data type to perform arthimetic operations
movies_list["Year"] = movies_list["Year"].astype('int64')

In [29]:
movies_list.dtypes

movieId     int64
title      object
Year        int64
dtype: object

In [30]:
movies_list.sample(5)

Unnamed: 0,movieId,title,Year
7360,78637,Shrek Forever After (a.k.a. Shrek: The Final C...,2010
6020,38198,Darwin's Nightmare (2004),2004
6699,58306,Mongol (2007),2007
7317,77233,"Union: The Business Behind Getting High, The (...",2007
367,423,Blown Away (1994),1994


## Top Movies

In [31]:
## Standardizing the rating column

In [32]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,Date,Days_Since_Seen
0,1,1,4.0,964982703,2000-07-31,6630
1,1,3,4.0,964981247,2000-07-30,6631
2,1,6,4.0,964982224,2000-07-31,6630
3,1,47,5.0,964983815,2000-07-31,6630
4,1,50,5.0,964982931,2000-07-31,6630


In [33]:
rating_std = rating.copy()

In [34]:
#Standardizing the raatings for treating easy and tough raters
for user_id in rating_std["userId"].unique():   
    mean = np.mean(rating_std.loc[(rating_std["userId"] == user_id),"rating"])
    std = np.std(rating_std.loc[(rating_std["userId"] == user_id),"rating"]) 
    rating_std.loc[(rating_std["userId"] == user_id),"std_rating"] = (rating_std.loc[(rating_std["userId"] == user_id),"rating"] - mean)/std

In [35]:
# Stadardizing rating data frame to handle tough and easy raters
#rating_std["rating"] = round((rating_std["rating"] - rating_std["rating"].mean())/rating_std["rating"].std(),2)

In [36]:
# Standardized data set which will be used to determine the top movies
rating_std.describe()

Unnamed: 0,userId,movieId,rating,timestamp,Days_Since_Seen,std_rating
count,100836.0,100836.0,100836.0,100836.0,100836.0,100816.0
mean,326.127564,19435.295718,3.501557,1205946000.0,3841.52515,7.554485e-18
std,182.618491,35530.987199,1.042529,216261000.0,2503.057027,1.000005
min,1.0,1.0,0.5,828124600.0,1.0,-5.905713
25%,177.0,1199.0,3.0,1019124000.0,1179.0,-0.6187236
50%,325.0,2991.0,3.5,1186087000.0,4071.0,0.1271893
75%,477.0,8122.0,4.0,1435994000.0,6004.0,0.6909125
max,610.0,193609.0,5.0,1537799000.0,8214.0,3.01232


In [37]:
rating_std.isnull().sum()

userId              0
movieId             0
rating              0
timestamp           0
Date                0
Days_Since_Seen     0
std_rating         20
dtype: int64

In [38]:
rating_std['std_rating'] = rating_std['std_rating'].fillna(rating_std['std_rating'].mean())

In [39]:
rating_std.isnull().sum()

userId             0
movieId            0
rating             0
timestamp          0
Date               0
Days_Since_Seen    0
std_rating         0
dtype: int64

In [40]:
#np.percentile(rating_std["rating"],75)

In [41]:
#rating.loc[(rating["rating"] == 4.5),]

In [42]:
#rating_std.loc[(rating_std["std_rating"] >= 0.96),]

In [43]:
#rating_std.loc[(rating_std["std_rating"] >= np.percentile(rating_std["std_rating"],90)),]

In [44]:
#rating.loc[(rating["rating"] > np.percentile(rating["rating"],75)),]

In [45]:
rating_std.describe()

Unnamed: 0,userId,movieId,rating,timestamp,Days_Since_Seen,std_rating
count,100836.0,100836.0,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0,3841.52515,7.552987e-18
std,182.618491,35530.987199,1.042529,216261000.0,2503.057027,0.9999058
min,1.0,1.0,0.5,828124600.0,1.0,-5.905713
25%,177.0,1199.0,3.0,1019124000.0,1179.0,-0.6187236
50%,325.0,2991.0,3.5,1186087000.0,4071.0,0.1271893
75%,477.0,8122.0,4.0,1435994000.0,6004.0,0.6909125
max,610.0,193609.0,5.0,1537799000.0,8214.0,3.01232


#### Determining top movies by adding filters

**Defining a function top_movies to obtaing the top n movies which are rated by the user.**

**Firstly wanted to know the max rating given by the user, but here I have decided to obtain the max rating only for the recently seen movies. For recency , here I have taken the 50th percentile of the days since.**

**And the first set of movies contains the movies which are rated 0.96 and more, where as second set contains movies with 0.48.**


In [46]:
def top_movies(user_id,top = 5):
    max_rating = rating_std.loc[(rating_std.userId == user_id) & (rating_std.Days_Since_Seen <= np.percentile(rating_std.Days_Since_Seen,25)),"std_rating"].max()   # finding the max rating of the movies which are recent as we wanted to suggest only recent movies
    if max_rating < np.percentile(rating_std["std_rating"],75):  # Checking if the max rating is in our range
        return(None)
    else:
        movie_set1 = rating_std.loc[(rating_std.userId == user_id) & (rating_std.rating >= np.percentile(rating_std["std_rating"],90)),"movieId"]  # This is first set of movies which are having rating more than 90 percentile
        movie_set2 = rating_std.loc[(rating_std.userId == user_id) & (rating_std.rating >= np.percentile(rating_std["std_rating"],75)) & (rating_std.rating < np.percentile(rating_std["std_rating"],90)),"movieId"] # This is the second set of movies which are having rating more than 75 and less than 90 
        
        if len(movie_set1) > top:    # If first set contains movies more than top selectig random movies
            return(list(np.random.choice(movie_set1,top)))
        
        elif (len(movie_set1) + len(movie_set2)) <= top:   # If movie set1 and set 2 is having the movies less than or equal to top movies, returning them
            return(list(movie_set1)+len(movie_set2))
        
        else:
            return(list(movie_set1) + list(np.random.choice(movie_set2,top - len(movie_set1))))

In [47]:
top_movies(111,10)

[73017, 99117, 31685, 5363, 6287, 56174, 5564, 93326, 152081, 7451]

## Finding Similar movies

- **Created a function for obtaining similar movies with parameters movie_id and similarity**

- **Initially determined the title of the movie and when it was released**

- **After obtaining the similar movies using cosine simililarity matrix, I've implemented some changes.**

- **From the obtained similar movies, I have futher shortlisted movies by only selecting the movies which are released only +5 or -5 years from the date of release. For example If the movie is release is 1990 then I have slected the movies similar movies which are only released in the time frame of (1985-1995)**

In [48]:
def get_similar_movies(movie_id,sim=0.9):   #
    
    # Fetching the title
    title = np.array(movies_list.loc[(movies_list.movieId == movie_id),'title'])[0]  # obtaining the title of the movieid
    year = np.array(movies_list.loc[(movies_list.movieId == movie_id),'Year'])[0]  # obtaining the year it is released
    
    # Obtaining the scores for that movie
    scores = np.array(cosine_mov.loc[title,:])
    
    # Obtaining the related movies
    similar = cosine_mov.loc[title,:][cosine_mov.loc[title,:] > sim].index
    
    similar2 = []
    for i in similar:   # Iterating similar movies to get the movies which are released in a 10 year window from the release year of movie Id
        if np.array(movies_list.loc[(movies_list.title == i),'Year'])[0] in range(year-5,year+5,1):
            similar2.append(i)
    
    return(similar2)

In [49]:
get_similar_movies(2959,0.95)

['Crossing Guard, The (1995)',
 'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
 'Clear and Present Danger (1994)',
 'Last Man Standing (1996)',
 'Face/Off (1997)',
 'Cop Land (1997)',
 'Corruptor, The (1999)',
 'Double Jeopardy (1999)',
 'Fight Club (1999)',
 'Boondock Saints, The (2000)',
 'Spy Game (2001)',
 'Dark Blue (2003)',
 'Cradle 2 the Grave (2003)',
 'Man Apart, A (2003)',
 'Infernal Affairs 2 (Mou gaan dou II) (2003)']

## Recommendation

### User's watch list 

In [50]:
## Obtaining Users watcher list as we should not suggest already watched movies.

In [51]:
def user_watch_list(user_id):
    index = list(rating_std.loc[(rating_std["userId"] == user_id),'movieId'])
    return([np.array(movies_list.loc[(movies_list['movieId']== i),].title)[0] for i in index])

In [52]:
user_watch_list(111)

['Father of the Bride Part II (1995)',
 'Casino (1995)',
 'Powder (1995)',
 'Babe (1995)',
 'Clueless (1995)',
 'Pocahontas (1995)',
 'Bio-Dome (1996)',
 'Happy Gilmore (1996)',
 'Bad Boys (1995)',
 'Mallrats (1995)',
 'Showgirls (1995)',
 'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
 'Shawshank Redemption, The (1994)',
 'Tank Girl (1995)',
 'Forrest Gump (1994)',
 'Lion King, The (1994)',
 'Addams Family Values (1993)',
 'Jurassic Park (1993)',
 'Mrs. Doubtfire (1993)',
 'Nightmare Before Christmas, The (1993)',
 'Brady Bunch Movie, The (1995)',
 'Aladdin (1992)',
 'Silence of the Lambs, The (1991)',
 'Beauty and the Beast (1991)',
 'Pretty Woman (1990)',
 'Space Jam (1996)',
 'Multiplicity (1996)',
 'Cable Guy, The (1996)',
 'Matilda (1996)',
 'Cool Runnings (1993)',
 'Mary Poppins (1964)',
 'Alice in Wonderland (1951)',
 'Die Hard (1988)',
 'Swingers (1996)',
 'Dirty Dancing (1987)',
 'Princess Bride, The (1987)',
 'Clockwork Orange, A (1971)',
 'Fantasia (1940)',
 '10

### Content Based recommender System

In [53]:
# obtaining top movies
n = 5
user = 1
top = top_movies(user,n)
print(top)

[3729, 2640, 2948, 2078, 1573]


In [54]:
# Obtain similar movies for top movies
sim_movies_set = set()
for i in top:
    sim_movies_set.update(set(get_similar_movies(i))) 

In [55]:
# Unwatched movie set
unwatched = sim_movies_set - set(user_watch_list(111))

In [56]:
# Recommended movies
print(list(unwatched)) 

['Shaft (1971)', 'Face/Off (1997)', '3 dev adam (Three Giant Men) (1973) ', 'Bullitt (1968)', "Logan's Run (1976)", 'Last Man Standing (1996)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Point Blank (1967)', 'Real McCoy, The (1993)', 'Léon: The Professional (a.k.a. The Professional) (Léon) (1994)', 'Tron (1982)', 'Flash Gordon (1980)', 'Cop Land (1997)', 'Clear and Present Danger (1994)', 'Patriot Games (1992)', 'Blood In, Blood Out (1993)', 'Superman (1978)', 'Getaway, The (1972)', 'Magnum Force (1973)', 'Thunderball (1965)', 'Double Jeopardy (1999)', 'Spy Game (2001)', 'Goldfinger (1964)', 'Dr. No (1962)', 'Juice (1992)', 'Hard-Boiled (Lat sau san taam) (1992)', 'Mad Max (1979)', 'From Russia with Love (1963)', 'Star Wars: Episode IV - A New Hope (1977)', 'Get Carter (1971)', 'Crossing Guard, The (1995)', 'French Connection II (1975)', 'Corruptor, The (1999)']


# Assignment - 2

# Collaborative Filtering

In [57]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date
from datetime import timedelta

In [58]:
movies = pd.read_csv("movies.csv")  # reading movie data set

In [59]:
rating = pd.read_csv("ratings.csv") # reading ratings data set

In [60]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [61]:
movies.shape, len(movies.title.unique())

((9742, 3), 9737)

## Creating dataframe

**Obtaining the unique movies using sets**

In [62]:
movie_set = set()   #Creating empty set
for i in movies["title"]:  #iterating all movies in title column
    movie_set.add(i)       # appending movies to set

In [63]:
len(movie_set)

9737

**Obtaining unique user Id's from rating data set**

In [64]:
user_set = set()           # creating empty set
for i in rating["userId"]:  # Iterating userIds
    user_set.add(i)         # Appending user Ids to set
print(user_set)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

In [65]:
movies.head(6)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller


**Creating a new data frame with column names as unique movie names and rows as uniques user Ids**

In [66]:
new_dataframe = pd.DataFrame({'Users' : list(user_set) })    # Creating a data frame with column of uniques users
for movie in movie_set:
    new_dataframe[movie] = None                           # appending the data frame with columns of uniques user names and storing Null values to it

In [67]:
new_dataframe

Unnamed: 0,Users,Harvard Man (2001),"Brave One, The (2007)",District 13: Ultimatum (Banlieue 13 - Ultimatum) (2009),Code 46 (2003),This Must Be the Place (2011),"Pirate Movie, The (1982)",Hannah and Her Sisters (1986),Modern Times (1936),"Big Red One, The (1980)",...,Oculus (2013),Beauty and the Beast (2017),Moulin Rouge (2001),Moonraker (1979),Transformers: The Movie (1986),Dog Day Afternoon (1975),Be Cool (2005),To Live and Die in L.A. (1985),Requiem for the American Dream (2015),Along Came Polly (2004)
0,1,,,,,,,,,,...,,,,,,,,,,
1,2,,,,,,,,,,...,,,,,,,,,,
2,3,,,,,,,,,,...,,,,,,,,,,
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606,,,,,,,,,,...,,,,,,,,,,
606,607,,,,,,,,,,...,,,,,,,,,,
607,608,,,,,,,,,,...,,,,,,,,,,
608,609,,,,,,,,,,...,,,,,,,,,,


In [68]:
new_dataframe = new_dataframe.set_index("Users")    # Setting index with Userid

In [69]:
new_dataframe

Unnamed: 0_level_0,Harvard Man (2001),"Brave One, The (2007)",District 13: Ultimatum (Banlieue 13 - Ultimatum) (2009),Code 46 (2003),This Must Be the Place (2011),"Pirate Movie, The (1982)",Hannah and Her Sisters (1986),Modern Times (1936),"Big Red One, The (1980)",Shadow of the Thin Man (1941),...,Oculus (2013),Beauty and the Beast (2017),Moulin Rouge (2001),Moonraker (1979),Transformers: The Movie (1986),Dog Day Afternoon (1975),Be Cool (2005),To Live and Die in L.A. (1985),Requiem for the American Dream (2015),Along Came Polly (2004)
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


**Filling the ratings of the movies for respective user**

In [70]:
for i in user_set:                                      # Iterating user set
    for movie in rating.loc[(rating.userId == i),'movieId']:    # iterating movies of a particular user set
        movie_title = np.array(movies.loc[(movies.movieId == movie),'title'])[0]  # obtaining title 
        new_dataframe.at[i,movie_title] = np.array(rating.loc[(rating.movieId == movie),'rating'])[0]   # Storing the rating in the respective column

In [71]:
new_dataframe

Unnamed: 0_level_0,Harvard Man (2001),"Brave One, The (2007)",District 13: Ultimatum (Banlieue 13 - Ultimatum) (2009),Code 46 (2003),This Must Be the Place (2011),"Pirate Movie, The (1982)",Hannah and Her Sisters (1986),Modern Times (1936),"Big Red One, The (1980)",Shadow of the Thin Man (1941),...,Oculus (2013),Beauty and the Beast (2017),Moulin Rouge (2001),Moonraker (1979),Transformers: The Movie (1986),Dog Day Afternoon (1975),Be Cool (2005),To Live and Die in L.A. (1985),Requiem for the American Dream (2015),Along Came Polly (2004)
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,4,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,1,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,1.5,4,,,...,,,1,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,1,,,2.5,,,,
609,,,,,,,,,,,...,,,,,,,,,,


**Identified the columns which are not rated by any user and removing them**

In [72]:
to_be_removed = []    
for i in new_dataframe:
    if new_dataframe[i].isnull().sum() == 610:     # Calculating the columns with complete null values
        to_be_removed.append(i)
len(to_be_removed)

18

In [73]:
new_dataframe = new_dataframe.drop(to_be_removed,axis=1)    # Dropping the columns

In [74]:
new_dataframe.isnull().sum()

Harvard Man (2001)                                         609
Brave One, The (2007)                                      608
District 13: Ultimatum (Banlieue 13 - Ultimatum) (2009)    608
Code 46 (2003)                                             609
This Must Be the Place (2011)                              609
                                                          ... 
Dog Day Afternoon (1975)                                   572
Be Cool (2005)                                             602
To Live and Die in L.A. (1985)                             609
Requiem for the American Dream (2015)                      609
Along Came Polly (2004)                                    586
Length: 9719, dtype: int64

In [75]:
new_dataframe_to_filled = new_dataframe.copy() ## Dataframe saved which can be used later

**Normalizing the ratings of all the movies by subtracting with the mean which will handle easy and tough raters**

In [76]:
#Normalizing the ratings by subtracting the ratings with the mean

new_dataframe["mean"] = new_dataframe.mean(axis = 1)
new_dataframe = new_dataframe.sub(new_dataframe["mean"], axis = 0)
new_dataframe = new_dataframe.drop(["mean"], axis = 1)
new_dataframe

Unnamed: 0_level_0,Harvard Man (2001),"Brave One, The (2007)",District 13: Ultimatum (Banlieue 13 - Ultimatum) (2009),Code 46 (2003),This Must Be the Place (2011),"Pirate Movie, The (1982)",Hannah and Her Sisters (1986),Modern Times (1936),"Big Red One, The (1980)",Shadow of the Thin Man (1941),...,Oculus (2013),Beauty and the Beast (2017),Moulin Rouge (2001),Moonraker (1979),Transformers: The Movie (1986),Dog Day Afternoon (1975),Be Cool (2005),To Live and Die in L.A. (1985),Requiem for the American Dream (2015),Along Came Polly (2004)
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,-0.366379,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,-2.71759,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,-2.09193,0.408072,,,...,,,-2.59193,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,-2.33514,,,-0.835138,,,,
609,,,,,,,,,,,...,,,,,,,,,,


**Filling the Null values with 0**

In [77]:
matrix = new_dataframe.copy()

In [78]:
for column in matrix:
    matrix[column] = matrix[column].fillna(0)   #filling the remaining values with 0

In [79]:
matrix

Unnamed: 0_level_0,Harvard Man (2001),"Brave One, The (2007)",District 13: Ultimatum (Banlieue 13 - Ultimatum) (2009),Code 46 (2003),This Must Be the Place (2011),"Pirate Movie, The (1982)",Hannah and Her Sisters (1986),Modern Times (1936),"Big Red One, The (1980)",Shadow of the Thin Man (1941),...,Oculus (2013),Beauty and the Beast (2017),Moulin Rouge (2001),Moonraker (1979),Transformers: The Movie (1986),Dog Day Afternoon (1975),Be Cool (2005),To Live and Die in L.A. (1985),Requiem for the American Dream (2015),Along Came Polly (2004)
Users,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,-0.366379,0.000000,0.0,0.0,0.0,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,-2.717593,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
5,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,-2.091928,0.408072,0.0,0.0,...,0.000000,0.0,-2.591928,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
607,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
608,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,-2.335138,0.0,0.000000,-0.835138,0.0,0.0,0.0,0.000000
609,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000


## User-User Similarity Matrix

**Finding user-user cosine similarity matrix**

In [80]:
from sklearn.metrics.pairwise import cosine_similarity

In [81]:
cosine_user = pd.DataFrame(cosine_similarity(matrix))  # By using cosine similarity created a matrix

In [82]:
cosine_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
0,1.000000,0.022765,0.047222,0.160791,0.070811,0.076131,0.089966,0.068709,0.032004,0.027493,...,0.051529,0.083125,0.183383,0.022091,0.098608,0.141463,0.244212,0.225901,0.030703,0.076935
1,0.022765,1.000000,0.000000,0.004467,0.024211,0.026834,0.020980,0.024666,0.000000,0.068324,...,0.112927,0.011860,0.011778,0.000000,0.000000,0.015656,0.010608,0.027037,0.029979,0.051009
2,0.047222,0.000000,1.000000,0.030707,0.034665,0.079550,0.000000,0.035279,0.000000,0.000000,...,0.080050,0.063468,0.116653,0.000000,0.015086,0.100687,0.109959,0.101195,0.000000,0.055535
3,0.160791,0.004467,0.030707,1.000000,0.099680,0.170391,0.105131,0.133128,0.014235,0.037584,...,0.149480,0.169786,0.372273,0.045037,0.057085,0.243993,0.154488,0.199733,0.024988,0.110454
4,0.070811,0.024211,0.034665,0.099680,1.000000,0.317393,0.058818,0.351295,0.000000,0.009219,...,0.058829,0.370368,0.091076,0.156746,0.087680,0.113728,0.129757,0.100467,0.147077,0.055803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,0.141463,0.015656,0.100687,0.243993,0.113728,0.116255,0.227238,0.077098,0.078190,0.091954,...,0.141418,0.142690,0.360114,0.077995,0.135691,1.000000,0.135030,0.280480,0.052228,0.201210
606,0.244212,0.010608,0.109959,0.154488,0.129757,0.110672,0.132924,0.129636,0.019617,0.026853,...,0.034411,0.124003,0.233322,0.079584,0.090543,0.135030,1.000000,0.292759,0.067227,0.132578
607,0.225901,0.027037,0.101195,0.199733,0.100467,0.218487,0.295067,0.172539,0.114154,0.067092,...,0.089809,0.170286,0.254841,0.172859,0.151964,0.280480,0.292759,1.000000,0.078860,0.292272
608,0.030703,0.029979,0.000000,0.024988,0.147077,0.138792,0.017055,0.256092,0.000000,0.007398,...,0.012967,0.196404,0.051133,0.121288,0.024632,0.052228,0.067227,0.078860,1.000000,0.033077


In [83]:
cosine_user.index = list(user_set)   # Setting the index
cosine_user.columns = list(user_set) # Setting the columns

In [84]:
cosine_user.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
count,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,...,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0
mean,0.091142,0.03234,0.045758,0.108506,0.100162,0.112155,0.108594,0.13266,0.0385,0.071466,...,0.103542,0.129576,0.120322,0.076012,0.076283,0.12504,0.112675,0.168133,0.062269,0.119625
std,0.068208,0.052114,0.060462,0.077879,0.112012,0.122986,0.086641,0.152479,0.057266,0.075976,...,0.089173,0.140368,0.086464,0.102188,0.06786,0.082536,0.078729,0.106438,0.089277,0.091843
min,-0.003496,-0.000352,-0.000881,0.0,-0.002315,-0.001787,-0.003291,-0.011389,-0.005013,-0.000368,...,-0.000575,-0.001134,0.0,-0.007913,-0.000268,0.00256,0.0,0.0,-0.004804,0.0
25%,0.045163,0.0,0.0,0.054674,0.021418,0.030283,0.045895,0.03046,0.0,0.010679,...,0.036385,0.033688,0.060354,0.008894,0.033975,0.066906,0.057836,0.096186,0.011214,0.05316
50%,0.077189,0.020855,0.033779,0.096291,0.069611,0.067586,0.086457,0.083006,0.026151,0.057425,...,0.084419,0.081865,0.100154,0.040974,0.061801,0.106394,0.102333,0.146971,0.031887,0.096338
75%,0.126561,0.046225,0.074502,0.150546,0.121908,0.145525,0.159841,0.159334,0.05996,0.109002,...,0.154628,0.167401,0.154537,0.095989,0.10244,0.17162,0.151702,0.216964,0.072026,0.159846
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [85]:
np.fill_diagonal(cosine_user.values,0)   # Converting diagonal elements to 0 

In [86]:
cosine_user.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
count,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,...,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0,610.0
mean,0.089503,0.0307,0.044119,0.106867,0.098523,0.110516,0.106955,0.13102,0.036861,0.069827,...,0.101903,0.127937,0.118683,0.074373,0.074643,0.1234,0.111036,0.166494,0.060629,0.117986
std,0.057506,0.034312,0.046489,0.069114,0.105976,0.117683,0.078858,0.148461,0.041966,0.066048,...,0.08153,0.135956,0.078908,0.095117,0.056663,0.074687,0.070168,0.101176,0.080809,0.084754
min,-0.003496,-0.000352,-0.000881,0.0,-0.002315,-0.001787,-0.003291,-0.011389,-0.005013,-0.000368,...,-0.000575,-0.001134,0.0,-0.007913,-0.000268,0.0,0.0,0.0,-0.004804,0.0
25%,0.044892,0.0,0.0,0.054513,0.021131,0.030122,0.045568,0.029815,0.0,0.010611,...,0.036025,0.033435,0.060145,0.008666,0.033789,0.06668,0.057429,0.095357,0.011112,0.053092
50%,0.077031,0.020688,0.033586,0.096143,0.06946,0.067547,0.085774,0.082693,0.025776,0.056919,...,0.083911,0.080627,0.100084,0.040833,0.061607,0.106298,0.101798,0.146489,0.031613,0.095953
75%,0.126437,0.046003,0.074104,0.150251,0.120912,0.145025,0.158414,0.158935,0.059603,0.108413,...,0.153531,0.166238,0.154148,0.095598,0.101877,0.17124,0.15154,0.215711,0.071478,0.159263
max,0.261144,0.175751,0.214972,0.412834,0.481064,0.605812,0.369638,0.75315,0.261869,0.360043,...,0.423416,0.635778,0.397945,0.576011,0.299346,0.425538,0.375166,0.55859,0.513422,0.436204


## Defining some useful functions

**Creating a function which takes inputs(user_id and n), here n is number of similar movies required with highest cosine similarity value with the passed user id.**

In [87]:
def similar_users(user_id,nearest = 5):    # function head user_id : user for which whom we want to suggest, nearest : number of nearest cosine values 
    cos_dict = {}         # empty dic for storing key as id and value as cosine value
    j = 1
    for i in cosine_user.loc[user_id]:   # iterating user ids
        cos_dict[j] = i                  # Saving the dictionary with j(which is the equal to user id) as key and cosine value as column
        j += 1
    sorted_dict = sorted(cos_dict.items(), key=lambda x:x[1],reverse=True)   # sorting the dictionary on the basis of values
    sortdict = dict(sorted_dict)         # converting to dict
    count = 1
    similar_ids = []
    for i in sortdict:              # obtaining the top n similar user ids
        if count <= nearest:
            similar_ids.append(i)
            count += 1
    return(similar_ids)             # Returning the movie Ids

In [88]:
similar_list = similar_users(20,7)

In [89]:
similar_list

[525, 177, 381, 169, 274, 380, 68]

**Creating a function which takes inputs of userId, similar user list and number of movies to be suggested**

**Output will be top n movies suggested as per the rating with respect to the given user**

In [90]:
def suggested_movies(user_id,similar_list,n = 20):  # function def which takes the input as userid,similar user list and number of suggested movies
    user_data = new_dataframe_to_filled.loc[user_id].copy()
    watched_list = list(user_data.dropna().index)    # obtaining the watches list
    similar_list.append(user_id)                   # Adding the user_id to the similar list
    similar_dataframe = new_dataframe_to_filled.loc[similar_list,].copy()   # Obtaining the dataframe with similar list and user_id
    for movie in similar_dataframe:
        if similar_dataframe.at[user_id,movie] == None:   # determining none values and replacing with mean of sim movies
            similar_dataframe.at[user_id,movie] = np.mean(similar_dataframe[movie])
    rated_movies = list(similar_dataframe.loc[user_id].dropna().index)   # obtaining the movie id which contains ratings
    unwatched = set(rated_movies) -  set(watched_list)    # using sets to obtain unwatched list
    final_suggestion = {}
    for movie in unwatched:                   # creating dict by iterating unwatched
        final_suggestion[movie] = similar_dataframe.at[user_id,movie]   #appending to dict
    sorted_final_suggestion = dict(sorted(final_suggestion.items(), key=lambda x:x[1],reverse=True))  # sorting the final suggestion of movies
    sorted_final_suggestion_list = []
    for i in sorted_final_suggestion:
        sorted_final_suggestion_list.append(i)    # list containing the movies in descending order of ratings
    return(sorted_final_suggestion_list[0:n])     # Returning only first n suggestions

In [91]:
suggested_movies(1,similar_list)

["Adam's Rib (1949)",
 'Education, An (2009)',
 'Casablanca (1942)',
 'Vertigo (1958)',
 'Ghost (1990)',
 "She's Out of My League (2010)",
 'Wonder Woman (2009)',
 'Metropolis (2001)',
 'Very Potter Sequel, A (2010)',
 'A Detective Story (2003)',
 'Dredd (2012)',
 'Solaris (2002)',
 'Brigadoon (1954)',
 'Perks of Being a Wallflower, The (2012)',
 'Waking Life (2001)',
 'Faster (2010)',
 'Persuasion (1995)',
 'Rango (2011)',
 'Cop Out (2010)',
 'Notebook, The (2004)']

## Recommended Movies

In [92]:
user_id = 10
nearest = 20
n = 30

In [93]:
# Using functions to obtain the suggested movies
sim_list = similar_users(user_id,nearest)
suggested_movies(user_id,sim_list,n)

['Sleeper (1973)',
 'Pinocchio (1940)',
 'Face/Off (1997)',
 'L.A. Confidential (1997)',
 'Ghost (1990)',
 "She's Out of My League (2010)",
 'Full Metal Jacket (1987)',
 'Dogma (1999)',
 'Dreamers, The (2003)',
 'Blues Brothers, The (1980)',
 'Dredd (2012)',
 'Solaris (2002)',
 'Perks of Being a Wallflower, The (2012)',
 'Back to the Future (1985)',
 'Willy Wonka & the Chocolate Factory (1971)',
 'Persuasion (1995)',
 'Rango (2011)',
 'Secret of NIMH, The (1982)',
 'Congo (1995)',
 'Guys and Dolls (1955)',
 'Jiro Dreams of Sushi (2011)',
 'Aliens (1986)',
 'Monty Python and the Holy Grail (1975)',
 'Citizen Kane (1941)',
 'Dr. No (1962)',
 'Run Lola Run (Lola rennt) (1998)',
 'Idiocracy (2006)',
 'Oliver! (1968)',
 'Paperman (2012)',
 'I Origins (2014)']