In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import hamming 

In [41]:
u_data = pd.read_csv('u.data', sep='\t', header=None, 
                     names=['user id','movie id', 'rating','timestamp'], 
                     encoding='iso-8859-1')

item_columns = ['movie id','movie title','release date','video release date','IMDb URL', 'unknown', 'Action', 
                'Adventure', 'Animation',"Children's",'Comedy', 'Crime', 'Documentary','Drama' ,'Fantasy',
                'Film-Noir' , 'Horror' ,'Musical' , 'Mystery' , 'Romance' ,'Sci-Fi', 'Thriller' ,'War' , 'Western']

u_item = pd.read_csv('u.item', sep='|', header=None, 
                     names=item_columns, 
                     encoding='iso-8859-1')

In [42]:
df = pd.merge(left=u_data, right=u_item, how='left', on=['movie id'])

In [43]:
df = df.sort_values('user id').reset_index().drop(columns=['index'])

In [44]:
df.to_csv('cleaned_movielens.csv')

In [45]:
df.columns

Index(['user id', 'movie id', 'rating', 'timestamp', 'movie title',
       'release date', 'video release date', 'IMDb URL', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [46]:
df = df[['user id', 'movie id', 'rating','movie title']]

In [47]:
df.head(5)

Unnamed: 0,user id,movie id,rating,movie title
0,1,55,5,"Professional, The (1994)"
1,1,203,4,Unforgiven (1992)
2,1,183,5,Alien (1979)
3,1,150,5,Swingers (1996)
4,1,68,4,"Crow, The (1994)"


In [48]:
def Favorite_Movie(userId,N):
    userRatings = df[df["user id"] == userId]
    SortedRating = pd.DataFrame.sort_values(userRatings,['rating'],ascending=[0])[:N]
    return SortedRating

In [49]:
Favorite_Movie(10, 5)

Unnamed: 0,user id,movie id,rating,movie title
1282,10,133,5,Gone with the Wind (1939)
1414,10,602,5,"American in Paris, An (1951)"
1350,10,603,5,Rear Window (1954)
1406,10,50,5,Star Wars (1977)
1409,10,611,5,Laura (1944)


In [50]:
user_per_movie = df['movie id'].value_counts()
movie_per_user = df['user id'].value_counts()

In [51]:
df = df[df["movie id"].isin(user_per_movie[user_per_movie > 100].index)] # movies on the okay-to-recommend list 

In [52]:
df = df[df['user id'].isin(movie_per_user[movie_per_user > 50].index)] # real users

In [53]:
User_Movie_Rating_Matrix = pd.pivot_table(df, values='rating', index=['user id'], columns=['movie id'])

In [54]:
User_Movie_Rating_Matrix

movie id,1,2,4,7,8,9,11,12,13,14,...,815,845,866,879,895,926,928,1016,1028,1047
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,3.0,4.0,1.0,5.0,2.0,5.0,5.0,5.0,...,,,,,,,,,,
2,4.0,,,,,,,,4.0,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
6,4.0,,,2.0,4.0,4.0,,4.0,2.0,5.0,...,,,,,,,,,,
7,,,5.0,5.0,5.0,5.0,3.0,5.0,,,...,,,,,,,,,,
8,,,,3.0,,,3.0,,,,...,,,,,,,,,,
10,4.0,,4.0,4.0,,4.0,4.0,5.0,3.0,,...,,,,,,,,,,
11,,,,,4.0,5.0,2.0,2.0,,,...,,,,,,,,,,
12,,,5.0,,,,,,,,...,,,,,,,,,,


In [55]:
def distance(user_1, user_2):
    try:
        user1_Ratings = User_Movie_Rating_Matrix.transpose()[user_1]
        user2_Ratings = User_Movie_Rating_Matrix.transpose()[user_2]
        distance = hamming(user1_Ratings,user2_Ratings)
    except: 
        distance = np.NaN
    return distance

In [56]:
distance(70, 23)

0.9221556886227545

In [57]:
user = 70
allUsers = pd.DataFrame(User_Movie_Rating_Matrix.index)
allUsers = allUsers[allUsers['user id'] != user]

In [58]:
allUsers["distance"] = allUsers["user id"].apply(lambda x: distance(user,x))

In [59]:
allUsers

Unnamed: 0,user id,distance
0,1,0.943114
1,2,0.988024
2,3,0.997006
3,5,0.940120
4,6,0.946108
5,7,0.934132
6,8,0.973054
7,10,0.961078
8,11,0.949102
9,12,0.976048


In [60]:
def K_Nearest_Neighbors(user_id, K=10):
    All_Users = pd.DataFrame(User_Movie_Rating_Matrix.index)
    All_Users = All_Users[All_Users['user id'] != user_id]
    All_Users['distance'] = All_Users['user id'].apply(lambda x: distance(user,x))
    KnearestUsers = All_Users.sort_values(["distance"],ascending=True)['user id'][:K]
    return KnearestUsers

In [61]:
KnearestUsers = K_Nearest_Neighbors(70)

In [62]:
NNRatings = User_Movie_Rating_Matrix[User_Movie_Rating_Matrix.index.isin(KnearestUsers)]
NNRatings

movie id,1,2,4,7,8,9,11,12,13,14,...,815,845,866,879,895,926,928,1016,1028,1047
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
92,4.0,3.0,4.0,4.0,5.0,4.0,4.0,5.0,4.0,,...,,3.0,,,,3.0,3.0,2.0,2.0,1.0
94,4.0,,4.0,4.0,5.0,5.0,5.0,4.0,,,...,,,,,,,3.0,,2.0,
222,4.0,3.0,3.0,5.0,1.0,5.0,5.0,5.0,,,...,2.0,3.0,,,4.0,,,3.0,,
276,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,,4.0,...,,4.0,,3.0,,,3.0,3.0,3.0,3.0
308,4.0,,5.0,4.0,5.0,4.0,5.0,5.0,,,...,,,,,,,4.0,,2.0,3.0
334,,,3.0,5.0,4.0,4.0,4.0,5.0,3.0,3.0,...,3.0,2.0,3.0,3.0,,,,3.0,,
378,4.0,2.0,3.0,4.0,4.0,5.0,3.0,5.0,3.0,5.0,...,,3.0,2.0,,,1.0,2.0,,2.0,2.0
429,3.0,3.0,4.0,2.0,3.0,,4.0,5.0,,,...,,4.0,,,,,2.0,4.0,3.0,
435,5.0,4.0,4.0,4.0,3.0,4.0,5.0,5.0,,,...,,3.0,,,3.0,3.0,3.0,4.0,2.0,3.0
916,4.0,3.0,4.0,4.0,,5.0,4.0,4.0,,5.0,...,,,3.0,,,,,,,


In [63]:
df = df.set_index('movie id')

In [65]:
def MovieMeta(movieid):
    title = df.at[movieid,'movie title']
    return title

In [66]:
def topN(user,N=3):
    KnearestUsers = K_Nearest_Neighbors(user)
    NNRatings = User_Movie_Rating_Matrix[User_Movie_Rating_Matrix.index.isin(KnearestUsers)]
    avgRating = NNRatings.apply(np.nanmean).dropna()
    MoviesAlreadyWatched = User_Movie_Rating_Matrix.transpose()[user].dropna().index
    avgRating = avgRating[~avgRating.index.isin(MoviesAlreadyWatched)]
    topNMovies = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topNMovies).apply(MovieMeta)

In [68]:
topN(10)

0    [Schindler's List (1993), Schindler's List (19...
1    [Wrong Trousers, The (1993), Wrong Trousers, T...
2    [Edge, The (1997), Edge, The (1997), Edge, The...
Name: movie id, dtype: object