First you need to check wheather your environment has scipy installed or not.
If not run "pip install scipy" in your terminal to install it and then run the code.

In [55]:
#A
#Import the datasets and print some of the values to check the importing is ok
import pandas as pd
from scipy.stats import pearsonr

#Load the 100k movie datasets

links = pd.read_csv('links.csv', sep = ',')
movies = pd.read_csv('movies.csv', sep = ',')
ratings = pd.read_csv('ratings.csv', sep = ',')
tags = pd.read_csv('tags.csv', sep = ',')

#Display first few rows of each datasets to check the importing
print("\nLinks dataset:")
print(links.head())

print("\nmovies dataset:")
print(movies.head())

print("\nRatings dataset:")
print(ratings.head())

print("\nTags dataset:")
print(tags.head())

num_of_ratings = len(ratings)
print(f"\n\nTotal number of ratings in the dataset is:{num_of_ratings}")


Links dataset:
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0

movies dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983

In [9]:
#B
#user based collaborative filtering approach using pearson correlation function to calculate similarities between users
#User item matrix
user_item_matrix = ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating').fillna(0)

In [11]:
#Pearson Correlation function between two users
def pearson_correlation(user_1, user_2):
    common_movies = user_item_matrix.loc[user_1].index.intersection(user_item_matrix.loc[user_2].index)
    #Atleast 2 movies to compare
    if len(common_movies) <2:
        return 0
    user_1_ratings = user_item_matrix.loc[user_1][common_movies]
    user_2_ratings = user_item_matrix.loc[user_2][common_movies]
    correlation, _ = pearsonr(user_1_ratings, user_2_ratings)
    return correlation

#Test
user_1 = 44
user_2 = 188
print(f"Check the Pearson Correlation between {user_1} and {user_2} = {pearson_correlation(user_1,user_2)}")

Check the Pearson Correlation between 44 and 188 = -0.0046249672160630605


In [33]:
#Getting similar user
def get_similar_user(target_user):
    similarities = {}
    for user in user_item_matrix.index:
        if user != target_user:
            similarity = pearson_correlation(target_user, user)
            similarities[user] = similarity
    return similarities

In [34]:
#test the similarity function
target_user = 600
print(f"The similarities between {target_user} and all the other users are: {get_similar_user(target_user)}")

The similarities between 600 and all the other users are: {1: 0.26348073607017564, 2: 0.009476215521161388, 3: 0.00041563726687843597, 4: 0.1832692739564793, 5: 0.12458751478794813, 6: 0.1527884875902022, 7: 0.19819583083540315, 8: 0.15933034460084572, 9: 0.10773253816851812, 10: 0.0714213047724192, 11: 0.09784005317256406, 12: 0.06641221506383524, 13: 0.07292943502045261, 14: 0.1211159683701368, 15: 0.17641752288438095, 16: 0.19039747701381493, 17: 0.2314311779653644, 18: 0.20256011952933356, 19: 0.2875140111832335, 20: 0.22716875089206268, 21: 0.1129069152727071, 22: 0.1348454784901667, 23: 0.13296260121259268, 24: 0.12822719738026195, 25: 0.07078689467619342, 26: 0.11082731318544836, 27: 0.14230858730382406, 28: 0.1542286257549206, 29: 0.06874577341962924, 30: 0.10693563498528305, 31: 0.179863906325885, 32: 0.1451802659009671, 33: 0.14511570876924362, 34: 0.12219368085062068, 35: 0.0583467950102586, 36: 0.04551027045905543, 37: 0.11665720385608519, 38: 0.14422667877066964, 39: 0.229

In [35]:
#C
#Predicting movie score of a user using the prediction function taught in the class
def predict_movie_rating(target_user, movie):
    target_user_movies = user_item_matrix.loc[target_user].index
    common_users = user_item_matrix.loc[:, movie].dropna().index
    movie_similarities = {user: pearson_correlation(target_user, user) for user in common_users}
    weighted_sum = 0
    sum_of_similarities = 0
    
    for user, similarity in movie_similarities.items():
        rb = user_item_matrix.loc[user, movie]
        rb_bar = user_item_matrix.loc[user].mean()
        weighted_sum += similarity * (rb-rb_bar)
        sum_of_similarities += abs(similarity)
    
    ra_bar = user_item_matrix.loc[target_user].mean()
    if sum_of_similarities != 0:
        predicted_movie_rating = ra_bar + (weighted_sum / sum_of_similarities)
    else:
        predicted_movie_rating = ra_bar
        
    return predicted_movie_rating
    

In [36]:
#Test predicted movie ratings
target_user = 4
movie = 88
print(f"Predicted rating for user {target_user} on movie {movie}: {predict_movie_rating(target_user, movie)}")

Predicted rating for user 4 on movie 88: 0.09655699132940904


In [41]:
#D
#Top 10 similar user of a particular user
def get_top_similar_users(target_users, top_users=10):
    similarities = get_similar_user(target_user)
    top_users = sorted(similarities.items(), key=lambda x: x[1], reverse = True)[:top_users]
    return top_users

In [43]:
#Test similar users function
target_user = 88

print(f"Top 10 users for the user {target_user} are :\n{get_top_similar_users(target_user, 10)}")

Top 10 users for the user 88 are :
[(247, 0.32680750780806667), (378, 0.31366013952545246), (189, 0.30893643035962837), (233, 0.2987884389402532), (434, 0.2983242583031804), (581, 0.2888730811017719), (460, 0.28565134916000706), (417, 0.2809637189442898), (254, 0.2740586943913226), (61, 0.2736996884275945)]


In [48]:
#Top 10 recommended movies for the user
def user_based_movie_recommendations(target_user, top_movies = 10):
    top_users = get_top_similar_users(target_user)
    target_user_movies = user_item_matrix.loc[target_user].index
    recommended_movies = []
    for user, similarity in top_users:
        similar_user_movies = user_item_matrix,loc[user].index
        new_movies = [similar_user_movies] - [target_user_movies]
        recommended_movies.update(new_movies)
        if len(recommended_movies) >= top_movies:
            break
    
    return recommended_movies

In [53]:
#E
#Implemanting new recommender system (for thus case it is a predifned cosine similarity function)
import numpy as np
def cosine_similarity(user_1, user_2):
    common_movies = user_item_matrix.loc[user_1].index.intersection(user_item_matrix.loc[user_2].index)
    if len(common_movies) == 0:
        return 0
    user_1_ratings = user_item_matrix.loc[user_1][common_movies]
    user_2_ratings = user_item_matrix.loc[user_2][common_movies]
    similarity = np.dot(user_1_ratings, user_2_ratings) / (np.linalg.norm(user_1_ratings) * np.linalg.norm(user_2_ratings))
    return similarity

In [54]:
#Test the cosine similarity
user_1 = 55
user_2 = 59
print(f"Cosine similarity between user {user_1} and user {user_2}: {cosine_similarity(user_1, user_2)}")

Cosine similarity between user 55 and user 59: 0.01946719349585089


Cosine similarity is better because it uses preference in multi-dimentional space of the users to measure the similarity between them. Some of the others reasons are as follows: Cosine similarity tends to be affected lessly by extreme ratings, so quite efficient with a larger dataset. It also can be used where users actually rated only a small amount of movies. Besides this, using different kinds of similarity metrics have different impact on the quality of the recommendations of the movies.