In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from typing import Union
from math import sqrt
from IPython.display import display

In [None]:
# getting data
!wget -O moviedataset.zip https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip
print('unziping ...')
!unzip -o -j moviedataset.zip 

In [2]:
movies_df = pd.read_csv('movies.csv',header=0, names=['movie_id','title','genres'])
ratings_df = pd.read_csv('ratings.csv', header=0, names=['user_id', 'movie_id', 'rating', 'timestamp'])

In [3]:
display(movies_df)
display(ratings_df)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
34203,151697,Grand Slam (1967),Thriller
34204,151701,Bloodmoney (2010),(no genres listed)
34205,151703,The Butterfly Circus (2009),Drama
34206,151709,Zero (2015),Drama|Sci-Fi


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496
...,...,...,...,...
22884372,247753,49530,5.0,1430437962
22884373,247753,69481,3.0,1430437984
22884374,247753,74458,4.0,1430437968
22884375,247753,76093,5.0,1430437811


In [4]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

#Dropping the genres column
movies_df = movies_df.drop('genres', 1)

movies_df.head()

Unnamed: 0,movie_id,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [5]:
ratings_df = ratings_df.drop('timestamp', 1)
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [6]:
userInput= [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ]
inputmovies = pd.DataFrame(userInput)
inputmovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [7]:
inputmovies = pd.merge(movies_df[movies_df['title'].isin(inputmovies['title'])], inputmovies)
inputmovies.drop('year',1, inplace=True)
inputmovies

Unnamed: 0,movie_id,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [8]:
inputmovies

Unnamed: 0,movie_id,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [9]:
user_not_involved = ratings_df[ratings_df['movie_id'].isin(inputmovies['movie_id'])]
user_not_involved

Unnamed: 0,user_id,movie_id,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0
...,...,...,...
22883679,247738,296,4.0
22884132,247751,1,4.0
22884142,247751,296,4.0
22884164,247751,1274,5.0


In [10]:
user_subset_group = user_not_involved.groupby(['user_id'])

In [11]:
user_subset_group.get_group(1130)

Unnamed: 0,user_id,movie_id,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [12]:
# Sorting it so users with movie most in common with the input will have priority
user_subset_group = sorted(user_subset_group, key=lambda x: len(x[1]), reverse=True)

In [13]:
# limit enposed for going thorough every single user
user_subset_group = user_subset_group[0:100]

In [14]:
user_subset_group[0][1]

Unnamed: 0,user_id,movie_id,rating
7507,75,1,5.0
7508,75,2,3.5
7540,75,296,5.0
7633,75,1274,4.5
7673,75,1968,5.0


In [15]:
#  pearson_df is for scipy results
# pearsonDF is for courses result

pearsonCorrelationDict = {} # for course
pearson_df = pd.DataFrame({}, columns=['user_id', 'similarity_index']).astype({'user_id':int, 'similarity_index':float}) # for scipy implementation

for name, group in user_subset_group:
    tempdf = inputmovies[inputmovies['movie_id'].isin(group['movie_id'])]
    tempRatingList = tempdf['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    nRatings = len(group)
    
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
        
    #pearson correlation with scipy 
    p_value = stats.pearsonr(tempdf['rating'].values.tolist(), group['rating'])[0]
    pearson_df = pearson_df.append({'user_id':name,'similarity_index':p_value}, ignore_index=True)
    pearson_df['user_id'] = pearson_df['user_id'].apply(lambda x: int(x))
    
# convert dictionary to dataframe for course implementation
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarity_index']
pearsonDF['user_id'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))

In [16]:
display(pearsonDF) 
print("--"*10)
display(pearson_df) # scipy 

Unnamed: 0,similarity_index,user_id
0,0.827278,75
1,0.586009,106
2,0.832050,686
3,0.576557,815
4,0.943456,1040
...,...,...
95,0.537086,17854
96,0.877058,17897
97,0.271385,17944
98,0.298381,18301


--------------------


Unnamed: 0,user_id,similarity_index
0,75,0.827278
1,106,0.586009
2,686,0.832050
3,815,0.576557
4,1040,0.943456
...,...,...
95,17854,0.537086
96,17897,0.877058
97,17944,0.271385
98,18301,0.298381


In [17]:

topUsers=pearsonDF.sort_values(by='similarity_index', ascending=False)[0:50]
topUsersRating=topUsers.merge(ratings_df, left_on='user_id', right_on='user_id', how='inner')

topUsersRating['weightedRating'] = topUsersRating['similarity_index']*topUsersRating['rating']
tempTopUsersRating = topUsersRating.groupby('movie_id').sum()[['similarity_index','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']

recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movie_id'] = tempTopUsersRating.index

print("before using sort values: ")
display(recommendation_df)

print('--'*10, 'After using sort_values', sep='\n')
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
display(recommendation_df)

print('--'*10, 'Final correct result from course', sep='\n')
movies_df.loc[movies_df['movie_id'].isin(recommendation_df.head(10)['movie_id'].tolist())]

before using sort values: 


Unnamed: 0_level_0,weighted average recommendation score,movie_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.000000,4
5,2.316058,5
...,...,...
148630,3.000000,148630
148652,2.000000,148652
149354,3.000000,149354
150776,3.000000,150776


--------------------
After using sort_values


Unnamed: 0_level_0,weighted average recommendation score,movie_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5073,5.0,5073
3329,5.0,3329
2284,5.0,2284
26801,5.0,26801
6776,5.0,6776
...,...,...
27777,0.5,27777
5462,0.5,5462
5917,0.5,5917
36531,0.5,36531


--------------------
Final correct result from course


Unnamed: 0,movie_id,title,year
2200,2284,Bandit Queen,1994
3243,3329,"Year My Voice Broke, The",1987
3669,3759,Fun and Fancy Free,1947
3679,3769,Thunderbolt and Lightfoot,1974
3685,3775,Make Mine Music,1946
4978,5073,"Son's Room, The (Stanza del figlio, La)",2001
6563,6672,War Photographer,2001
6667,6776,Lagaan: Once Upon a Time in India,2001
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
18106,90531,Shame,2011


In [18]:
# using on scipy ones

# Top 50 users that are most similar to input
top_users = pearson_df.sort_values(by='similarity_index', ascending=False)[0:50]
top_users_rating =top_users.merge(ratings_df, left_on='user_id', right_on='user_id',how='inner')
top_users_rating['weighted_rating'] = top_users_rating['similarity_index'] * top_users_rating['rating']
top_users_rating = top_users_rating.groupby('movie_id').sum()[['similarity_index','weighted_rating']]
top_users_rating.columns = ['sum_similarityIndex', 'sum_weightedRating']

recommendation_df_sci = pd.DataFrame()
recommendation_df_sci['weighted average recommendation score'] = top_users_rating['sum_weightedRating'] / top_users_rating['sum_similarityIndex']
recommendation_df_sci['movie_id'] = top_users_rating.index
print("before using sort values: ")
display(recommendation_df_sci)
print('--'*10, 'After using sort_values', sep='\n')
recommendation_df_sci = recommendation_df_sci.sort_values(by='weighted average recommendation score', ascending=False)
display(recommendation_df_sci)
print('--'*10, 'Different result than course, using scipy: ', sep='\n')
movies_df.loc[movies_df['movie_id'].isin(recommendation_df_sci.head(10)['movie_id'])]

before using sort values: 


Unnamed: 0_level_0,weighted average recommendation score,movie_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.000000,4
5,2.316058,5
...,...,...
148630,3.000000,148630
148652,2.000000,148652
149354,3.000000,149354
150776,3.000000,150776


--------------------
After using sort_values


Unnamed: 0_level_0,weighted average recommendation score,movie_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26801,5.0,26801
6918,5.0,6918
91199,5.0,91199
1902,5.0,1902
6660,5.0,6660
...,...,...
72043,0.5,72043
133810,0.5,133810
57532,0.5,57532
104667,0.5,104667


--------------------
Different result than course, using scipy: 


Unnamed: 0,movie_id,title,year
119,121,"Boys of St. Vincent, The",1992
1819,1902,Dream for an Insomniac,1996
3686,3776,Melody Time,1948
3759,3851,I'm the One That I Want,2000
6551,6660,"Red Shoes, The",1948
6559,6668,"Road Home, The (Wo de fu qin mu qin)",1999
6808,6918,"Unvanquished, The (Aparajito)",1957
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
18106,90531,Shame,2011
18272,91199,Weekend,2011


In [19]:
# checking difference between them 
display(recommendation_df)
display(recommendation_df_sci)  # scipy

Unnamed: 0_level_0,weighted average recommendation score,movie_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5073,5.0,5073
3329,5.0,3329
2284,5.0,2284
26801,5.0,26801
6776,5.0,6776
...,...,...
27777,0.5,27777
5462,0.5,5462
5917,0.5,5917
36531,0.5,36531


Unnamed: 0_level_0,weighted average recommendation score,movie_id
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26801,5.0,26801
6918,5.0,6918
91199,5.0,91199
1902,5.0,1902
6660,5.0,6660
...,...,...
72043,0.5,72043
133810,0.5,133810
57532,0.5,57532
104667,0.5,104667
