In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score


In [2]:
# Base folder of the MovieLens "ml-latest-small" CSV files. It is a raw string so the
# Windows backslashes stay literal (non-raw "\W", "\p", "\l", ... are invalid escapes).
DATA_DIR = r"Z:\WBS_DS_Bootcamp\project_8\Data\ml-latest-small"

ratings = pd.read_csv(rf"{DATA_DIR}\ratings.csv")
movies = pd.read_csv(rf"{DATA_DIR}\movies.csv")
tags = pd.read_csv(rf"{DATA_DIR}\tags.csv")
links = pd.read_csv(rf"{DATA_DIR}\links.csv")

In [3]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## 1. Popularity-based recommender system

In [5]:
def popular_movies(ratings, movies, top_n=5):
    """Return the most frequently rated movies together with their mean rating.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must provide the columns ``movieId``, ``userId`` and ``rating``.
    movies : pandas.DataFrame
        Must provide a ``movieId`` column; all of its columns are kept.
    top_n : int, default 5
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The columns of ``movies`` followed by ``rating`` (mean rating per
        movie, the explicit signal) and ``rating_counts`` (number of ratings
        per movie, the implicit signal), sorted by ``rating_counts`` in
        descending order and limited to ``top_n`` rows. Movies without any
        rating are dropped by the inner merge.
    """
    # One pass over the ratings yields both statistics per movie.
    rating_stats = (
        ratings.groupby('movieId')
        .agg(rating=('rating', 'mean'), rating_counts=('userId', 'count'))
        .reset_index()
    )

    # Attach the movie metadata and rank by popularity (number of ratings).
    ranked = movies.merge(rating_stats, on='movieId').sort_values('rating_counts', ascending=False)
    return ranked.head(top_n)

In [6]:
# Show the most-rated movies of the loaded data set.
popular_movies(ratings,movies)

Unnamed: 0,movieId,title,genres,rating,rating_counts
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.164134,329
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279
1938,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.192446,278


## 2. Similarity-based recommender system

In [7]:
def similar_movies(movie_name, number_of_movies, min_ratings=100, movies_df=None, ratings_df=None):
    """Recommend movies whose ratings correlate with those of a given movie.

    Parameters
    ----------
    movie_name : str
        Text searched (literally, case-sensitively) inside the movie titles;
        the first matching movie is used as the reference movie.
    number_of_movies : int
        Maximum number of recommendations to return.
    min_ratings : int, default 100
        Only movies with at least this many ratings are recommended.
    movies_df, ratings_df : pandas.DataFrame, optional
        Movie and rating tables. When omitted, the notebook-level ``movies``
        and ``ratings`` frames are used.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``title``, ``genres``, ``PearsonR`` and
        ``rating_count``, sorted by ``PearsonR`` in descending order.

    Raises
    ------
    KeyError
        If no title contains ``movie_name`` or the matched movie has no ratings.
    """
    # Fall back to the notebook-level frames so existing two-argument calls keep working.
    movies_df = movies if movies_df is None else movies_df
    ratings_df = ratings if ratings_df is None else ratings_df
    result_columns = ['movieId', 'title', 'genres', 'PearsonR', 'rating_count']

    # Literal substring match: titles contain regex metacharacters such as "(1995)".
    title_hits = movies_df.loc[
        movies_df['title'].str.contains(movie_name, regex=False, na=False), 'movieId'
    ]
    if title_hits.empty:
        raise KeyError(f"No movie title contains {movie_name!r}")
    movie_id = title_hits.iloc[0]

    # user x movie rating matrix, restricted to movies present in the movie table
    known_ratings = ratings_df[ratings_df['movieId'].isin(movies_df['movieId'])]
    user_movie = known_ratings.pivot_table(values='rating', index='userId', columns='movieId')
    if movie_id not in user_movie.columns:
        raise KeyError(f"Movie {movie_id} ({movie_name!r}) has no ratings")

    # Number of ratings per movie, indexed by movieId. It is kept local so the
    # caller's ratings frame is never modified.
    rating_count = known_ratings.groupby('movieId')['userId'].count().rename('rating_count')

    # Correlate only the movies that can be recommended: popular enough and not
    # the reference movie itself.
    eligible_ids = rating_count[rating_count >= min_ratings].index
    keep = user_movie.columns.isin(eligible_ids) & (user_movie.columns != movie_id)
    candidates = user_movie.loc[:, keep]
    if candidates.shape[1] == 0:
        return pd.DataFrame(columns=result_columns)

    pearson = (
        candidates.corrwith(user_movie[movie_id])
        .dropna()
        .rename_axis('movieId')
        .to_frame('PearsonR')
    )
    summary = pearson.join(rating_count).sort_values('PearsonR', ascending=False)

    top_movies = summary.reset_index().merge(movies_df, on='movieId').head(number_of_movies)
    return top_movies.reindex(columns=result_columns)

In [8]:
# Five movies whose ratings correlate best with the first title containing "Pocahontas".
similar_movies('Pocahontas',5)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,movieId,title,genres,PearsonR,rating_count
0,587,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller,0.599045,115.0
1,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy,0.582899,102.0
2,329,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi,0.580232,108.0
3,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,0.558097,141.0
4,500,Mrs. Doubtfire (1993),Comedy|Drama,0.556479,144.0


## 3. Fully Personalised recommender system

In [9]:
# Attach the movie metadata to every rating (left join keeps all ratings).
data = ratings.merge(movies, how="left", on="movieId")

# user x title rating matrix; cells without a rating are filled with 0
movie_user = data.pivot_table(values='rating', index='userId', columns='title').fillna(0)

In [10]:
# (row position, column position) of every non-zero cell of movie_user,
# one pair per row: column 0 holds the row position, column 1 the column position
ratings_pos = pd.DataFrame(np.argwhere(movie_user.to_numpy() != 0))
ratings_pos.head()

Unnamed: 0,0,1
0,0,48
1,0,66
2,0,202
3,0,245
4,0,325


### Create the train and test datasets

In [11]:
# Split the rated positions into a train part and a test part (20% test).
# random_state is fixed so the split is the same on every run.
train_pos, test_pos = train_test_split(ratings_pos, random_state=42, test_size=0.2)

In [12]:
# Start from an all-zero array with the same shape as movie_user.
train = np.zeros(movie_user.shape)

# Copy only the ratings located at the train positions, in one vectorized
# assignment (column 0 = row position, column 1 = column position).
train_rows = train_pos[0].to_numpy(dtype=int)
train_cols = train_pos[1].to_numpy(dtype=int)
train[train_rows, train_cols] = movie_user.to_numpy()[train_rows, train_cols]

# Restore the user / title labels of movie_user.
train = pd.DataFrame(train, columns=movie_user.columns, index=movie_user.index)

In [13]:
# Same construction for the test set: all zeros except the ratings at the test positions.
test = np.zeros(movie_user.shape)

# Columns 0 and 1 are read explicitly, so this cell still works when it is
# re-run after further columns have been added to test_pos.
test_rows = test_pos[0].to_numpy(dtype=int)
test_cols = test_pos[1].to_numpy(dtype=int)
test[test_rows, test_cols] = movie_user.to_numpy()[test_rows, test_cols]

# Restore the user / title labels of movie_user.
test = pd.DataFrame(test, columns=movie_user.columns, index=movie_user.index)

In [14]:
# Look up the actual rating stored in movie_user for every test position
# (column 0 = row position, column 1 = column position) in one vectorized step.
true_test_ratings = movie_user.to_numpy()[
    test_pos[0].to_numpy(dtype=int), test_pos[1].to_numpy(dtype=int)
].tolist()

# Keep the ground truth next to the positions for the later evaluation.
test_pos = test_pos.assign(true_rating = true_test_ratings)

### Build the similarity matrices for the train and test sets

In [15]:
# Pairwise cosine similarity between the rows of train (one row per userId),
# labelled with the userIds on both axes.
train_similarity = pd.DataFrame(cosine_similarity(train), columns=train.index, index=train.index)


In [16]:
# Pairwise cosine similarity between the rows of test (one row per userId).
# NOTE(review): these similarities are derived from the held-out ratings themselves;
# using them to predict those same ratings would leak the answers -- confirm whether
# this matrix is needed at all.
test_similarity = pd.DataFrame(cosine_similarity(test), columns=test.index, index=test.index)

In [17]:
# Label-based lookup: the rating stored for userId 1 and the title "Abyss, The (1989)".
true_rating = movie_user.loc[1, "Abyss, The (1989)"]
true_rating

4.0

## Computing all recommendations for the test set

In [18]:
def recommender(index_pos, column_pos, sim_df, data):
    """Predict one rating as a similarity-weighted sum of all users' ratings.

    Parameters
    ----------
    index_pos : int
        Row position of the target user in ``sim_df``.
    column_pos : int
        Column position of the target movie in ``data``.
    sim_df : pandas.DataFrame
        User-to-user similarities; row ``index_pos`` must have one entry per
        row of ``data``.
    data : pandas.DataFrame
        Rating matrix with one row per user and one column per movie.

    Returns
    -------
    float
        ``sum(rating * similarity) / (sum(similarity) - 1)``. The ``- 1``
        removes the target user's own similarity from the normalisation,
        assuming a self-similarity of 1. Returns ``0.0`` when that denominator
        is zero, i.e. when no other user is similar to the target user, so the
        result is never ``inf`` or ``NaN`` from a division by zero.

    Notes
    -----
    Every row of ``data`` enters the weighted sum, including the target user's
    own row and users whose entry for the movie is 0.
    """
    movie_ratings = data.iloc[:, column_pos].to_numpy(dtype=float)
    user_similarities = sim_df.iloc[index_pos, :].to_numpy(dtype=float)

    # normalisation term: total similarity to all users except the user itself
    denominator = user_similarities.sum() - 1
    if np.isclose(denominator, 0.0):
        # nobody to borrow ratings from; avoid dividing by zero
        return 0.0

    weights = user_similarities / denominator

    # weighted ratings, summed while skipping missing values
    return float(np.nansum(movie_ratings * weights))

In [19]:
# Example: prediction for row position 0 and column position 202, based on the train data.
recommender(0, 202, train_similarity, train)

0.608688505307779

In [20]:
# Test positions (columns 0 and 1) together with their true_rating.
test_pos

Unnamed: 0,0,1,true_rating
40035,273,5803,2.5
64954,413,9582,3.0
74022,473,4613,2.0
6187,41,6950,4.0
34900,232,9120,3.0
...,...,...,...
64628,413,8253,4.0
87784,565,9119,5.0
87799,566,397,2.0
29457,201,2139,4.0


In [None]:
# Predict every held-out rating from the TRAIN data only. The test matrix
# contains the very ratings being predicted, so using it (or similarities
# derived from it) as input would leak the answers into the predictions.
recs_test = [
    recommender(
        index_pos = int(user_pos),
        column_pos = int(movie_pos),
        sim_df = train_similarity,
        data = train
    )
    for user_pos, movie_pos in zip(test_pos[0], test_pos[1])
]

In [None]:
# Store the predictions next to the true ratings, then display the table.
test_pos = test_pos.assign(pred_rating = recs_test)
test_pos

### Performance metrics

In [None]:
# Mean absolute error between the true and the predicted test ratings.
mean_absolute_error(test_pos.true_rating, test_pos.pred_rating)

In [None]:
# R^2 score (coefficient of determination) of the predictions against the true test ratings.
r2_score(test_pos.true_rating, test_pos.pred_rating)

## Final step: DEAL WITH THE ZEROS!

And try to improve the performance.