# Movie Recommendation System

Predict movie ratings for the MovieLens dataset

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics.pairwise import cosine_similarity

# Suppress FutureWarning messages for the rest of the notebook.
# NOTE(review): this also hides deprecation notices from numpy/pandas that
# may signal upcoming breakage -- consider removing once the code is clean.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the three MovieLens tables. Column names are supplied explicitly
# through `names=` for every file.

# Users table ('|'-separated).
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols)

# Ratings table (tab-separated): one row per (user, movie) rating.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols)

# Movies table ('|'-separated, latin-1 encoded): metadata followed by one
# column per genre. The genre column is spelled 'horror' (it was previously
# misspelled, which made `items['horror']` fail).
i_cols = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'noir', 'horror', 'musical',
    'mystery', 'romance', 'scifi', 'thriller', 'war', 'western',
]

items = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')

## Collaborative Filtering

### Transform function

In [2]:
#Transform dataframe into matrix
#Input: ratings table and whether we want to apply user-based or item-based filtering

def transform(df=None, user_base=True):
    """Build the mean-centred rating matrix and its cosine-similarity matrix.

    The results are published through three module-level globals so that
    later cells can use them:

    * ``rating_matrix``     -- 2-D numpy array; each row has its own mean
      subtracted and missing ratings are filled with 0.
    * ``similarity_matrix`` -- square numpy array of pairwise cosine
      similarities between the rows of ``rating_matrix``; the diagonal is
      set to -99 so a row is never picked as its own neighbour.
    * ``mean``              -- pandas Series with the mean rating of each row.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Ratings table with ``user_id``, ``movie_id`` and ``rating`` columns.
        When omitted, the notebook-level ``ratings`` frame is used (looked up
        at call time, so a reloaded ``ratings`` is picked up).
    user_base : bool, default True
        Truthy -> rows are users and columns are movies (user-based
        filtering). Falsy -> rows are movies and columns are users
        (item-based filtering).

    Returns
    -------
    None
        Prints "Transformation complete" when done.
    """
    global rating_matrix
    global similarity_matrix
    global mean

    # Resolve the default lazily instead of freezing it at definition time.
    if df is None:
        df = ratings

    # Pick the pivot orientation; any falsy value selects item-based.
    if user_base:
        row_key, column_key = 'user_id', 'movie_id'
    else:
        row_key, column_key = 'movie_id', 'user_id'

    # Pivot the table that was passed in (not the global one).
    table = pd.pivot_table(df, values='rating', index=row_key, columns=column_key)

    # Normalize: subtract each row's mean, then treat missing ratings as 0
    # (i.e. "no deviation from the mean").
    mean = table.mean(axis=1)
    rating_matrix = table.subtract(mean, axis='index').fillna(0).values

    # One call computes every pairwise cosine similarity between rows.
    similarity_matrix = np.array(cosine_similarity(rating_matrix), dtype=float)

    # Exclude each row from its own neighbourhood. Only the diagonal is
    # masked: a value threshold would also discard *other* rows that happen
    # to have an identical rating pattern, and would miss all-zero rows
    # (whose self-similarity is 0).
    np.fill_diagonal(similarity_matrix, -99)

    print("Transformation complete")

### Recommendation function

In [3]:
#Function for movie recommendation
#Input values: rating matrix, similarity matrix, user id, number of most-similar neighbours (k)

def movies(rating, similarity, user_id=1, k=5, user_base=True):
    """Print the five highest-scoring movie titles for one user.

    For every row ``i`` of ``rating`` the ``k`` rows with the largest
    similarity to ``i`` are selected, their mean-centred ratings are averaged
    with the similarities as weights, and row ``i``'s mean (from the global
    ``mean``) is added back. All predictions are stored in the global
    ``score`` (a list with one 1-D array per row of ``rating``).

    Parameters
    ----------
    rating : array-like, 2-D
        Mean-centred rating matrix as produced by ``transform``.
    similarity : array-like, 2-D square
        Row-to-row similarity matrix as produced by ``transform``.
    user_id : int, default 1
        1-based user id; position ``user_id - 1`` is used to index the matrix.
        NOTE(review): this assumes user ids are contiguous and start at 1,
        and that movie positions line up with the rows of ``items`` --
        confirm against the data.
    k : int, default 5
        Number of nearest neighbours used for each prediction.
    user_base : bool, default True
        True when the rows of ``rating`` are users (user-based filtering).
        Pass False when the rows are movies (item-based filtering) so the
        user's scores are read from column ``user_id - 1`` instead of row
        ``user_id - 1``.

    Returns
    -------
    None
        The recommendation is printed.

    Raises
    ------
    IndexError
        If ``user_id`` is outside ``1..number of users``.
    """
    global score

    rating = np.asarray(rating)
    similarity = np.asarray(similarity)
    # `mean` is the per-row mean published by transform().
    row_means = np.asarray(mean)

    # Reject ids that would wrap around (0 or negative) or overflow.
    n_users = rating.shape[0] if user_base else rating.shape[1]
    if not 1 <= user_id <= n_users:
        raise IndexError(
            'user_id {} is outside the valid range 1..{}'.format(user_id, n_users))

    score = []
    for i in range(len(rating)):
        # Positions of the k most similar rows.
        top = np.argsort(-similarity[i])[:k]

        # Index with the integer array directly; wrapping it in a list
        # (`x[[top]]`) adds an extra dimension on NumPy >= 1.23.
        weights = similarity[i][top]

        # Normalise by the total absolute weight so that negative
        # similarities keep their sign in the weighted average.
        similarity_total = np.abs(weights).sum()

        if similarity_total > 0:
            prediction = np.dot(rating[top].T, weights) / similarity_total
        else:
            # No usable neighbours: fall back to the row mean alone.
            prediction = np.zeros(rating.shape[1])

        # Add the prediction back to the actual average.
        score.append(row_means[i] + prediction)

    # Pull out the requested user's predicted score for every movie.
    if user_base:
        user_scores = score[user_id - 1]
    else:
        user_scores = np.array([row[user_id - 1] for row in score])

    # Positions of the five movies with the highest predicted score.
    top_movies = np.argsort(-user_scores)[:5]

    print('Our top 5 picks for user {} are: {}'.format(
        user_id, items.movie_title.iloc[top_movies].values))


### Compare results between user-based and item-based neighborhood technique

In [4]:
# User-based run: users become the rows of the rating/similarity matrices.
transform(df=ratings, user_base=True)

Transformation complete


In [5]:
# Recommend for user 1 using the 10 most similar rows.
movies(rating=rating_matrix, similarity=similarity_matrix, user_id=1, k=10)

Our top 5 picks for user 1 are: ['Empire Strikes Back, The (1980)' 'Star Wars (1977)' 'Fargo (1996)'
 'Shawshank Redemption, The (1994)' 'Godfather, The (1972)']


In [6]:
# Item-based run: movies become the rows of the rating/similarity matrices.
transform(df=ratings, user_base=False)

Transformation complete


In [7]:
# Recommend for user 1 from the item-based matrices, 10 nearest neighbours.
# NOTE(review): after transform(ratings, False) the rows of rating_matrix are
# movies, yet movies() picks the score entry at position user_id - 1 -- confirm
# that the titles printed here really belong to user 1.
movies(rating=rating_matrix, similarity=similarity_matrix, user_id=1, k=10)

Our top 5 picks for user 1 are: ['Grifters, The (1990)' '187 (1997)'
 'Star Trek IV: The Voyage Home (1986)' 'Kansas City (1996)'
 'Chamber, The (1996)']


---
## Misc Code
This scratch code will be deleted in the final version

In [None]:
# #Vanilla Version

# score = []
# for i in range(len(rating_matrix)):
    
#     np.argsort
#     #Calculate the prediction
#     prediction = np.nansum(rating_matrix.T* similarity_matrix[i], axis=1) / np.nansum(similarity_matrix[i])
#     score.append(prediction + df_mean.values[i])

In [None]:
# #Function for cosine_similarity
# def cosine(matrix):
#     dot_prod = (matrix[0] * matrix[1]).sum()
#     a_sum = (matrix[0]**2).sum()
#     b_sum = (matrix[1]**2).sum()
#     results = dot_prod / sqrt(a_sum*b_sum)
#     return results

In [None]:
# # results = []
# for i in range(len(rating_matrix)):
#     for j in range(len(rating_matrix)):
#         matrix = np.row_stack((rating_matrix[i], rating_matrix[j]))
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[0])), 1)
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[1])), 1)
#         try:
#             results.append(cosine_similarity(matrix)[0,1])
#         except:
#             results.append(np.nan)

In [None]:
# start_time = time.time()
# results = []
# for i in range(len(test)):
#     for j in range(len(test)):
#         matrix = np.row_stack((test[i], test[j]))
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[0])), 1)
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[1])), 1)
#         print(cosine_similarity(matrix)[0,1])
        
# print("--- %s seconds ---" % (time.time() - start_time))