### Movie Prediction by User Scores

- [Article Link](https://towardsdatascience.com/predict-movie-ratings-with-user-based-collaborative-filtering-392304b988af)

In [13]:
import pandas as pd
import numpy as np
import pandas_profiling
import io

In [39]:
# Constaints:
INPUT_PATH = 'C:\Data Sciences\Repo\Data-Sciences\Full Projects\Predict Movie Ratings\inputs'
OUTPUT_PATH = 'C:\Data Sciences\Repo\Data-Sciences\Full Projects\Predict Movie Ratings\outputs'

In [40]:
ratings_df = pd.read_csv(INPUT_PATH + '\small_movie_ratings.csv', index_col=0)

In [41]:
def analysis(df):
    
    
    print("------------DIMENSIONS ----------")
    print("Observation:", df.shape[0], "Column:", df.shape[1], "\n")

    print("--------------DTYPES------------- ")
    if len(df.select_dtypes("object").columns) > 0:
        print("Object Variables:", "\n", "variables:", 
              len(df.select_dtypes("object").columns), "\n", 
              df.select_dtypes("object").columns.tolist(), "\n")

    if len(df.select_dtypes("integer").columns) > 0:
        print("Integer Variables:", "\n", "variables:", 
              len(df.select_dtypes("integer").columns), "\n", 
              df.select_dtypes("integer").columns.tolist(), "\n")

    if len(df.select_dtypes("float").columns) > 0:
        print("Float Variables:", "\n", "variables:", 
              len(df.select_dtypes("float").columns), "\n", 
              df.select_dtypes("float").columns.tolist(), "\n")

    if len(df.select_dtypes("bool").columns) > 0:
        print("Bool Variables:", "\n", "variables:", 
              len(df.select_dtypes("bool").columns), "\n", 
              df.select_dtypes("bool").columns.tolist(), "\n")

    print("--------------MISSING VALUE----------")
    print("Any missing values? \n ", np.where(df.isnull().values.any() == False,  "No missing value❌", "Data includes missing value✔️"), "\n")

    buf = io.StringIO()
    df.info(buf=buf)
    info = buf.getvalue().split('\n')[-2].split(":")[1].strip()
    print("--------------MEMORY USAGE------------ \n", info)
    
    return df

df = analysis(ratings_df)
df.head()

------------DIMENSIONS ----------
Observation: 15 Column: 10 

--------------DTYPES------------- 
Float Variables: 
 variables: 10 
 ['15', '30', '311', '452', '468', '509', '547', '564', '624', '73'] 

--------------MISSING VALUE----------
Any missing values? 
  Data includes missing value✔️ 

--------------MEMORY USAGE------------ 
 1.3+ KB


Unnamed: 0_level_0,15,30,311,452,468,509,547,564,624,73
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
88 Minutes,2.0,4.0,2.0,,2.0,2.0,1.5,2.0,3.0,3.5
A Time to Kill,3.0,5.0,2.5,2.0,2.0,3.0,,4.0,3.0,3.0
Barry Lyndon,4.0,,3.0,3.0,3.0,3.5,4.0,5.0,4.0,2.5
Gleaming the Cube,2.5,4.0,4.0,,3.0,3.0,3.0,4.0,3.0,4.0
Judgment Night,4.0,4.0,4.0,4.0,3.5,3.0,2.5,1.0,,4.5


In [42]:
#ratings_df_profile = pandas_profiling.ProfileReport(ratings_df)
#ratings_df_profile.to_file('Ratings Dataframe Profile.html')

In [50]:
def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):

    """Find correlation between two users based on their rated movies using Pearson correlation"""

    rated_movies_by_both = ratings_df[[user1, user2]].dropna(axis=0).values
    user1_ratings = rated_movies_by_both[:, 0]
    user2_ratings = rated_movies_by_both[:, 1]
    return np.corrcoef(user1_ratings, user2_ratings)[0, 1]

users = list(ratings_df.columns)
movies = list(ratings_df.index)
similarity_matrix = np.array([[find_correlation_between_two_users(ratings_df, user1, user2) for user1 in users] for user2 in users])
similarity_df = pd.DataFrame(similarity_matrix, columns=users, index=users)
#similarity_df.to_csv(r'C:\Data Sciences\Repo\Data-Sciences\Full Projects\Predict Movie Ratings\outputs\User_Correlations_Behavior.csv')
similarity_df.to_csv(OUTPUT_PATH + 'User_Correlations_Behavior.csv')
similarity_df

Unnamed: 0,15,30,311,452,468,509,547,564,624,73
15,1.0,0.395367,0.305552,0.230556,0.43494,0.469956,0.123855,0.006502,0.267311,0.462184
30,0.395367,1.0,-0.186997,0.140313,0.102723,0.535891,0.330386,-0.154949,-0.122837,0.122264
311,0.305552,-0.186997,1.0,0.746033,0.344309,0.238744,-0.013878,-0.011111,-0.016278,0.513114
452,0.230556,0.140313,0.746033,1.0,0.807781,0.453188,0.145556,-0.534522,0.537484,0.449013
468,0.43494,0.102723,0.344309,0.807781,1.0,0.595241,0.606714,-0.090911,0.676868,0.500932
509,0.469956,0.535891,0.238744,0.453188,0.595241,1.0,0.734303,-0.204034,0.554024,0.511659
547,0.123855,0.330386,-0.013878,0.145556,0.606714,0.734303,1.0,0.344611,0.436309,-0.072267
564,0.006502,-0.154949,-0.011111,-0.534522,-0.090911,-0.204034,0.344611,1.0,-0.42361,-0.440686
624,0.267311,-0.122837,-0.016278,0.537484,0.676868,0.554024,0.436309,-0.42361,1.0,0.501961
73,0.462184,0.122264,0.513114,0.449013,0.500932,0.511659,-0.072267,-0.440686,0.501961,1.0


In [51]:
def get_rated_user_for_a_movie(ratings_df: pd.DataFrame, movie: str):
    return ratings_df.loc[movie, :].dropna().index.values

In [52]:
def get_top_neighbors(similarity_df: pd.DataFrame, user: str, rated_users: str, n_neighbors: int):
    return similarity_df[user][rated_users].nlargest(n_neighbors).to_dict()

In [53]:
def subtract_bias(rating: float, mean_rating: float):
    return rating - mean_rating


def get_neighbor_rating_without_bias_per_movie(
    ratings_df: pd.DataFrame, user: str, movie: str
):
    """Substract the rating of a user from the mean rating of that user to eliminate bias"""
    mean_rating = ratings_df[user].mean()
    rating = ratings_df.loc[movie, user]
    return subtract_bias(rating, mean_rating)
    
def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, movie: str):
    """Get the ratings of all neighbors after adjusting for biases"""
    return [
        get_neighbor_rating_without_bias_per_movie(ratings_df, neighbor, movie)
        for neighbor in neighbors
    ]

In [54]:
def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):
    weighted_sum = np.array(ratings).dot(np.array(neighbor_distance))
    abs_neigbor_distance = np.abs(neighbor_distance)
    return weighted_sum / np.sum(abs_neigbor_distance)

In [55]:
def ger_user_rating(ratings_df: pd.DataFrame, user: str, avg_neighbor_rating: float):
    user_avg_rating = ratings_df[user].mean()
    return round(user_avg_rating + avg_neighbor_rating, 2)

In [56]:
def predict_rating(
    df: pd.DataFrame,
    similarity_df: pd.DataFrame,
    user: str,
    movie: str,
    n_neighbors: int = 2,
):
    """Predict the rating of a user for a movie based on the ratings of neighbors"""
    ratings_df = df.copy()

    rated_users = get_rated_user_for_a_movie(ratings_df, movie)

    top_neighbors_distance = get_top_neighbors(
        similarity_df, user1, rated_users, n_neighbors
    )
    neighbors, distance = top_neighbors_distance.keys(), top_neighbors_distance.values()

    print(f"Top {n_neighbors} neighbors of user {user}, {movie}: {list(neighbors)}")

    ratings = get_ratings_of_neighbors(ratings_df, neighbors, movie)
    avg_neighbor_rating = get_weighted_average_rating_of_neighbors(
        ratings, list(distance)
    )

    return ger_user_rating(ratings_df, user, avg_neighbor_rating)

In [57]:
full_ratings = ratings_df.copy()

for user, movies in full_ratings.iteritems():
    for movie in movies.keys():
        if np.isnan(full_ratings.loc[movie, user]):
            full_ratings.loc[movie, user] = predict_rating(
                ratings_df, similarity_df, user, movie
            )

NameError: name 'user1' is not defined