# Project

### Import packages

On commence par installer et importer les packages qui nous seront nécessaires lors de l'implémentation du projet.

In [1]:
%pip install numpy
%pip install pandas
%pip install implicit

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import re

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from implicit.als import AlternatingLeastSquares
import scipy.sparse as sparse

# Tasks

## Task 1: Data Collection and Preprocessing

In [3]:
# Function to extract the year from title
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    if match:
        return match.group(1)
    return None


# Function to remove the year from title
def remove_year(title):
    return re.sub(r'\s*\(\d{4}\)', '', title).strip()

### Datasets initialization

On commence par charger toute la donnée dans différents dataframes. Cette procédure est détaillée dans les cellules suivantes.

In [4]:
# Load movies
movies_df = pd.read_csv("data/movies.dat", names=["MovieID", "Title", "Genres"], delimiter="::", encoding="latin-1")

# We separate the title from the year 
movies_df["Year"] = movies_df["Title"].apply(extract_year)
movies_df["Title"] = movies_df["Title"].apply(remove_year)

movies_df

  movies_df = pd.read_csv("data/movies.dat", names=["MovieID", "Title", "Genres"], delimiter="::", encoding="latin-1")


Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents,Comedy,2000
3879,3949,Requiem for a Dream,Drama,2000
3880,3950,Tigerland,Drama,2000
3881,3951,Two Family House,Drama,2000


In [5]:
# Load ratings
ratings_df = pd.read_csv("data/ratings.dat", names=["UserID", "MovieID", "Rating", "Timestamp"], delimiter="::", encoding="latin-1")

ratings_df

  ratings_df = pd.read_csv("data/ratings.dat", names=["UserID", "MovieID", "Rating", "Timestamp"], delimiter="::", encoding="latin-1")


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# Load users
users_df = pd.read_csv("data/users.dat", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"], delimiter="::", encoding="latin-1")

users_df

  users_df = pd.read_csv("data/users.dat", names=["UserID", "Gender", "Age", "Occupation", "Zip-code"], delimiter="::", encoding="latin-1")


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [7]:
# Load title.basics dataset
movies2_df = pd.read_csv("data/title.basics.tsv", delimiter='\t')

movies2_df

  movies2_df = pd.read_csv("data/title.basics.tsv", delimiter='\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10904341,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10904342,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10904343,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10904344,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [8]:
# Load staff dataset (actors, producers, etc...)
staff_df = pd.read_csv("data/name.basics.tsv", delimiter='\t')

# Filter only those who contributed to at least one movie
staff_df = staff_df.loc[staff_df["knownForTitles"] != r"\N"]

staff_df

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467"
...,...,...,...,...,...,...
13614331,nm9993709,Lu Bevins,\N,\N,"producer,director,writer","tt17717854,tt11772904,tt11772812,tt11697102"
13614335,nm9993713,Sambit Mishra,\N,\N,"writer,producer","tt20319332,tt27191658,tt10709066,tt15134202"
13614336,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department","tt11657662,tt14069590,tt2455546"
13614338,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744


On merge les dataframes **movies_df** et **movies2_df** afin d'obtenir un dataframe comportant toutes les données connues pour chaque film. Nous nous occuperons plus tard de filtrer ces données pour ne garder que les plus pertinentes.

In [9]:
# Merging datasets to include movie ratings, titles, genres, cast, year, adult category label, runtime etc...
matched_movies = pd.concat([
    movies_df.merge(movies2_df, how='inner', left_on=['Year', 'Title'], right_on=['startYear', 'primaryTitle']),
    movies_df.merge(movies2_df, how='inner', left_on=['Year', 'Title'], right_on=['startYear', 'originalTitle'])
])

matched_movies.drop_duplicates(inplace=True)
matched_movies.reset_index(drop=True, inplace=True)

matched_movies

Unnamed: 0,MovieID,Title,Genres,Year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy"
1,2,Jumanji,Adventure|Children's|Fantasy,1995,tt0113497,movie,Jumanji,Jumanji,0,1995,\N,104,"Adventure,Comedy,Family"
2,3,Grumpier Old Men,Comedy|Romance,1995,tt0113228,movie,Grumpier Old Men,Grumpier Old Men,0,1995,\N,101,"Comedy,Romance"
3,4,Waiting to Exhale,Comedy|Drama,1995,tt0114885,movie,Waiting to Exhale,Waiting to Exhale,0,1995,\N,124,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II,Comedy,1995,tt0113041,movie,Father of the Bride Part II,Father of the Bride Part II,0,1995,\N,106,"Comedy,Family,Romance"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2870,3434,Death Wish V: The Face of Death,Action|Drama,1994,tt0109578,movie,Death Wish: The Face of Death,Death Wish V: The Face of Death,0,1994,\N,95,"Action,Crime,Drama"
2871,3509,Black and White,Drama,1999,tt0165643,movie,Black & White,Black and White,0,1999,\N,98,"Crime,Drama,Music"
2872,3707,Nine 1/2 Weeks,Drama,1986,tt0091635,movie,9½ Weeks,Nine 1/2 Weeks,0,1986,\N,117,"Drama,Romance"
2873,3751,Chicken Run,Animation|Children's|Comedy,2000,tt0276856,videoGame,Chicken Run: The Video Game,Chicken Run,0,2000,\N,\N,"Action,Comedy,Family"


## Task 2: Feature Engineering

Dans un premier temps, je décide de ne conserver que les films car il s'agit du sujet de ce projet

In [10]:
# Keeping only movie since we want to make a movie recommender system
matched_movies = matched_movies[matched_movies["titleType"] == "movie"]

matched_movies.reset_index(drop=True, inplace=True)

matched_movies

Unnamed: 0,MovieID,Title,Genres,Year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy"
1,2,Jumanji,Adventure|Children's|Fantasy,1995,tt0113497,movie,Jumanji,Jumanji,0,1995,\N,104,"Adventure,Comedy,Family"
2,3,Grumpier Old Men,Comedy|Romance,1995,tt0113228,movie,Grumpier Old Men,Grumpier Old Men,0,1995,\N,101,"Comedy,Romance"
3,4,Waiting to Exhale,Comedy|Drama,1995,tt0114885,movie,Waiting to Exhale,Waiting to Exhale,0,1995,\N,124,"Comedy,Drama,Romance"
4,5,Father of the Bride Part II,Comedy,1995,tt0113041,movie,Father of the Bride Part II,Father of the Bride Part II,0,1995,\N,106,"Comedy,Family,Romance"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2424,3420,...And Justice for All,Drama|Thriller,1979,tt0078718,movie,And Justice for All,...And Justice for All,0,1979,\N,119,"Crime,Drama,Thriller"
2425,3434,Death Wish V: The Face of Death,Action|Drama,1994,tt0109578,movie,Death Wish: The Face of Death,Death Wish V: The Face of Death,0,1994,\N,95,"Action,Crime,Drama"
2426,3509,Black and White,Drama,1999,tt0165643,movie,Black & White,Black and White,0,1999,\N,98,"Crime,Drama,Music"
2427,3707,Nine 1/2 Weeks,Drama,1986,tt0091635,movie,9½ Weeks,Nine 1/2 Weeks,0,1986,\N,117,"Drama,Romance"


Puis nous ajoutons à chaque film une liste des ids du staff qui a participé à sa réalisation

In [11]:
# Link casts to the movies
staff_df['knownForTitles'] = staff_df['knownForTitles'].str.split(',')
staff_df = staff_df.explode('knownForTitles')

matched_movies = matched_movies.merge(staff_df, left_on='tconst', right_on='knownForTitles', how='left')

matched_movies

Unnamed: 0,MovieID,Title,Genres,Year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0000741,Tim Allen,1953,\N,"actor,producer,miscellaneous",tt0114709
1,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0001652,John Ratzenberger,1947,\N,"actor,director,producer",tt0114709
2,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0001815,Jim Varney,1949,2000,"actor,writer,soundtrack",tt0114709
3,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0005124,John Lasseter,1957,\N,"producer,writer,miscellaneous",tt0114709
4,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0005271,Randy Newman,1943,\N,"music_artist,music_department,composer",tt0114709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213910,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm0941945,Wolfgang Woytt,\N,\N,actor,tt0130444
213911,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm0952883,Sarath Zander,\N,\N,art_department,tt0130444
213912,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm0956823,Frank Zimmermann,1964,\N,"actor,transportation_department",tt0130444
213913,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm3610857,Rolf Holtappel,\N,\N,art_department,tt0130444


In [12]:
cast_df = matched_movies.groupby('MovieID')['nconst'].apply(lambda x: ','.join(x.dropna().unique())).reset_index()

cast_df

Unnamed: 0,MovieID,nconst
0,1,"nm0000741,nm0001652,nm0001815,nm0005124,nm0005..."
1,2,"nm0001372,nm0001564,nm0002123,nm0003742,nm0036..."
2,3,"nm0025908,nm0075828,nm0114089,nm0117321,nm0132..."
3,4,"nm0001365,nm0004892,nm0005375,nm0007478,nm0021..."
4,5,"nm0003028,nm0015233,nm0027403,nm0029416,nm0030..."
...,...,...
2406,3947,"nm0001180,nm0005982,nm0034873,nm0040742,nm0041..."
2407,3948,"nm0001100,nm0001632,nm0006633,nm0009016,nm0034..."
2408,3949,"nm0000124,nm0000995,nm0001467,nm0002486,nm0003..."
2409,3950,"nm0036520,nm0038614,nm0115938,nm0156214,nm0186..."


In [13]:
cast_df.rename(columns={'nconst': 'cast'}, inplace=True)

# Fusionner avec full_movies_df pour ajouter la colonne 'cast'
matched_movies = pd.merge(matched_movies, cast_df, on='MovieID', how='left')

In [14]:
matched_movies

Unnamed: 0,MovieID,Title,Genres,Year,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,cast
0,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0000741,Tim Allen,1953,\N,"actor,producer,miscellaneous",tt0114709,"nm0000741,nm0001652,nm0001815,nm0005124,nm0005..."
1,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0001652,John Ratzenberger,1947,\N,"actor,director,producer",tt0114709,"nm0000741,nm0001652,nm0001815,nm0005124,nm0005..."
2,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0001815,Jim Varney,1949,2000,"actor,writer,soundtrack",tt0114709,"nm0000741,nm0001652,nm0001815,nm0005124,nm0005..."
3,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0005124,John Lasseter,1957,\N,"producer,writer,miscellaneous",tt0114709,"nm0000741,nm0001652,nm0001815,nm0005124,nm0005..."
4,1,Toy Story,Animation|Children's|Comedy,1995,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",nm0005271,Randy Newman,1943,\N,"music_artist,music_department,composer",tt0114709,"nm0000741,nm0001652,nm0001815,nm0005124,nm0005..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213910,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm0941945,Wolfgang Woytt,\N,\N,actor,tt0130444,"nm0021256,nm0025974,nm0062383,nm0070605,nm0076..."
213911,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm0952883,Sarath Zander,\N,\N,art_department,tt0130444,"nm0021256,nm0025974,nm0062383,nm0070605,nm0076..."
213912,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm0956823,Frank Zimmermann,1964,\N,"actor,transportation_department",tt0130444,"nm0021256,nm0025974,nm0062383,nm0070605,nm0076..."
213913,3854,Aimée & Jaguar,Drama|Romance,1999,tt0130444,movie,Aimee & Jaguar,Aimée & Jaguar,0,1999,\N,125,"Biography,Drama,Romance",nm3610857,Rolf Holtappel,\N,\N,art_department,tt0130444,"nm0021256,nm0025974,nm0062383,nm0070605,nm0076..."


On vérifie s'il existe des films pour adulte dans le dataframe résultant.

In [15]:
# Checking if there is any adult movies in the merged dataset
len(matched_movies.loc[matched_movies["isAdult"] == 1])

0

On se rend compte qu'aucun film n'est classé comme étant pour adulte. On en déduit que la colonne **isAdult** n'est pas nécessaire dans les données.

On supprime donc les colonnes non nécessaires.

In [16]:
# Drop unecessary columns
cols = ["tconst", "titleType", "primaryTitle", "originalTitle", "isAdult", "startYear", "endYear", "genres", "primaryName", "birthYear", "deathYear", "primaryProfession", "knownForTitles", "nconst"]
filtered_movie_features = matched_movies.drop(cols, axis=1)

filtered_movie_features.drop_duplicates(subset="MovieID", inplace=True)

filtered_movie_features

Unnamed: 0,MovieID,Title,Genres,Year,runtimeMinutes,cast
0,1,Toy Story,Animation|Children's|Comedy,1995,81,"nm0000741,nm0001652,nm0001815,nm0005124,nm0005..."
135,2,Jumanji,Adventure|Children's|Fantasy,1995,104,"nm0001372,nm0001564,nm0002123,nm0003742,nm0036..."
215,3,Grumpier Old Men,Comedy|Romance,1995,101,"nm0025908,nm0075828,nm0114089,nm0117321,nm0132..."
288,4,Waiting to Exhale,Comedy|Drama,1995,124,"nm0001365,nm0004892,nm0005375,nm0007478,nm0021..."
432,5,Father of the Bride Part II,Comedy,1995,106,"nm0003028,nm0015233,nm0027403,nm0029416,nm0030..."
...,...,...,...,...,...,...
213465,3091,Kagemusha,Drama|War,1980,162,"nm0000041,nm0040880,nm0157121,nm0234393,nm0247..."
213533,3420,...And Justice for All,Drama|Thriller,1979,119,"nm0001441,nm0007150,nm0032640,nm0036663,nm0046..."
213603,3434,Death Wish V: The Face of Death,Action|Drama,1994,95,"nm0026413,nm0104820,nm0116653,nm0123935,nm0138..."
213700,3707,Nine 1/2 Weeks,Drama,1986,117,"nm0004841,nm0006129,nm0009620,nm0018726,nm0019..."


On crée la user-item interaction matrix qui nous servira à faire du collaborative filtering.

In [17]:
# Compute the sparse raking matrix (one line per user, one column per movie)
# Start by filtering ratings to existing movies
filtered_ratings_df = ratings_df[ratings_df["MovieID"].isin(filtered_movie_features["MovieID"])]

# Create user-item matrix representing the ratings of a movie from each user
user_item_matrix = filtered_ratings_df.pivot(index="UserID", columns="MovieID", values="Rating").fillna(0)

user_item_matrix

MovieID,1,2,3,4,5,6,7,8,9,10,...,3942,3943,3944,3945,3946,3947,3948,3949,3950,3951
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
print(user_item_matrix.shape)

(6040, 2333)


On combile les données des deux utilisateurs pour obtenir la donnée représentative du couple.

In [19]:
# We will compute the average rating for each movie from the couple's ratings
def compute_couple_data(user_id_1, user_id_2):
    ratings_1 = user_item_matrix.loc[user_id_1]
    ratings_2 = user_item_matrix.loc[user_id_2]
    ratings = pd.concat([ratings_1, ratings_2], axis=1)
    ratings["mean"] = ratings.mean(axis=1)

    return ratings["mean"]

# Test the compute_couple_data function with two random users
compute_couple_data(1, 6040)


MovieID
1       4.0
2       0.0
3       0.0
4       0.0
5       0.0
       ... 
3947    0.0
3948    0.0
3949    0.0
3950    0.0
3951    0.0
Name: mean, Length: 2333, dtype: float64

In [20]:
# Test: take the 2 most rating users and recommend them 5 movies
user_ranking_df = ratings_df["UserID"].value_counts()
ids = user_ranking_df.head(2).index.to_list()

result = compute_couple_data(ids[0], ids[1])

# Recommend the 5 best rated movies
result.sort_values(ascending=False).head(5)

MovieID
2020    5.0
1029    5.0
3724    5.0
2664    5.0
1012    5.0
Name: mean, dtype: float64

## Task 3: Model Development

Pour faire les prédictions sur les notes attribuées aux utilisateurs pour chaque film, nous implémenterons l'algorithme Alternating Least Squares (ALS).

Les raisons sont l'efficacité computationnelle de cet algorithme, le fait que nous avons un dataset suffisament grand et que notre user-item matrix peut être factorisée en deux matrices. De plus, l'algorithme ALS se distribue très facilement avec Spark.

Toutefois, en raison de la faible taille du dataset sur lequel nous travaillons durant ce projet, nous ne ferons pas de parallélisme.

Avant de commencer à implémenter l'algorithme, j'ai d'abord voulu normaliser les notes dans l'intervalle [0;1] afin d'accélérer la convergence du modèle

In [21]:
# Function to normalize ratings before executing ALS algorith
def normalize_ratings(ratings):
    scaler = MinMaxScaler((0, 1))
    normalized_ratings = ratings.copy()

    for column in normalized_ratings.columns:
        non_zero_mask = normalized_ratings[column] > 0
        normalized_ratings.loc[non_zero_mask, column] = scaler.fit_transform(normalized_ratings.loc[non_zero_mask, column].values.reshape(-1, 1)).flatten()
    return normalized_ratings, scaler

Puis je définis le modèle. Pou ce projet, j'ai décidé d'utiliser la valeur par défaut de l'hyperparamètre factors, qui représente le nombre de features pour chaque film et chaque user. Il est par défaut fixé à 64.

In [22]:
model = AlternatingLeastSquares(factors=64, regularization=0.01, iterations=50, calculate_training_loss=True)
normalized_ratings, scaler = normalize_ratings(user_item_matrix)
sparse_matrix = sparse.csr_matrix(normalized_ratings.values)

# train the model on a sparse matrix of user/item/confidence weights
model.fit(sparse_matrix)


  check_blas_config()


  0%|          | 0/50 [00:00<?, ?it/s]

In [23]:
print(f"Number of users in the model: {model.user_factors.shape[0]}")
print(f"Number of users in user_mapping: {len(user_item_matrix.index)}")

print(f"Number of movies in the model: {model.item_factors.shape[0]}")
print(f"Number of movies in item_mapping: {len(user_item_matrix.columns)}")

Number of users in the model: 6040
Number of users in user_mapping: 6040
Number of movies in the model: 2333
Number of movies in item_mapping: 2333


## Task 4: Recommendation Algorithm

In [24]:
model.item_factors.shape

(2333, 64)

In [25]:
model.user_factors.shape

(6040, 64)

On définit une fonction pour faire des recommandations sur un couple (on fait la moyenne de leurs facteurs pou cela). Enfin, on teste la fonction avec le couple des users ayant noté le plus de films.

In [26]:
def recommend_movies(users, als, user_mapping, item_mapping, movies, num_recommendations):
    users_indexes = [user_mapping.get_loc(users[0]), user_mapping.get_loc(users[1])]
    users_factors = [als.user_factors[users_indexes[0]], als.user_factors[users_indexes[1]]]
    
    # Compute factors for the couple of users (simple mean)
    combined_factors = (users_factors[0] + users_factors[1]) / 2
    
    scores = als.item_factors.dot(combined_factors)

    # Get the best rated movies
    top_items = np.argsort(scores)[::-1][:num_recommendations]
    
    # Get corresponding MovieIDs
    recommended_movie_ids = item_mapping[top_items].tolist()
    recommended_movies = movies[movies['MovieID'].isin(recommended_movie_ids)].copy()
    
    # Scale the scores back to the original discrete [0, 5] range
    predicted_ratings = (scores[top_items] * 5).clip(0, 5)
    recommended_movies.loc[:, 'PredictedRating'] = predicted_ratings
    
    return recommended_movies.sort_values('PredictedRating', ascending=False)



In [27]:
# Use the new model for recommendations
recommendations = recommend_movies(ids, model, user_item_matrix.index, user_item_matrix.columns, filtered_movie_features, 1000)
recommendations

Unnamed: 0,MovieID,Title,Genres,Year,runtimeMinutes,cast,PredictedRating
135,2,Jumanji,Adventure|Children's|Fantasy,1995,104,"nm0001372,nm0001564,nm0002123,nm0003742,nm0036...",5.000000
34520,523,Ruby in Paradise,Drama,1993,114,"nm0016746,nm0050704,nm0055521,nm0075375,nm0105...",5.000000
32326,485,Last Action Hero,Action|Comedy,1993,130,"nm0000216,nm0000719,nm0001097,nm0001532,nm0004...",5.000000
32791,490,Malice,Thriller,1993,107,"nm0000887,nm0001198,nm0002578,nm0004448,nm0017...",5.000000
32846,492,Manhattan Murder Mystery,Comedy|Mystery,1993,104,"nm0005688,nm0011742,nm0012178,nm0040147,nm0061...",5.000000
...,...,...,...,...,...,...,...
213345,2670,"Run Silent, Run Deep",War,1958,93,"nm0063459,nm0186576,nm0223290,nm0289375,nm0310...",2.515797
213362,2782,Pit and the Pendulum,Horror,1961,80,"nm0013151,nm0026035,nm0323073,nm0449734,nm0559...",2.508554
213369,2788,And Now for Something Completely Different,Comedy,1971,88,"nm0057733,nm0095665,nm0111358,nm0143513,nm0166...",2.507338
213533,3420,...And Justice for All,Drama|Thriller,1979,119,"nm0001441,nm0007150,nm0032640,nm0036663,nm0046...",2.506653


## Task 5: Evaluation

Avec l'algorithme Alternating Least Square (ALS), nous voulons minimiser deux fonctions d'erreur afin de nous approcher le plus possible des vraies notes contenues dans la matrice user-item. \
C'est pour cela que nous utiliserons la Mean Square Error comme métrique d'évaluation des performances de notre système de recommandations.

In [28]:
# Split the data into training and test sets
train_data, test_data = train_test_split(filtered_ratings_df, test_size=0.2, random_state=23, shuffle=True)

In [29]:
len(train_data)

557565

In [30]:
len(test_data)

139392

On crée ensuite les user-item matrixes pour les deux sets (entraînement et test)

In [31]:
# Create user-item matrixes for both training and testing sets
train_user_item_matrix = train_data.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)
test_user_item_matrix = test_data.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)

# Train the model on the training set
train_model = AlternatingLeastSquares(factors=64, regularization=0.01, iterations=50, calculate_training_loss=True)
normalized_ratings, scaler = normalize_ratings(train_user_item_matrix)
sparse_train_matrix = sparse.csr_matrix(normalized_ratings.values)

train_model.fit(sparse_train_matrix)


  0%|          | 0/50 [00:00<?, ?it/s]

In [32]:
def make_predictions(user_id, als, users, items):
    user_idx = users.get_loc(user_id)
    user_factors = als.user_factors[user_idx]
    scores = als.item_factors.dot(user_factors)
    predicted_ratings = pd.Series(scores, index=items).clip(0, 5)

    return predicted_ratings

On génère des prédictions sur le test set

In [33]:
# Generate predictions for the test set
predictions = pd.DataFrame(index=test_user_item_matrix.index, columns=test_user_item_matrix.columns)
common_users = test_user_item_matrix.index.intersection(user_item_matrix.index)

for user_id in common_users:
    predictions.loc[user_id] = make_predictions(user_id, train_model, train_user_item_matrix.index, train_user_item_matrix.columns)

true_ratings = test_user_item_matrix.values.flatten()
predicted_ratings = predictions.values.flatten()

# Remove nan
mask = ~np.isnan(true_ratings) & ~pd.isna(predicted_ratings)
true_ratings = true_ratings[mask]
predicted_ratings = predicted_ratings[mask]


Enfin, on calcule la Mean Squared Error (MSE) sur les prédictions

In [34]:
# Calculate MSE
mse = mean_squared_error(true_ratings, predicted_ratings)

print(f'MSE: {mse}')

MSE: 0.14230376021848132
