In [1]:
import pandas as pd    


In [2]:
# Load the MovieLens 100k files.

# Ratings (u.data): tab-separated, no header row.
cols_ratings = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv("ml-100k/u.data", sep='\t', names=cols_ratings)

# Genres (u.genre): one "name|index" entry per non-blank line.
# The encoding is given explicitly (same as u.item below) so the result does
# not depend on the platform's default text encoding.
genre_entries = []
with open("ml-100k/u.genre", "r", encoding="latin-1") as f:
    for line in f:
        entry = line.strip()
        if entry:
            name, idx = entry.rsplit('|', 1)
            genre_entries.append((int(idx), name))
# Order the names by their declared index instead of trusting the line order:
# the names are used positionally as the trailing column names of u.item.
genres = [name for _, name in sorted(genre_entries)]

# Movies (u.item): pipe-separated; five descriptive fields, then one column per genre.
cols_items = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + genres
movies = pd.read_csv("ml-100k/u.item", sep='|', names=cols_items, encoding="latin-1")

# Users (u.user): pipe-separated.
# zip_code is read as text so the codes are never reinterpreted as numbers.
cols_users = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv("ml-100k/u.user", sep='|', names=cols_users, dtype={'zip_code': str})

In [3]:
# Show the genre names parsed from u.genre (they become column names of `movies`).
print("Liste des genres détectés :", genres)

Liste des genres détectés : ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [4]:
# Sanity check: which columns of `movies` are genre names (kept in column order)?
genre_cols_in_movies = movies.columns[movies.columns.isin(genres)].tolist()
print("Genres trouvés dans movies (count) :", len(genre_cols_in_movies))
print("Extrait des colonnes genres :", genre_cols_in_movies[:10])


Genres trouvés dans movies (count) : 19
Extrait des colonnes genres : ['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy']


In [5]:
# Inspect the dtypes of the genre indicator columns.
genre_dtypes = movies[genres].dtypes
print("Dtypes des colonnes genres (avant conversion) :")
print(genre_dtypes)

Dtypes des colonnes genres (avant conversion) :
unknown        int64
Action         int64
Adventure      int64
Animation      int64
Children's     int64
Comedy         int64
Crime          int64
Documentary    int64
Drama          int64
Fantasy        int64
Film-Noir      int64
Horror         int64
Musical        int64
Mystery        int64
Romance        int64
Sci-Fi         int64
Thriller       int64
War            int64
Western        int64
dtype: object


In [6]:
# For every movie, collect the names of the genre columns whose flag equals 1.
genre_flags = movies[genres].to_numpy()
movies['genres_list'] = pd.Series(
    [[name for name, flag in zip(genres, flags) if flag == 1] for flags in genre_flags],
    index=movies.index,
)

# Quick visual check of the first few movies.
preview_cols = ['movie_id', 'title', 'genres_list']
print("Exemples de movies avec leur genres_list :")
print(movies[preview_cols].head(8).to_string(index=False))


Exemples de movies avec leur genres_list :
 movie_id                                                title                     genres_list
        1                                     Toy Story (1995) [Animation, Children's, Comedy]
        2                                     GoldenEye (1995)   [Action, Adventure, Thriller]
        3                                    Four Rooms (1995)                      [Thriller]
        4                                    Get Shorty (1995)         [Action, Comedy, Drama]
        5                                       Copycat (1995)        [Crime, Drama, Thriller]
        6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)                         [Drama]
        7                                Twelve Monkeys (1995)                 [Drama, Sci-Fi]
        8                                          Babe (1995)     [Children's, Comedy, Drama]


In [7]:
# Count the movies whose genres_list is empty, and show a few of them if any exist.
no_genre_mask = movies['genres_list'].map(len) == 0
num_no_genre = no_genre_mask.sum()
print("Nombre de films sans genre détecté :", num_no_genre)
if num_no_genre > 0:
    print("Exemples de films sans genre :")
    print(movies.loc[no_genre_mask, ['movie_id', 'title']].head().to_string(index=False))


Nombre de films sans genre détecté : 0


In [8]:
# Attach movie and user attributes to every rating; left joins keep all rating rows.
df = (
    ratings
    .merge(movies, on='movie_id', how='left')
    .merge(users, on='user_id', how='left')
)

check_cols = ['user_id', 'movie_id', 'title', 'genres_list', 'rating']
print("Vérifier quelques lignes du DataFrame final :")
print(df[check_cols].head(10).to_string(index=False))


Vérifier quelques lignes du DataFrame final :
 user_id  movie_id                                                                       title                           genres_list  rating
     196       242                                                                Kolya (1996)                              [Comedy]       3
     186       302                                                    L.A. Confidential (1997) [Crime, Film-Noir, Mystery, Thriller]       3
      22       377                                                         Heavyweights (1994)                  [Children's, Comedy]       1
     244        51                                                  Legends of the Fall (1994)        [Drama, Romance, War, Western]       2
     166       346                                                         Jackie Brown (1997)                        [Crime, Drama]       1
     298       474 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)              

In [9]:

# 7) Extra checks on the merged frame: its shape and missing genres_list values.
final_shape = df.shape
missing_genres = df['genres_list'].isna().sum()
print("Shape final:", final_shape)
print("Nombre de valeurs null dans genres_list :", missing_genres)

Shape final: (100000, 32)
Nombre de valeurs null dans genres_list : 0


In [10]:
# Persist the merged frame as UTF-8 CSV, without the index column.
# Note: CSV stores genres_list as text, so it is read back as a string, not a list.
merged_csv_path = "movieclean.csv"
df.to_csv(merged_csv_path, index=False, encoding="utf-8")

print("@ DataFrame sauvegardé en movieclean.csv")


@ DataFrame sauvegardé en movieclean.csv


In [11]:


# Reload the merged data from disk and preview the first rows.
# Note: genres_list is read back as a plain string here (see the quoted values
# printed a few cells below), not as a Python list.
df = pd.read_csv("movieclean.csv")
print(df.head())


   user_id  movie_id  rating  timestamp                       title  \
0      196       242       3  881250949                Kolya (1996)   
1      186       302       3  891717742    L.A. Confidential (1997)   
2       22       377       1  878887116         Heavyweights (1994)   
3      244        51       2  880606923  Legends of the Fall (1994)   
4      166       346       1  886397596         Jackie Brown (1997)   

  release_date  video_release_date  \
0  24-Jan-1997                 NaN   
1  01-Jan-1997                 NaN   
2  01-Jan-1994                 NaN   
3  01-Jan-1994                 NaN   
4  01-Jan-1997                 NaN   

                                            imdb_url  unknown  Action  ...  \
0    http://us.imdb.com/M/title-exact?Kolya%20(1996)        0       0  ...   
1  http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...        0       0  ...   
2  http://us.imdb.com/M/title-exact?Heavyweights%...        0       0  ...   
3  http://us.imdb.com/M/title-ex

In [12]:
# List the columns of the reloaded frame.
print(df.columns)


Index(['user_id', 'movie_id', 'rating', 'timestamp', 'title', 'release_date',
       'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure',
       'Animation', 'Children's', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western', 'genres_list', 'age', 'gender',
       'occupation', 'zip_code'],
      dtype='object')


In [13]:
# The per-genre indicator columns are redundant with genres_list, so remove them.
cols_to_drop = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
df = df.drop(cols_to_drop, axis=1)
print(df.columns)

Index(['user_id', 'movie_id', 'rating', 'timestamp', 'title', 'release_date',
       'video_release_date', 'imdb_url', 'genres_list', 'age', 'gender',
       'occupation', 'zip_code'],
      dtype='object')


In [14]:
# Preview titles next to their genres_list value.
print(df[['title', 'genres_list']].head(10))

                                               title  \
0                                       Kolya (1996)   
1                           L.A. Confidential (1997)   
2                                Heavyweights (1994)   
3                         Legends of the Fall (1994)   
4                                Jackie Brown (1997)   
5  Dr. Strangelove or: How I Learned to Stop Worr...   
6                   Hunt for Red October, The (1990)   
7                            Jungle Book, The (1994)   
8                                      Grease (1978)   
9                     Remains of the Day, The (1993)   

                                     genres_list  
0                                     ['Comedy']  
1  ['Crime', 'Film-Noir', 'Mystery', 'Thriller']  
2                       ["Children's", 'Comedy']  
3         ['Drama', 'Romance', 'War', 'Western']  
4                             ['Crime', 'Drama']  
5                              ['Sci-Fi', 'War']  
6                         

In [15]:
# Count rows that are exact duplicates across all columns...
n_exact_duplicates = df.duplicated().sum()
print(n_exact_duplicates)

# ...and keep only the first occurrence of each.
df = df.drop_duplicates()


0


In [16]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)


user_id                    0
movie_id                   0
rating                     0
timestamp                  0
title                      0
release_date               9
video_release_date    100000
imdb_url                  13
genres_list                0
age                        0
gender                     0
occupation                 0
zip_code                   0
dtype: int64


In [17]:
# Drop rows missing release_date or imdb_url, then remove the
# video_release_date column.
df = (
    df
    .dropna(subset=['release_date', 'imdb_url'])
    .drop(columns=['video_release_date'])
)

# Confirm that no missing values remain.
print(df.isna().sum())

# Save the cleaned version under a new file name.
cleaned_csv_path = "movieclean1.csv"
df.to_csv(cleaned_csv_path, index=False, encoding="utf-8")
print("✅ Données nettoyées et sauvegardées dans movieclean1.csv")

user_id         0
movie_id        0
rating          0
timestamp       0
title           0
release_date    0
imdb_url        0
genres_list     0
age             0
gender          0
occupation      0
zip_code        0
dtype: int64
✅ Données nettoyées et sauvegardées dans movieclean1.csv


In [18]:
# Reload the data used for collaborative filtering.
# NOTE(review): this reads movieclean.csv, not the movieclean1.csv written by
# the cleaning cell just above, so that cleaning is not applied from here on.
# Confirm this is intended.
df = pd.read_csv("movieclean.csv")

# Quick look at the three columns the recommender relies on.
rating_cols = ['user_id', 'movie_id', 'rating']
print(df[rating_cols].head())

   user_id  movie_id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1


In [19]:
# Build the user x movie rating matrix (rows: user_id, columns: movie_id).
ratings_wide = df.pivot_table(values='rating', index='user_id', columns='movie_id')

# Movies a user has not rated are NaN after the pivot; encode them as 0.
user_movie_matrix = ratings_wide.fillna(0)

print(user_movie_matrix.shape)


(943, 1682)


In [20]:
# Compute the similarity between users.
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cosine similarity between the rows (users) of the rating matrix.
user_similarity = cosine_similarity(user_movie_matrix)

# Wrap the result in a DataFrame labelled by user_id on both axes for readability.
user_ids = user_movie_matrix.index
user_similarity_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

print(user_similarity_df.head())


user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.166931  0.047460  0.064358  0.378475  0.430239  0.440367   
2        0.166931  1.000000  0.110591  0.178121  0.072979  0.245843  0.107328   
3        0.047460  0.110591  1.000000  0.344151  0.021245  0.072415  0.066137   
4        0.064358  0.178121  0.344151  1.000000  0.031804  0.068044  0.091230   
5        0.378475  0.072979  0.021245  0.031804  1.000000  0.237286  0.373600   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.319072  0.078138  0.376544  ...  0.369527  0.119482  0.274876   
2        0.103344  0.161048  0.159862  ...  0.156986  0.307942  0.358789   
3        0.083060  0.061040  0.065151  ...  0.031875  0.042753  0.163829   
4        0.188060  0.101284  0.060859  ...  0.052107

In [21]:
def recommend_movies(user_id, user_movie_matrix, user_similarity_df, movie_titles, top_n=5, n_neighbors=10):
    """
    Recommend movies for one user with user-based collaborative filtering.

    Each movie the target user has not rated is scored by summing
    ``similarity * rating`` over the nearest neighbours who rated it.

    Parameters
    ----------
    user_id : label of the target user; must be a column of
        ``user_similarity_df`` and a row of ``user_movie_matrix``.
    user_movie_matrix : DataFrame with users as rows and movies as columns.
        Values that are not ``> 0`` are treated as "not rated".
    user_similarity_df : DataFrame of user-to-user similarities, labelled by
        user id on both axes.
    movie_titles : mapping movie_id -> title.
    top_n : maximum number of recommendations returned.
    n_neighbors : number of most similar users taken into account.

    Returns
    -------
    list of ``(movie_id, title)`` tuples, best score first. The title is
    ``"Unknown Title"`` when the id is missing from ``movie_titles``.
    """
    # Step 1: rank the users by similarity, then remove the target user BY LABEL.
    # Skipping the first position instead is only correct when the user ranks
    # first, which fails on ties or when the diagonal has been zeroed.
    neighbor_sims = (
        user_similarity_df[user_id]
        .sort_values(ascending=False)
        .drop(labels=user_id, errors="ignore")
        .iloc[:n_neighbors]
    )

    # Step 2: movies the target user has already rated must not be recommended.
    user_ratings = user_movie_matrix.loc[user_id]
    seen_movies = set(user_ratings[user_ratings > 0].index)

    # Step 3: accumulate similarity-weighted ratings of the neighbours.
    movie_scores = {}
    for sim_user, sim_score in neighbor_sims.items():
        sim_user_ratings = user_movie_matrix.loc[sim_user]
        for movie, rating in sim_user_ratings[sim_user_ratings > 0].items():
            if movie in seen_movies:
                continue
            movie_scores[movie] = movie_scores.get(movie, 0) + sim_score * rating

    # Step 4: keep the top_n best-scored movies (the sort is stable, so ties
    # keep the order in which the movies were first encountered).
    recommended_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Step 5: attach the title to every recommended movie id.
    return [(movie, movie_titles.get(movie, "Unknown Title")) for movie, _ in recommended_movies]


In [22]:
# Build the movie_id -> title lookup from the distinct (movie_id, title) pairs.
title_pairs = df.drop_duplicates(subset=['movie_id', 'title'])
movie_titles = title_pairs.set_index('movie_id')['title'].to_dict()

# Top-5 recommendations for user 1.
target_user = 1
recommended = recommend_movies(target_user, user_movie_matrix, user_similarity_df, movie_titles, top_n=5)

print("🎬 Films recommandés :")
for mid, title in recommended:
    print(f"{mid} - {title}")


🎬 Films recommandés :
318 - Schindler's List (1993)
474 - Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
655 - Stand by Me (1986)
423 - E.T. the Extra-Terrestrial (1982)
403 - Batman (1989)


In [50]:

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Load the dataset saved earlier in the notebook.
df = pd.read_csv("movieclean.csv")

# 1) 80% / 20% train/test split with a fixed seed for reproducibility.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# 2) User x movie rating matrix built from TRAIN only; unrated cells become 0.
train_wide = train.pivot_table(values='rating', index='user_id', columns='movie_id')
user_movie_matrix = train_wide.fillna(0)

# 3) Cosine similarity between the user rows, labelled by user_id on both axes.
train_user_ids = user_movie_matrix.index
user_similarity = cosine_similarity(user_movie_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=train_user_ids, columns=train_user_ids)

# 4) movie_id -> title lookup, taken from the full dataset.
title_pairs = df[['movie_id', 'title']].drop_duplicates()
movie_titles = title_pairs.set_index('movie_id')['title'].to_dict()

# 5️⃣ Fonction de recommandation (identique à avant mais adaptée au train)
def recommend_movies(user_id, user_movie_matrix, user_similarity_df, movie_titles, top_n=10, n_neighbors=10):
    """
    Recommend movie ids for one user with user-based collaborative filtering.

    Each movie the target user has not rated is scored by summing
    ``similarity * rating`` over the nearest neighbours who rated it.

    Parameters
    ----------
    user_id : label of the target user.
    user_movie_matrix : DataFrame with users as rows and movies as columns.
        Values that are not ``> 0`` are treated as "not rated".
    user_similarity_df : DataFrame of user-to-user similarities, labelled by
        user id on both axes.
    movie_titles : unused here; kept so the call signature stays unchanged.
    top_n : maximum number of recommendations returned.
    n_neighbors : number of most similar users taken into account.

    Returns
    -------
    list of movie ids, best score first; an empty list when ``user_id`` is
    not a row of ``user_movie_matrix``.
    """
    if user_id not in user_movie_matrix.index:
        return []  # user has no row in the rating matrix

    # Rank the users by similarity, then remove the target user BY LABEL.
    # Skipping the first position instead is only correct when the user ranks
    # first, which fails on ties or when the diagonal has been zeroed.
    neighbor_sims = (
        user_similarity_df[user_id]
        .sort_values(ascending=False)
        .drop(labels=user_id, errors="ignore")
        .iloc[:n_neighbors]
    )

    # Movies the target user has already rated must not be recommended.
    user_ratings = user_movie_matrix.loc[user_id]
    seen_movies = set(user_ratings[user_ratings > 0].index)

    # Accumulate similarity-weighted ratings of the neighbours.
    movie_scores = {}
    for sim_user, sim_score in neighbor_sims.items():
        sim_user_ratings = user_movie_matrix.loc[sim_user]
        for movie, rating in sim_user_ratings[sim_user_ratings > 0].items():
            if movie in seen_movies:
                continue
            movie_scores[movie] = movie_scores.get(movie, 0) + sim_score * rating

    # Stable sort: ties keep the order in which the movies were first encountered.
    recommended_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return [movie for movie, _ in recommended_movies]

# 6️⃣ Fonction Precision@K
# Fonction pour calculer Precision@K moyenne sur tous les utilisateurs
def average_precision_at_k(user_movie_matrix, user_similarity_df, movie_titles, test, k=4, top_n=10, threshold=3.0):
    """
    Mean Precision@k over the users of ``test``.

    For each user, the relevant movies are those rated ``>= threshold`` in
    ``test``. The user's precision is the share of their first ``k``
    recommendations that are relevant. Users with no relevant movie are
    skipped.

    Parameters
    ----------
    user_movie_matrix, user_similarity_df, movie_titles : forwarded unchanged
        to ``recommend_movies``.
    test : DataFrame with ``user_id``, ``movie_id`` and ``rating`` columns.
    k : cut-off of the ranking; must be positive.
    top_n : number of recommendations requested; raised to ``k`` when smaller,
        so that a short list cannot artificially lower the precision.
    threshold : minimum test rating for a movie to count as relevant.

    Returns
    -------
    The mean of the per-user precisions, or 0 when no user could be evaluated.

    Raises
    ------
    ValueError : if ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Group the relevant movies per user in a single pass instead of
    # re-filtering the whole test frame once per user.
    relevant_rows = test[test['rating'] >= threshold]
    relevant_by_user = {}
    for uid, movie_id in zip(relevant_rows['user_id'], relevant_rows['movie_id']):
        relevant_by_user.setdefault(uid, set()).add(movie_id)

    precisions = []
    for user_id in test['user_id'].unique():
        relevant = relevant_by_user.get(user_id)
        if not relevant:
            continue  # no relevant movie for this user: skip before computing anything

        recommended = recommend_movies(
            user_id, user_movie_matrix, user_similarity_df, movie_titles, top_n=max(top_n, k)
        )
        hits = relevant.intersection(recommended[:k])
        precisions.append(len(hits) / k)

    if not precisions:
        return 0
    return sum(precisions) / len(precisions)

# Mean Precision@K over the test users.
# K is defined once so the cut-off passed to the metric and the label printed
# below cannot disagree (the label used to say @5 while k=4 was evaluated).
K = 4
avg_precision = average_precision_at_k(user_movie_matrix, user_similarity_df, movie_titles, test, k=K, top_n=10, threshold=3.0)
print(f"📊 Average Precision@{K} = {avg_precision:.2f}")


📊 Average Precision@5 = 0.34


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-win_amd64.whl size=1294141 sha256=daaa7defbd4ad341409a3096ec858eef5a92ed731770b8e6e0fff29b090811f2
  Stored in directory: c:\users\rania\appdata\local\pip\cache\wheels\4b\3f\df\6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise

RMSE: 0.9542
🎬 Recommandations pour user 1 : ['Boot, Das (1981)', 'Fried Green Tomatoes (1991)', 'Secrets & Lies (1996)', 'Lawrence of Arabia (1962)', 'Butch Cassidy and the Sundance Kid (1969)']


📊 Average Precision@5 = 0.00
