In [4]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
from scipy.sparse import lil_matrix

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
data = pd.read_parquet('final_df.parquet')

In [3]:
unique_users = data['ClientID'].unique()
unique_items = data['ProductID'].unique()

user_id_map = {id_: idx for idx, id_ in enumerate(unique_users)}
item_id_map = {id_: idx for idx, id_ in enumerate(unique_items)}

reverse_user_map = {v: k for k, v in user_id_map.items()}
reverse_item_map = {v: k for k, v in item_id_map.items()}

In [4]:
interactions_df = data.groupby(["ClientID", "ProductID"]).agg({"Quantity_sold": "sum"}).reset_index()
interactions_df["interaction"] = 1

interactions_df['user_idx'] = interactions_df['ClientID'].map(user_id_map)
interactions_df['item_idx'] = interactions_df['ProductID'].map(item_id_map)

print("Interactions agrégées :")
display(interactions_df.head())

Interactions agrégées :


Unnamed: 0,ClientID,ProductID,Quantity_sold,interaction,user_idx,item_idx
0,39370740138294,1440540189819119258,2,1,200392,5914
1,39370740138294,1830527890021285361,1,1,200392,18677
2,39370740138294,3095307643986324846,2,1,200392,1730
3,41168361649132,4786639895536460836,3,1,92951,208
4,118552773044620,7951360260357319889,2,1,247522,1837


In [5]:
user_features_df = data.groupby("ClientID").first().reset_index()[
    ["ClientID", "Age", "ClientGender", "ClientSegment", "ClientCountry"]
]
user_features_df['user_idx'] = user_features_df['ClientID'].map(user_id_map)

item_features_df = data.groupby("ProductID").first().reset_index()[
    ["ProductID", "Category", "FamilyLevel1", "FamilyLevel2", "Universe", "Brand", "AveragePrice"]
]
item_features_df['item_idx'] = item_features_df['ProductID'].map(item_id_map)

print("Features utilisateurs :")
display(user_features_df.head())
print("\nFeatures produits :")
display(item_features_df.head())

Features utilisateurs :


Unnamed: 0,ClientID,Age,ClientGender,ClientSegment,ClientCountry,user_idx
0,39370740138294,22.0,F,LOYAL,GBR,200392
1,41168361649132,,F,LOYAL,FRA,92951
2,118552773044620,52.0,F,LOYAL,USA,247522
3,151055224517439,37.0,F,LOYAL,USA,293765
4,183937473412242,,F,LOYAL,AUS,157566



Features produits :


Unnamed: 0,ProductID,Category,FamilyLevel1,FamilyLevel2,Universe,Brand,item_idx
0,72931364288678,Hockey,Helmet,Bauer RE-AKT,Men,Bauer,17802
1,513428984548409,Handball,Shoes,Mizuno Wave Mirage,Men,Mizuno,7590
2,793119236319256,Tennis,Ball,Wilson US Open,Women,Wilson,12788
3,1406428649545374,Hockey,Helmet,CCM Fitlite,Men,CCM,2684
4,2765609736887881,Cycling,Helmet,Bell Super 3R MIPS,Women,Bell,11880


In [6]:
dataset = Dataset()

user_ids = user_features_df["ClientID"].astype(str).tolist()
item_ids = item_features_df["ProductID"].astype(str).tolist()

user_feature_list = []
for col in ["Age", "ClientGender", "ClientSegment", "ClientCountry"]:
    user_feature_list += [f"{col}:{val}" for val in user_features_df[col].unique() if pd.notnull(val)]

item_feature_list = []
for col in ["Category", "FamilyLevel1", "FamilyLevel2", "Universe", "Brand"]:
    item_feature_list += [f"{col}:{val}" for val in item_features_df[col].unique() if pd.notnull(val)]

dataset.fit(
    users=user_ids,
    items=item_ids,
    user_features=user_feature_list,
    item_features=item_feature_list
)

In [7]:
# Cellule 5 : Création du dataset LightFM
dataset = Dataset()

user_features = []
for col in ["Age", "ClientGender", "ClientSegment", "ClientCountry"]:
    unique_vals = user_features_df[col].dropna().unique()
    user_features.extend([f"{col}_{val}" for val in unique_vals])

item_features = []
for col in ["Category", "FamilyLevel1", "FamilyLevel2", "Universe", "Brand"]:
    unique_vals = item_features_df[col].dropna().unique()
    item_features.extend([f"{col}_{val}" for val in unique_vals])

dataset.fit(
    users=range(len(unique_users)),
    items=range(len(unique_items)),
    user_features=user_features,
    item_features=item_features
)

In [8]:
interactions = dataset.build_interactions(
    ((row.user_idx, row.item_idx, row.interaction) 
     for _, row in interactions_df.iterrows())
)[0]

# User features
user_features_dict = {}
for _, row in user_features_df.iterrows():
    features = []
    for col in ["Age", "ClientGender", "ClientSegment", "ClientCountry"]:
        if pd.notnull(row[col]):
            features.append(f"{col}_{row[col]}")
    user_features_dict[row.user_idx] = features

user_features_matrix = dataset.build_user_features(
    ((idx, feats) for idx, feats in user_features_dict.items()),
    normalize=False
)

# Item features
item_features_dict = {}
for _, row in item_features_df.iterrows():
    features = []
    for col in ["Category", "FamilyLevel1", "FamilyLevel2", "Universe", "Brand"]:
        if pd.notnull(row[col]):
            features.append(f"{col}_{row[col]}")
    item_features_dict[row.item_idx] = features

item_features_matrix = dataset.build_item_features(
    ((idx, feats) for idx, feats in item_features_dict.items()),
    normalize=False
)

print("Dimensions de la matrice d'interactions :", interactions.shape)
print("Dimensions de la matrice user features:", user_features_matrix.shape)
print("Dimensions de la matrice item features:", item_features_matrix.shape)

Dimensions de la matrice d'interactions : (298164, 20638)
Dimensions de la matrice user features: (298164, 298251)
Dimensions de la matrice item features: (20638, 20856)


In [9]:
model = LightFM(loss='warp', random_state=42)

model.fit(
    interactions,
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    epochs=20,
    num_threads=4
)

<lightfm.lightfm.LightFM at 0x7fd1d6d94940>

In [10]:
precisions = precision_at_k(
    model,
    interactions,
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    k=5
).mean()

auc = auc_score(
    model,
    interactions,
    user_features=user_features_matrix,
    item_features=item_features_matrix,
    num_threads=4
).mean()

print("Precision@5: {:.2f}".format(precisions))
print("AUC Score: {:.2f}".format(auc))

In [None]:
def recommend_for_user(client_id, model, n_items, user_features_matrix, item_features_matrix, 
                      user_id_map, reverse_item_map, user_features_df, data, k=10):
    if client_id in user_id_map:
        # Utilisateur existant
        user_idx = user_id_map[client_id]
        scores = model.predict(
            user_idx,
            np.arange(n_items),
            user_features=user_features_matrix,
            item_features=item_features_matrix
        )
    else:
        user_row = user_features_df[user_features_df["ClientID"] == client_id]
        if not user_row.empty:
            features = []
            for col in ["Age", "ClientGender", "ClientSegment", "ClientCountry"]:
                val = user_row.iloc[0][col]
                if pd.notnull(val):
                    features.append(col + "_" + str(val))
            
            user_features = dataset.build_user_features([(0, features)], normalize=False)
            scores = model.predict(
                0,
                np.arange(n_items),
                user_features=user_features,
                item_features=item_features_matrix
            )
        else:
            popularity = data.groupby('ProductID')['Quantity_sold'].sum()
            return popularity.sort_values(ascending=False).head(k)
    
    top_items = np.argsort(-scores)[:k]
    
    recommendations = pd.DataFrame({
        'ProductID': [reverse_item_map[idx] for idx in top_items],
        'score': scores[top_items]
    })
    
    return recommendations

In [20]:
print("Recommandations pour un utilisateur existant:")
recs = recommend_for_user(
    4388436561084682799, 
    model, 
    len(unique_items),
    user_features_matrix, 
    item_features_matrix,
    user_id_map,
    reverse_item_map,
    user_features_df,
    data
)
display(recs)

print("\nVérification des produits recommandés:")
for prod_id in recs['ProductID'].values[:3]:
    print(f"\nProduit {prod_id}:")
    display(data[data['ProductID'] == prod_id][['ProductID', 'Category', 'Brand']].drop_duplicates())

print("\nRecommandations pour un nouvel utilisateur avec données démo:")
new_user = pd.DataFrame({
    "ClientID": [1234567890],
    "Age": [18],
    "ClientGender": ["F"],
    "ClientSegment": ["LOYAL"],
    "ClientCountry": ["BRA"]
})
user_features_df = pd.concat([user_features_df, new_user], ignore_index=True)

recs = recommend_for_user(
    1234567890, 
    model, 
    len(unique_items),
    user_features_matrix, 
    item_features_matrix,
    user_id_map,
    reverse_item_map,
    user_features_df,
    data
)
display(recs)

Recommandations pour un utilisateur existant:


Unnamed: 0,ProductID,score
0,4552076028035911075,-593.677612
1,3472596138266960403,-593.800476
2,5797895370360313472,-593.866394
3,1611036289651325048,-593.912415
4,8275909935372681760,-593.914795
5,5308606775286696247,-593.976929
6,558212210493498317,-594.044006
7,8193836411099087368,-594.141846
8,7042499222040039915,-594.145264
9,3824742454205225546,-594.205566



Vérification des produits recommandés:

Produit 4552076028035911075:


Unnamed: 0,ProductID,Category,Brand
1588,4552076028035911075,Basketball,Under Armour



Produit 3472596138266960403:


Unnamed: 0,ProductID,Category,Brand
35,3472596138266960403,Rugby,Adidas



Produit 5797895370360313472:


Unnamed: 0,ProductID,Category,Brand
53,5797895370360313472,Rugby,Adidas



Recommandations pour un nouvel utilisateur avec données démo:


Unnamed: 0,ProductID,score
0,4552076028035911075,-663.911621
1,4978781552851483666,-663.994812
2,1611036289651325048,-664.028931
3,235494810757128011,-664.172729
4,8193836411099087368,-664.172974
5,3472596138266960403,-664.346924
6,5797895370360313472,-664.370483
7,1421634154572742367,-664.40509
8,5626151450577313519,-664.44751
9,5308606775286696247,-664.491394
