<a href="https://colab.research.google.com/github/popudrak/DSC-PJATK/blob/main/ABC_LightFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import json
from lightfm import LightFM
from lightfm.data import Dataset
from tqdm import tqdm

# ------------------------------------------
# Data loading
# ------------------------------------------
# Read the three CSV inputs in a fixed order: interactions for training,
# the users to predict for, and the raw item metadata.
train, test, metadata_raw = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "test.csv", "item_metadata_filtered.csv")
)

with open("id_mappings.json") as f:
    id_data = json.load(f)

id_map_dict = id_data["item_mapping"]

# Turn the parent_asin -> item id mapping into a two-column DataFrame
id_map_df = pd.DataFrame({
    "parent_asin": [*id_map_dict.keys()],
    "mapped_item_id": [*id_map_dict.values()],
})

# Join the metadata with the ID map; both join keys are cast to str first so
# that they compare equal regardless of how each source was parsed.
metadata_raw['parent_asin'] = metadata_raw['parent_asin'].astype(str)
id_map_df['parent_asin'] = id_map_df['parent_asin'].astype(str)
metadata = (
    pd.merge(metadata_raw, id_map_df, how='left', on='parent_asin')
    .dropna(subset=['mapped_item_id'])  # drop metadata rows with no mapped id
    .copy()
)
metadata['item_id'] = metadata['mapped_item_id'].astype(int)

# ------------------------------------------
# Map users and items to contiguous integer indices
# ------------------------------------------
# Indices are assigned in order of first appearance in `train`.
user_ids = train['user_id'].unique()
item_ids = train['item_id'].unique()
user_id_map = {uid: i for i, uid in enumerate(user_ids)}
item_id_map = {iid: i for i, iid in enumerate(item_ids)}
item_id_reverse_map = {i: iid for iid, i in item_id_map.items()}

train['user_idx'] = train['user_id'].map(user_id_map)
train['item_idx'] = train['item_id'].map(item_id_map)

# Map item_id in metadata to item_idx; items absent from `train` map to NaN
# and are dropped.
metadata['item_idx'] = metadata['item_id'].map(item_id_map)
# Take an explicit copy of the filtered frame: columns are assigned to it
# right below (and later in the script), and assigning into a filtered slice
# triggers pandas' SettingWithCopyWarning.
metadata = metadata[metadata['item_idx'].notnull()].copy()
metadata['item_idx'] = metadata['item_idx'].astype(int)

# ------------------------------------------
# Item feature preparation
# ------------------------------------------
metadata['main_category'] = metadata['main_category'].fillna("Unknown")
metadata['category'] = metadata['category'].fillna("Unknown")
# Record missingness before the placeholder overwrites it.
metadata['store_missing'] = metadata['store'].isnull()
metadata['store'] = metadata['store'].fillna("Unknown")
metadata['price_available'] = metadata['price'].notnull()
# A description counts as available only if it is a non-blank string.
# The isinstance check matters: a missing value is NaN (a truthy float with
# no .strip()), which would otherwise raise AttributeError.
metadata['description_available'] = metadata['description'].apply(
    lambda x: isinstance(x, str) and bool(x.strip())
)
metadata['price_missing'] = metadata['price'].isnull()
metadata['price_filled'] = metadata['price'].fillna(-1)
# Quantile-bin only the known prices; the assignment aligns on the index, so
# rows with a missing price get NaN in price_bin.
metadata['price_bin'] = pd.qcut(metadata.loc[~metadata['price_missing'], 'price'], q=5, labels=False, duplicates='drop')
# Intervals are (0, 2], (2, 3], (3, 4], (4, 5]; a missing or out-of-range
# rating yields NaN.
metadata['rating_bin'] = pd.cut(metadata['average_rating'], bins=[0, 2, 3, 4, 5], labels=False)
metadata['rating_number_missing'] = metadata['rating_number'].isnull()
metadata['rating_number_filled'] = metadata['rating_number'].fillna(-1)
# Equal-width bins over log1p of the known rating counts; NaN where missing.
metadata['rating_number_log_bin'] = pd.cut(np.log1p(metadata.loc[~metadata['rating_number_missing'], 'rating_number']), bins=5, labels=False)

# Item popularity: each item's share of the training interactions
item_popularity = train['item_id'].value_counts(normalize=True).to_dict()
metadata['popularity_score'] = metadata['item_id'].map(item_popularity).fillna(1e-6)

if metadata['popularity_score'].nunique() > 1:
    # Rank first (ties broken by order) so qcut sees distinct values.
    metadata['popularity_bin'] = pd.qcut(metadata['popularity_score'].rank(method='first'), q=5, labels=False, duplicates='drop')
else:
    # All items equally popular: a single bin.
    metadata['popularity_bin'] = 0

add_popularity_feature = True

# ------------------------------------------
# Full vocabulary of item features
# ------------------------------------------
# Every feature name that may be attached to an item: one entry per observed
# category / subcategory / store value, the boolean flags, and one entry per
# possible bin index.
item_features_list = [
    *('category:' + name for name in metadata['main_category'].unique()),
    *('subcategory:' + name for name in metadata['category'].unique()),
    *('store:' + name for name in metadata['store'].unique()),
    'has_images',
    'price_available',
    'description_available',
    'price_missing',
    'rating_number_missing',
    'store_missing',
    *(f'price_bin:{k}' for k in range(5)),
    *(f'rating_bin:{k}' for k in range(4)),
    *(f'rating_number_log_bin:{k}' for k in range(5)),
    *(f'popularity_bin:{k}' for k in range(5)),
]

# ------------------------------------------
# Prepare the data for LightFM
# ------------------------------------------
# Register users, items and the item-feature vocabulary in a single call.
dataset = Dataset()
dataset.fit(users=user_ids, items=item_ids, item_features=item_features_list)

# Stream (user_id, item_id, rating) triples column-wise. Unlike iterrows(),
# zip over the columns does not build a Series per row and does not upcast
# the integer IDs to float when the rating column is float.
(interactions, _) = dataset.build_interactions(
    zip(train['user_id'], train['item_id'], train['rating'])
)

# Function that generates the item features
def build_item_features(metadata):
    """Build the per-item feature lists from the prepared metadata frame.

    Args:
        metadata: DataFrame with one row per item and the columns
            ``item_id``, ``main_category``, ``category``, ``store``,
            ``store_missing``, ``has_images``, ``price_available``,
            ``description_available``, ``price_missing``, ``price_bin``,
            ``rating_bin``, ``rating_number_missing``,
            ``rating_number_log_bin`` and ``popularity_bin``.

    Returns:
        A list of ``(item_id, [feature_name, ...])`` tuples, one per row, in
        row order. A bin feature is omitted when its bin value is NaN
        (for example an item without a usable average rating) instead of
        raising on ``int(NaN)``.
    """
    # Boolean columns whose name doubles as the feature name, in emit order.
    flag_columns = (
        'store_missing',
        'has_images',
        'price_available',
        'description_available',
    )

    def add_bin(feats, prefix, value):
        # Bins come from pd.cut / pd.qcut and are NaN for rows that fell
        # outside every interval; NaN cannot be converted to int.
        if pd.notna(value):
            feats.append(f'{prefix}:{int(value)}')

    features = []
    for _, row in metadata.iterrows():
        feats = [
            'category:' + row['main_category'],
            'subcategory:' + row['category'],
            'store:' + row['store'],
        ]
        feats.extend(flag for flag in flag_columns if row[flag])

        if row['price_missing']:
            feats.append('price_missing')
        else:
            add_bin(feats, 'price_bin', row['price_bin'])

        add_bin(feats, 'rating_bin', row['rating_bin'])

        if row['rating_number_missing']:
            feats.append('rating_number_missing')
        else:
            add_bin(feats, 'rating_number_log_bin', row['rating_number_log_bin'])

        add_bin(feats, 'popularity_bin', row['popularity_bin'])

        features.append((row['item_id'], feats))
    return features

# Convert the (item_id, feature list) pairs into the item-feature matrix
# built by the fitted Dataset.
item_feature_rows = build_item_features(metadata)
item_features = dataset.build_item_features(item_feature_rows)

# ------------------------------------------
# Model training
# ------------------------------------------
model = LightFM(
    loss='warp',
    no_components=256,
    item_alpha=1e-6,
    user_alpha=1e-6,
    random_state=42,
)
model.fit(
    interactions,
    item_features=item_features,
    epochs=50,
    num_threads=8,
)

# ------------------------------------------
# Prediction
# ------------------------------------------
top_k = 10  # number of items recommended per user

# Items each known user already interacted with (excluded from their list).
user_seen_items = train.groupby('user_idx')['item_idx'].apply(set).to_dict()
all_item_indices = np.arange(len(item_id_map))

submission = []

# Cold-start scores: rate every item against the "average" user.
# LightFM returns (biases, embeddings) in that order from both
# get_user_representations and get_item_representations.
user_biases, user_embeddings = model.get_user_representations()
global_user_embedding = np.mean(user_embeddings, axis=0)
global_user_bias = np.mean(user_biases)
item_biases, item_embeddings = model.get_item_representations(features=item_features)
cold_scores = item_embeddings.dot(global_user_embedding) + item_biases + global_user_bias

# Restrict the cold-start list to items in popularity bins 0-2; items without
# a metadata row default to bin 4 and are therefore excluded.
item_idx_to_popularity = metadata.set_index('item_idx')['popularity_bin'].to_dict()
low_popularity_items = [i for i in range(len(item_id_map)) if item_idx_to_popularity.get(i, 4) <= 2]

if len(low_popularity_items) >= top_k:
    low_pop_scores = cold_scores[low_popularity_items]
    top_indices_in_low_pop = np.argsort(-low_pop_scores)[:top_k]
    selected_item_idxs = [low_popularity_items[i] for i in top_indices_in_low_pop]
else:
    # Too few low-popularity candidates: fall back to the global ranking.
    selected_item_idxs = np.argsort(-cold_scores)[:top_k]

cold_top_items = [item_id_reverse_map[i] for i in selected_item_idxs]
cold_items_str = ' '.join(map(str, cold_top_items))

for uid in tqdm(test['user_id'].values):
    # Users absent from train all receive the same cold-start list.
    if uid not in user_id_map:
        submission.append((uid, cold_items_str))
        continue

    uidx = user_id_map[uid]
    seen = user_seen_items.get(uidx, set())
    user_ids_array = np.repeat(uidx, len(all_item_indices))

    scores = model.predict(user_ids_array, all_item_indices, item_features=item_features)

    # Walk the ranking best-first and stop once top_k unseen items are
    # collected, instead of filtering the whole catalogue for every user.
    top_idxs = []
    for i in np.argsort(-scores):
        if i in seen:
            continue
        top_idxs.append(i)
        if len(top_idxs) == top_k:
            break
    top_items = [item_id_reverse_map[i] for i in top_idxs]

    submission.append((uid, ' '.join(map(str, top_items))))

submission_df = pd.DataFrame(submission, columns=['user_id', 'predictions'])
submission_df.to_csv("submission_kaggle.csv", index=False)