In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: the user and catalog setup below used to sit inside a triple-quoted
# string, so `df_users` and `df_products` only existed when an earlier kernel
# session had left them behind. They are live code again so this cell
# survives Restart Kernel -> Run All.

# Load your user metadata
df_users = pd.read_csv("UserData.csv")
df_users['user_id'] = df_users['guid'].astype('category').cat.codes

# Simulate product catalog
products = {
    'item_id': [101, 102, 103, 104, 105],
    'description': [
        "yoga mat non-slip fitness",
        "protein bar organic vegan",
        "home gym treadmill exercise",
        "smartwatch health fitness",
        "running shoes lightweight"
    ]
}
df_products = pd.DataFrame(products)

# Interactions are read from disk; the random "two liked items per user"
# simulation that previously produced them is no longer part of this cell.
# NOTE(review): confirm that UserItemData.csv uses the same integer user_id
# coding as df_users and item ids from the catalog above.
df_interact = pd.read_csv('../data/UserItemData.csv')

# TF-IDF over product descriptions: one row per catalog item, in catalog order
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_products['description'])
# Item-by-item similarity; row i / column j follow the row order of df_products
cos_sim = cosine_similarity(tfidf_matrix)

# Recommendation function
def recommend(user_id, k=5):
    """Recommend up to ``k`` unseen catalog items for ``user_id``.

    Every item the user interacted with (rows of the global ``df_interact``)
    contributes its row of the global ``cos_sim`` matrix; the summed row is
    the score of each item in the global ``df_products`` catalog.

    Args:
        user_id: Value matched against ``df_interact['user_id']``.
        k: Maximum number of recommendations to return.

    Returns:
        list: ``item_id`` values ordered by descending score (ties keep
        catalog order). Items the user already interacted with are never
        returned, so the list is shorter than ``k`` when fewer unseen items
        exist. Empty when the user has no interaction with a catalog item.
    """
    # item_id -> row position in df_products (which is also the row/column
    # position in cos_sim). Positions rather than index labels are used so a
    # filtered or re-indexed catalog still lines up with the matrix.
    position_of = {}
    for pos, item_id in enumerate(df_products['item_id'].tolist()):
        position_of.setdefault(item_id, pos)  # first occurrence wins

    rated_items = df_interact.loc[df_interact['user_id'] == user_id, 'item_id'].tolist()
    # Interactions with items missing from the catalog carry no similarity info.
    rated_pos = [position_of[item] for item in rated_items if item in position_of]
    if not rated_pos:
        return []

    scores = np.zeros(len(df_products), dtype=float)
    for pos in rated_pos:
        scores += cos_sim[pos]

    # Mask with -inf, not 0: unrelated items also score 0, so a 0 mask lets
    # already-rated items come back as "recommendations".
    scores[rated_pos] = -np.inf

    n_candidates = len(scores) - len(set(rated_pos))
    n_out = max(0, min(k, n_candidates))
    ranked = np.argsort(-scores, kind='stable')[:n_out]
    return df_products['item_id'].to_numpy()[ranked].tolist()

# Evaluate (leave-one-out)
# Scoring recommendations against the same interactions that built the user
# profile is circular, so the last distinct item of every user is held out as
# ground truth and `recommend` only sees the remaining interactions.
interactions_full = df_interact
unique_pairs = interactions_full.drop_duplicates(subset=['user_id', 'item_id'])
is_heldout = ~unique_pairs.duplicated(subset='user_id', keep='last')
heldout = unique_pairs[is_heldout]

y_true, y_pred = [], []
recall_5, recall_20 = [], []

# `recommend` reads the global df_interact, so point it at the profile split
# for the duration of the loop and always restore the full table afterwards.
df_interact = unique_pairs[~is_heldout]
try:
    for user_id, held_items in heldout.groupby('user_id', sort=False)['item_id']:
        true_items = held_items.tolist()
        top5 = recommend(user_id, k=5)
        top20 = recommend(user_id, k=20)
        if not top20:
            # Nothing left to build a profile from (e.g. a single interaction).
            continue

        # Recall
        recall_5.append(len(set(true_items) & set(top5)) / len(true_items))
        recall_20.append(len(set(true_items) & set(top20)) / len(true_items))

        # Binary prediction for RMSE: every held-out item is a positive (1);
        # the prediction is 1 only when it made the top 5.
        for item in true_items:
            y_true.append(1)
            y_pred.append(1 if item in top5 else 0)
finally:
    df_interact = interactions_full

# Metrics
if y_true:
    rmse = root_mean_squared_error(y_true, y_pred)
    print(f"RMSE: {rmse:.4f}")
    print(f"Recall@5: {np.mean(recall_5):.4f}")
    print(f"Recall@20: {np.mean(recall_20):.4f}")
else:
    print("No user has enough interactions to evaluate.")


RMSE: 0.0000
Recall@5: 1.0000
Recall@20: 1.0000


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import root_mean_squared_error, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# ---------- Simulated Sample Dataset ----------
# Toy catalog as (item_id, description) pairs, one per product.
catalog = [
    (101, "yoga mat non-slip fitness gym"),
    (102, "protein bar organic energy"),
    (103, "home gym treadmill workout"),
    (104, "smartwatch fitness tracker health"),
    (105, "lightweight running shoes sport"),
]
items = [item_id for item_id, _ in catalog]
descriptions = [text for _, text in catalog]

# Items each user liked; every like becomes a (user, item, rating=1) triple.
liked_by_user = {
    'U1': [101, 102],
    'U2': [101, 103],
    'U3': [104, 105],
    'U4': [102, 104],
}
users = list(liked_by_user)
interactions = [
    (user, item_id, 1)
    for user, liked in liked_by_user.items()
    for item_id in liked
]

df_items = pd.DataFrame({'item_id': items, 'description': descriptions})
df_interact = pd.DataFrame(interactions, columns=['user_id', 'item_id', 'rating'])

# ---------- TF-IDF Encoding ----------
# One TF-IDF row per catalog item, in the same order as the rows of df_items.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_items.loc[:, 'description'])
# Item-by-item cosine similarity: entry (i, j) compares item i with item j.
cosine_sim = cosine_similarity(X=tfidf_matrix)

# ---------- Create Recommendation Function ----------
def recommend_tfidf(user_id, top_k=10):
    """Recommend up to ``top_k`` unseen catalog items for ``user_id``.

    Every item the user interacted with (rows of the global ``df_interact``)
    contributes its row of the global ``cosine_sim`` matrix; the summed row
    is the score of each item in the global ``df_items`` catalog.

    Args:
        user_id: Value matched against ``df_interact['user_id']``.
        top_k: Maximum number of recommendations to return.

    Returns:
        list: ``item_id`` values ordered by descending score (ties keep
        catalog order). Items the user already interacted with are never
        returned, so the list is shorter than ``top_k`` when fewer unseen
        items exist. Empty when the user has no interaction with a catalog
        item.
    """
    # item_id -> row position in df_items (which is also the row/column
    # position in cosine_sim). Positions rather than index labels are used so
    # a filtered or re-indexed catalog still lines up with the matrix.
    position_of = {}
    for pos, item_id in enumerate(df_items['item_id'].tolist()):
        position_of.setdefault(item_id, pos)  # first occurrence wins

    rated_items = df_interact.loc[df_interact['user_id'] == user_id, 'item_id'].tolist()
    # Interactions with items missing from the catalog carry no similarity info.
    rated_pos = [position_of[item] for item in rated_items if item in position_of]
    if not rated_pos:
        return []

    scores = np.zeros(len(df_items), dtype=float)
    for pos in rated_pos:
        scores += cosine_sim[pos]

    # Mask with -inf, not 0: unrelated items also score 0, so a 0 mask lets
    # already-known items come back as "recommendations".
    scores[rated_pos] = -np.inf

    n_candidates = len(scores) - len(set(rated_pos))
    n_out = max(0, min(top_k, n_candidates))
    ranked = np.argsort(-scores, kind='stable')[:n_out]
    return df_items['item_id'].to_numpy()[ranked].tolist()

# ---------- Evaluation (leave-one-out) ----------
# Scoring recommendations against the same interactions that built the user
# profile is circular, so the last distinct item of every user is held out as
# ground truth and `recommend_tfidf` only sees the remaining interactions.
interactions_full = df_interact
unique_pairs = interactions_full.drop_duplicates(subset=['user_id', 'item_id'])
is_heldout = ~unique_pairs.duplicated(subset='user_id', keep='last')
heldout = unique_pairs[is_heldout]

y_true_all = []
y_pred_all = []
precision_list = []
recall_list = []

# `recommend_tfidf` reads the global df_interact, so point it at the profile
# split for the duration of the loop and always restore the full table.
df_interact = unique_pairs[~is_heldout]
try:
    for user, held_items in heldout.groupby('user_id', sort=False)['item_id']:
        true_items = held_items.tolist()
        rec_items = recommend_tfidf(user, top_k=10)
        if not rec_items:
            # Nothing left to build a profile from (e.g. a single interaction).
            continue

        # Every recommended item is a predicted positive; it is a true
        # positive only when it is the held-out item.
        y_true = [1 if item in true_items else 0 for item in rec_items]
        y_pred = [1] * len(y_true)

        y_true_all += y_true
        y_pred_all += y_pred

        # Per-user metrics
        intersection = len(set(true_items) & set(rec_items))
        precision_list.append(intersection / len(rec_items))
        recall_list.append(intersection / len(true_items))
finally:
    df_interact = interactions_full

# ---------- Metrics ----------
if y_true_all:
    rmse = root_mean_squared_error(y_true_all, y_pred_all)
    precision_at_10 = np.mean(precision_list)
    recall_at_10 = np.mean(recall_list)

    print(f"TF-IDF Results:")
    print(f"RMSE: {rmse:.4f}")
    print(f"Precision@10: {precision_at_10:.4f}")
    print(f"Recall@10: {recall_at_10:.4f}")
else:
    print("TF-IDF Results: no user has enough interactions to evaluate.")


TF-IDF Results:
RMSE: 0.7746
Precision@10: 0.4000
Recall@10: 1.0000


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import root_mean_squared_error

# Load user metadata
# Each distinct guid is mapped to an integer category code stored as `user_id`.
df_users = pd.read_csv("UserData.csv").assign(
    user_id=lambda frame: frame['guid'].astype('category').cat.codes
)

# Load item data
# The row label doubles as the numeric `item_id`.
df_items = pd.read_csv("BigBasket Products.csv").assign(
    item_id=lambda frame: frame.index
)

# Column holding the text fed to TF-IDF (change if yours is named differently)
desc_col = 'product'

# Simulate user-item interactions
# One seeded draw of two distinct items per row of df_users, each recorded as
# a like (rating 1).
np.random.seed(42)
interactions = [
    {'user_id': user_id, 'item_id': item, 'rating': 1}
    for user_id in df_users['user_id']
    for item in np.random.choice(df_items['item_id'], size=2, replace=False)
]
df_interact = pd.DataFrame(interactions)

# TF-IDF vectorization on product descriptions
vectorizer = TfidfVectorizer(stop_words='english')
item_text = df_items[desc_col].astype(str)
tfidf_matrix = vectorizer.fit_transform(item_text)
# Item-by-item similarity over the whole catalog.
# NOTE(review): this is a square matrix with one row and column per product;
# check that it fits in memory for the full catalog.
cosine_sim = cosine_similarity(X=tfidf_matrix)

# Recommendation function
def recommend_tfidf(user_id, top_k=10):
    """Recommend up to ``top_k`` unseen catalog items for ``user_id``.

    Every item the user interacted with (rows of the global ``df_interact``)
    contributes its row of the global ``cosine_sim`` matrix; the summed row
    is the score of each item in the global ``df_items`` catalog.

    Args:
        user_id: Value matched against ``df_interact['user_id']``.
        top_k: Maximum number of recommendations to return.

    Returns:
        list: ``item_id`` values ordered by descending score (ties keep
        catalog order). Items the user already interacted with are never
        returned, so the list is shorter than ``top_k`` when fewer unseen
        items exist. Empty when the user has no interaction with a catalog
        item.
    """
    # item_id -> row position in df_items (which is also the row/column
    # position in cosine_sim). Positions rather than index labels are used so
    # a filtered or re-indexed catalog still lines up with the matrix.
    position_of = {}
    for pos, item_id in enumerate(df_items['item_id'].tolist()):
        position_of.setdefault(item_id, pos)  # first occurrence wins

    rated_items = df_interact.loc[df_interact['user_id'] == user_id, 'item_id'].tolist()
    # Interactions with items missing from the catalog carry no similarity info.
    rated_pos = [position_of[item] for item in rated_items if item in position_of]
    if not rated_pos:
        return []

    scores = np.zeros(len(df_items), dtype=float)
    for pos in rated_pos:
        scores += cosine_sim[pos]

    # Mask with -inf, not 0: unrelated items also score 0, so a 0 mask lets
    # already-known items come back as "recommendations".
    scores[rated_pos] = -np.inf

    n_candidates = len(scores) - len(set(rated_pos))
    n_out = max(0, min(top_k, n_candidates))
    ranked = np.argsort(-scores, kind='stable')[:n_out]
    return df_items['item_id'].to_numpy()[ranked].tolist()

# Evaluate RMSE, Precision@10, Recall@10 (leave-one-out)
# Scoring recommendations against the same interactions that built the user
# profile is circular, so the last distinct item of every user is held out as
# ground truth and `recommend_tfidf` only sees the remaining interactions.
interactions_full = df_interact
unique_pairs = interactions_full.drop_duplicates(subset=['user_id', 'item_id'])
is_heldout = ~unique_pairs.duplicated(subset='user_id', keep='last')
heldout = unique_pairs[is_heldout]

y_true_all, y_pred_all = [], []
precision_list, recall_list = [], []

# `recommend_tfidf` reads the global df_interact, so point it at the profile
# split for the duration of the loop and always restore the full table.
df_interact = unique_pairs[~is_heldout]
try:
    # One groupby pass yields each user's held-out items, instead of
    # re-filtering the whole interaction table once per user.
    for user, held_items in heldout.groupby('user_id', sort=False)['item_id']:
        true_items = held_items.tolist()
        rec_items = recommend_tfidf(user, top_k=10)
        if not rec_items:
            # Nothing left to build a profile from (e.g. a single interaction).
            continue

        # Every recommended item is a predicted positive; it is a true
        # positive only when it is the held-out item.
        y_true = [1 if item in true_items else 0 for item in rec_items]
        y_pred = [1] * len(y_true)

        y_true_all += y_true
        y_pred_all += y_pred

        intersection = len(set(true_items) & set(rec_items))
        precision_list.append(intersection / len(rec_items))
        recall_list.append(intersection / len(true_items))
finally:
    df_interact = interactions_full

# Output metrics
if y_true_all:
    rmse = root_mean_squared_error(y_true_all, y_pred_all)
    precision_at_10 = np.mean(precision_list)
    recall_at_10 = np.mean(recall_list)

    print(f"\nTF-IDF Recommendation Results:")
    print(f"RMSE: {rmse:.4f}")
    print(f"Precision@10: {precision_at_10:.4f}")
    print(f"Recall@10: {recall_at_10:.4f}")
else:
    print("\nTF-IDF Recommendation Results: no user has enough interactions to evaluate.")


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from collections import defaultdict

# ✅ Load and process UserData.csv
df_users = pd.read_csv("UserData.csv")
# NOTE(review): a missing guid would receive category code -1, which is not a
# valid embedding index — confirm the column has no nulls.
df_users['user_id'] = df_users['guid'].astype("category").cat.codes

# ✅ Load and process BigBasket Products.csv
df_items = pd.read_csv("BigBasket Products.csv")
df_items['item_id'] = df_items.index  # Create numeric ID

# ✅ Simulate binary interactions (likes)
interactions = []
np.random.seed(42)
for user_id in df_users['user_id']:
    liked = np.random.choice(df_items['item_id'], size=2, replace=False)
    for item in liked:
        interactions.append({'user_id': user_id, 'item_id': item, 'rating': 1})
df = pd.DataFrame(interactions)  # positive interactions (likes) only

# ✅ Embedding table sizes
# The ids are fed to the embedding layers as row indices, so each table needs
# max id + 1 rows. Counting the distinct ids that happened to be sampled
# undersizes the item table whenever some catalog item was never drawn.
num_users = int(df_users['user_id'].max()) + 1
num_items = int(df_items['item_id'].max()) + 1

# ✅ Negative sampling
# With every label equal to 1 the network can only learn to output a constant.
# Pair each like with one randomly drawn item labelled 0, then drop draws that
# collide with a known like or repeat an earlier draw.
neg_rng = np.random.default_rng(42)
negatives = pd.DataFrame({
    'user_id': df['user_id'].to_numpy(),
    'item_id': neg_rng.choice(df_items['item_id'].to_numpy(), size=len(df)),
    'rating': 0,
})
liked_pairs = pd.MultiIndex.from_frame(df[['user_id', 'item_id']])
drawn_pairs = pd.MultiIndex.from_frame(negatives[['user_id', 'item_id']])
negatives = negatives[~drawn_pairs.isin(liked_pairs)]
negatives = negatives.drop_duplicates(subset=['user_id', 'item_id'])
samples = pd.concat([df, negatives], ignore_index=True)

# ✅ Train-test split (stratified so both splits keep the like/non-like ratio)
train, test = train_test_split(
    samples, test_size=0.2, random_state=42, stratify=samples['rating']
)

# ✅ Convert to arrays for TensorFlow
X_train_user = train['user_id'].to_numpy()
X_train_item = train['item_id'].to_numpy()
y_train = train['rating'].to_numpy(dtype='float32')

X_test_user = test['user_id'].to_numpy()
X_test_item = test['item_id'].to_numpy()
y_test = test['rating'].to_numpy(dtype='float32')

# ✅ Build the NCF model with GPU context
with tf.device('/GPU:0'):
    layers = tf.keras.layers

    # One integer id per sample for each of the two inputs.
    user_input = layers.Input(shape=(1,))
    item_input = layers.Input(shape=(1,))

    # 32-dimensional learned vector per user id / item id.
    user_embedding = layers.Embedding(input_dim=num_users, output_dim=32)
    item_embedding = layers.Embedding(input_dim=num_items, output_dim=32)
    user_vec = layers.Flatten()(user_embedding(user_input))
    item_vec = layers.Flatten()(item_embedding(item_input))

    # Concatenated embeddings -> two ReLU layers -> sigmoid score.
    x = layers.Concatenate()([user_vec, item_vec])
    for units in (64, 32):
        x = layers.Dense(units, activation='relu')(x)
    output = layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=[user_input, item_input], outputs=output)
    model.compile(optimizer='adam', loss='mse')

# ✅ Train
# 10 epochs, batches of 64, with 10% of the training arrays used for validation.
train_inputs = [X_train_user, X_train_item]
model.fit(
    train_inputs,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
    verbose=1,
)

# ✅ RMSE on test
test_inputs = [X_test_user, X_test_item]
test_preds = model.predict(test_inputs, verbose=0)
rmse = root_mean_squared_error(y_test, test_preds)
print(f"\n✅ NCF RMSE: {rmse:.4f}")

# ✅ Precision@10 and Recall@10
TOP_K = 10
# Each evaluated user needs one model pass over the whole catalog, so only a
# seeded sample of users is ranked. Set to None to evaluate every user.
MAX_EVAL_USERS = 1000

# Ground truth is the liked items in the held-out split. Items liked in the
# training split are removed from the candidates so the model is not credited
# for repeating what it was trained on.
test_likes = test.loc[test['rating'] == 1].groupby('user_id')['item_id'].apply(set)
train_likes = train.loc[train['rating'] == 1].groupby('user_id')['item_id'].apply(set)

eval_users = test_likes.index.to_numpy()
if MAX_EVAL_USERS is not None and len(eval_users) > MAX_EVAL_USERS:
    eval_users = np.random.default_rng(42).choice(
        eval_users, size=MAX_EVAL_USERS, replace=False
    )

item_ids = df_items['item_id'].to_numpy()
all_preds = defaultdict(list)
for user in eval_users:
    user_col = np.full(len(item_ids), user)
    preds = model.predict([user_col, item_ids], batch_size=4096, verbose=0).flatten()
    seen = train_likes.get(user, set())
    if seen:
        preds[np.isin(item_ids, list(seen))] = -np.inf  # never re-recommend training likes
    top_items = np.argsort(preds)[::-1][:TOP_K]
    all_preds[user] = item_ids[top_items].tolist()

precision_list, recall_list = [], []
for user in eval_users:
    true_items = test_likes.loc[user]
    intersection = len(true_items.intersection(all_preds[user]))
    precision_list.append(intersection / TOP_K)
    recall_list.append(intersection / len(true_items))

if precision_list:
    print(f"✅ Precision@{TOP_K}: {np.mean(precision_list):.4f}")
    print(f"✅ Recall@{TOP_K}: {np.mean(recall_list):.4f}")
else:
    print("No held-out likes to evaluate.")


Epoch 1/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - loss: 0.0072 - val_loss: 6.9257e-08
Epoch 2/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - loss: 3.2343e-08 - val_loss: 1.7328e-09
Epoch 3/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - loss: 8.7595e-10 - val_loss: 9.7709e-11
Epoch 4/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - loss: 6.0205e-11 - val_loss: 2.2887e-11
Epoch 5/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 4ms/step - loss: 1.7463e-11 - val_loss: 1.1870e-11
Epoch 6/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4ms/step - loss: 9.7570e-12 - val_loss: 7.9662e-12
Epoch 7/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 4ms/step - loss: 6.7711e-12 - val_loss: 5.9796e-12
Epoch 8/10
[1m7420/7420[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 4ms/ste