# Project 3 - Supervised Top-N Ranking

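This notebook is self-contained and uses synthetic data: it generates synthetic user–item interactions, trains a gradient-boosting regressor to predict relevance, and uses the predicted scores to recommend the top-N items for a user.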


In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, r2_score
import pickle
import matplotlib.pyplot as plt
# Seed NumPy's global RNG so the np.random draws that build the synthetic
# data in the following cells repeat from run to run.
np.random.seed(42)
# Marker that the import/setup cell finished running.
print('Imports ready')


In [None]:

# Synthetic interactions for ranking
#
# Builds three frames:
#   users - one row per user with a random segment label (A/B/C)
#   items - one row per item with a random cuisine and a price clipped to [3, 60]
#   inter - 40 sampled items per user, each with a noisy relevance in [1, 5]
#
# The order of the np.random / .sample calls is kept stable on purpose: the
# generated data depends on the sequence of random draws.
n_users = 500
n_items = 180

segment_labels = np.random.choice(['A', 'B', 'C'], n_users)
users = pd.DataFrame({
    'user_id': np.arange(1, n_users + 1),
    'segment': segment_labels,
})

# Cuisine is drawn before price, matching the original draw order.
cuisine_labels = np.random.choice(
    ['Pizza', 'Burger', 'Sushi', 'Indian', 'Mexican', 'Salad', 'Dessert'],
    n_items,
)
price_values = np.round(np.random.normal(12, 4, n_items).clip(3, 60), 2)
items = pd.DataFrame({
    'item_id': np.arange(1, n_items + 1),
    'cuisine': cuisine_labels,
    'price': price_values,
})

# The set of cuisines does not change inside the loop, so compute it once.
cuisine_pool = items['cuisine'].unique()

rows = []
for uid in users['user_id']:
    # Each user gets one favourite cuisine and a random shortlist of 40 items.
    favourite = np.random.choice(cuisine_pool)
    shortlist = items.sample(40)
    for item in shortlist.itertuples(index=False):
        # Relevance = base 3, +1.5 for the favourite cuisine, a small price
        # penalty, plus Gaussian noise; finally clipped to the 1-5 scale.
        bonus = 1.5 if item.cuisine == favourite else 0.0
        noise = np.random.normal(0, 0.6)
        raw_relevance = 3 + bonus - 0.01 * item.price + noise
        rows.append({
            'user_id': uid,
            'item_id': item.item_id,
            'relevance': np.clip(raw_relevance, 1, 5),
        })

inter = pd.DataFrame(rows)
inter.head()


In [None]:

# Features and model training (regressor for relevance)
#
# Join each interaction with its user and item attributes, one-hot encode the
# two categorical columns (dropping the first level of each as the baseline),
# and fit a gradient-boosting regressor that predicts relevance.
df = inter.merge(users, on='user_id').merge(items, on='item_id')
df = pd.get_dummies(df, columns=['cuisine', 'segment'], drop_first=True)

# Identifiers and the target are not model inputs.
features = [c for c in df.columns if c not in ['user_id', 'item_id', 'relevance']]
X = df[features]
y = df['relevance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
gbr = GradientBoostingRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so passing it raises
# TypeError on current releases. The square root gives the same number on
# every version.
rmse = np.sqrt(mean_squared_error(y_test, gbr.predict(X_test)))
print('Rank model RMSE:', rmse)
# recommend function
def recommend_topN(user_id, top_n=10):
    """Score every item for one user and return the highest-scoring ones.

    Parameters
    ----------
    user_id : value compared against ``users['user_id']``
        The user to recommend for. If it is not present in ``users``, a
        randomly sampled user is used instead.
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with columns ``item_id``, ``cuisine``, ``price``
        and ``score``, sorted by ``score`` descending.

    Notes
    -----
    Reads the module-level ``users``, ``items``, ``X`` (training feature
    frame, used for its column layout) and ``gbr`` (fitted model).
    """
    is_known = users['user_id'] == user_id
    user = users[is_known] if is_known.any() else users.sample(1)

    # One candidate row per item, all tagged with the chosen user so the
    # user's attributes can be joined on.
    cand = items.copy()
    cand['user_id'] = user['user_id'].iloc[0]
    merged = cand.merge(user, on='user_id', how='left')

    # Encode WITHOUT drop_first and then align to the training columns.
    # With drop_first=True here, the single user's segment is the only level
    # present, so its dummy would be dropped and back-filled with 0, scoring
    # every user as the baseline segment. Reindexing against X.columns drops
    # the baseline dummies and zero-fills any level absent from this frame.
    encoded = pd.get_dummies(merged, columns=['cuisine', 'segment'])
    design = encoded.reindex(columns=X.columns, fill_value=0)

    # Attach scores to the un-encoded frame so 'cuisine' is still available
    # for the returned columns (get_dummies removes the original column).
    merged['score'] = gbr.predict(design)
    ranked = merged.sort_values('score', ascending=False).head(top_n)
    return ranked[['item_id', 'cuisine', 'price', 'score']]
# Demo: recommend 8 items for one randomly drawn user.
demo_user_id = users['user_id'].sample(1).iloc[0]
demo_recs = recommend_topN(demo_user_id, top_n=8)
print(demo_recs)

# Persist the fitted ranking model next to the notebook.
with open('project3_rank_gbr.pkl', 'wb') as model_file:
    pickle.dump(gbr, model_file)
print('Saved project3_rank_gbr.pkl')
