<a href="https://colab.research.google.com/github/sinacipher/food-recommendation-suite/blob/main/Reorder_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import os
import zipfile
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the Instacart market-basket dataset through kagglehub. `path` is the
# location returned by the download call; later cells list and read files
# relative to it.
print("Downloading dataset...")
path = kagglehub.dataset_download(
    "psparks/instacart-market-basket-analysis"
)
print(f"Dataset downloaded to: {path}")

In [None]:
# Gather every dataset table into one working directory (`data_dir`) so the
# loading cell can read all CSVs from a single place.
files = os.listdir(path)
csv_files = [f for f in files if f.endswith('.csv')]
zip_files = [f for f in files if f.endswith('.zip')]

data_dir = os.path.join(path, 'extracted')
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(data_dir, exist_ok=True)

# Unpack any zipped tables straight into the working directory.
for z in zip_files:
    with zipfile.ZipFile(os.path.join(path, z), 'r') as zip_ref:
        zip_ref.extractall(data_dir)

# Move loose CSVs next to the extracted ones. os.replace overwrites an
# existing destination on every platform, whereas os.rename raises on
# Windows when the target already exists (e.g. after a forced re-download).
for f in csv_files:
    os.replace(os.path.join(path, f), os.path.join(data_dir, f))

print("CSV files available:", os.listdir(data_dir))

In [None]:
# Read the four tables used by this notebook from the working directory.
# The generator is consumed left to right, so the files are read in the
# same order as the names on the left-hand side.
orders, products, order_products_prior, order_products_train = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        'orders.csv',
        'products.csv',
        'order_products__prior.csv',
        'order_products__train.csv',
    )
)
print("Data loaded successfully!")

In [None]:
# Replace missing `days_since_prior_order` values with 0 so the sum/mean
# aggregations below never see NaN.
# NOTE(review): presumably NaN marks a user's first order - confirm against
# the dataset description; filling with 0 pulls the per-user mean down.
orders.fillna({'days_since_prior_order': 0}, inplace=True)

def create_user_product_features(orders_df, order_products_df):
    """Summarise every (user, product) pair over the user's prior orders.

    Args:
        orders_df: Orders table; only rows whose ``eval_set`` is ``'prior'``
            are used.
        order_products_df: Order line items, joined to the orders on
            ``order_id``.

    Returns:
        DataFrame with one row per ``(user_id, product_id)`` and the columns
        ``up_orders`` (number of line items), ``up_first_order`` /
        ``up_last_order`` (min / max ``order_number``) and
        ``up_avg_cart_position`` (mean ``add_to_cart_order``).
    """
    prior_orders = orders_df.loc[orders_df['eval_set'] == 'prior']
    prior_lines = order_products_df.merge(prior_orders, on='order_id')

    aggregations = {
        'up_orders': ('order_id', 'count'),
        'up_first_order': ('order_number', 'min'),
        'up_last_order': ('order_number', 'max'),
        'up_avg_cart_position': ('add_to_cart_order', 'mean'),
    }
    per_pair = prior_lines.groupby(['user_id', 'product_id']).agg(**aggregations)
    return per_pair.reset_index()

def create_user_features(orders_df, order_products_df):
    """Build one row of order-level and basket-level statistics per user.

    Only orders whose ``eval_set`` is ``'prior'`` contribute.

    Args:
        orders_df: Orders table.
        order_products_df: Order line items, joined to the prior orders on
            ``order_id``.

    Returns:
        DataFrame keyed by ``user_id`` with ``user_orders`` (max
        ``order_number``), ``user_period`` and ``user_avg_days_since_prior``
        (sum / mean of ``days_since_prior_order``), ``user_total_products``
        (line-item count) and ``user_reorder_ratio`` (mean of ``reordered``).
        Users missing from either intermediate table are dropped by the
        final inner merge.
    """
    prior_orders = orders_df.loc[orders_df['eval_set'] == 'prior']

    # Statistics computed from the orders themselves.
    order_level = prior_orders.groupby('user_id').agg(**{
        'user_orders': ('order_number', 'max'),
        'user_period': ('days_since_prior_order', 'sum'),
        'user_avg_days_since_prior': ('days_since_prior_order', 'mean'),
    }).reset_index()

    # Statistics computed from the line items of those orders.
    prior_lines = order_products_df.merge(prior_orders, on='order_id')
    basket_level = prior_lines.groupby('user_id').agg(**{
        'user_total_products': ('product_id', 'count'),
        'user_reorder_ratio': ('reordered', 'mean'),
    }).reset_index()

    return order_level.merge(basket_level, on='user_id')

def create_product_features(order_products_df, orders_df=None):
    """Aggregate per-product popularity and reorder statistics.

    The previous implementation derived ``prod_first_orders`` and
    ``prod_second_orders`` by comparing ``order_id`` with 1 and 2, which
    only counts line items of two arbitrary orders and left
    ``prod_reorder_ratio`` at 0 for almost every product.

    Args:
        order_products_df: Order line items with the columns ``order_id``,
            ``product_id``, ``add_to_cart_order`` and ``reordered``.
        orders_df: Optional orders table with ``order_id``, ``user_id`` and
            ``order_number``. It is needed to tell a user's second purchase
            of a product from later ones; without it ``prod_second_orders``
            is 0 for every product.

    Returns:
        DataFrame with one row per ``product_id`` and the columns
        ``prod_orders``, ``prod_reorders``, ``prod_first_orders``,
        ``prod_second_orders``, ``prod_avg_cart_position``,
        ``prod_reorder_probability`` (reorders / orders) and
        ``prod_reorder_ratio`` (reorders / first purchases). Ratios with a
        zero denominator and any remaining NaN are reported as 0.
    """
    product_key = order_products_df['product_id'].to_numpy()

    # Assumes the dataset convention that `reordered` is 0 exactly when the
    # user buys the product for the first time - TODO confirm against the
    # data dictionary.
    first_flags = (order_products_df['reordered'] == 0).to_numpy()

    second_flags = np.zeros(len(order_products_df), dtype=bool)
    if orders_df is not None:
        per_order = orders_df.drop_duplicates('order_id').set_index('order_id')
        order_ids = order_products_df['order_id']
        # Built from plain arrays so the index is 0..n-1, i.e. row positions.
        history = pd.DataFrame({
            'user_id': order_ids.map(per_order['user_id']).to_numpy(),
            'product_id': product_key,
            'order_number': order_ids.map(per_order['order_number']).to_numpy(),
        }).dropna(subset=['user_id'])  # line items whose order is unknown
        history = history.sort_values('order_number', kind='mergesort')
        # 1 for a user's first purchase of a product, 2 for the second, ...
        purchase_rank = history.groupby(['user_id', 'product_id']).cumcount() + 1
        second_positions = purchase_rank.index[(purchase_rank == 2).to_numpy()]
        second_flags[second_positions.to_numpy()] = True

    product_features = order_products_df.groupby('product_id').agg(
        prod_orders=('order_id', 'count'),
        prod_reorders=('reordered', 'sum'),
        prod_avg_cart_position=('add_to_cart_order', 'mean'),
    )
    purchase_counts = pd.DataFrame({
        'product_id': product_key,
        'prod_first_orders': first_flags.astype('int64'),
        'prod_second_orders': second_flags.astype('int64'),
    }).groupby('product_id').sum()

    # Keep the column order callers (and saved feature lists) already rely on.
    product_features = product_features.join(purchase_counts)[[
        'prod_orders',
        'prod_reorders',
        'prod_first_orders',
        'prod_second_orders',
        'prod_avg_cart_position',
    ]].reset_index()

    product_features['prod_reorder_probability'] = np.where(
        product_features.prod_orders > 0,
        product_features.prod_reorders / product_features.prod_orders,
        0
    )
    product_features['prod_reorder_ratio'] = np.where(
        product_features.prod_first_orders > 0,
        product_features.prod_reorders / product_features.prod_first_orders,
        0
    )
    return product_features.fillna(0)

# Build the three feature tables from the prior-order history. The calls
# are independent of one another, so their order does not matter.
product_features = create_product_features(order_products_prior)
user_features = create_user_features(orders, order_products_prior)
user_product_features = create_user_product_features(
    orders, order_products_prior
)

In [None]:
# Assemble the supervised table: one row per (user, product) pair from the
# prior history, labelled 1 when the pair appears with `reordered` set in
# the user's train order and 0 otherwise.
train_orders = orders.loc[orders['eval_set'] == 'train']
train = train_orders.merge(user_features, on='user_id')
label_columns = ['user_id', 'product_id', 'reordered']
train_products = order_products_train.merge(train_orders, on='order_id')[label_columns]

# Keep the problem small: only the first 5000 distinct train users.
sample_users = train['user_id'].unique()[:5000]
train_sample = train.loc[train['user_id'].isin(sample_users)]
user_products = user_product_features.loc[
    user_product_features['user_id'].isin(sample_users)
]

# Left join so pairs absent from the train order survive; their missing
# label becomes 0 on the next line.
train_data = user_products.merge(
    train_products, on=['user_id', 'product_id'], how='left'
)
train_data['reordered'] = train_data['reordered'].fillna(0)
train_data = (
    train_data
    .merge(user_features, on='user_id')
    .merge(product_features, on='product_id')
)

y = train_data['reordered']
X = train_data.drop(columns=['user_id', 'product_id', 'reordered'])
feature_names = list(X.columns)
X = X.fillna(0)

In [None]:
# Hold out 20% of the rows, stratified on the label, then standardise the
# features using statistics from the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y,
)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with balanced class weights and a fixed seed.
forest_params = {
    'n_estimators': 30,
    'random_state': 42,
    'n_jobs': -1,
    'max_depth': 10,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Rank the features by the forest's importance scores and plot the ten
# highest as a horizontal bar chart.
importance_table = pd.DataFrame({
    'feature': feature_names,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
plt.title('Top 10 Feature Importance')
plt.tight_layout()
plt.show()

In [None]:
def predict_reorders(user_id, model, scaler, user_features, product_features, user_product_features, feature_names, top_n=10):
    """Rank the products a user has bought before by predicted reorder probability.

    Args:
        user_id: User to score.
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the reorder probability.
        scaler: Fitted transformer exposing ``transform``.
        user_features: Per-user feature table with a ``user_id`` column.
        product_features: Per-product feature table with a ``product_id``
            column.
        user_product_features: Per (user, product) feature table with
            ``user_id`` and ``product_id`` columns.
        feature_names: Feature columns, in the order the scaler and model
            were fitted on.
        top_n: Maximum number of products to return.

    Returns:
        DataFrame with ``product_id`` and ``reorder_probability``, highest
        probability first, limited to ``top_n`` rows. An empty list is
        returned when nothing can be scored: the user has no purchase
        history, has no row in ``user_features``, or none of their products
        appear in ``product_features``.
    """
    user_data = user_features[user_features.user_id == user_id]
    user_products = user_product_features[user_product_features.user_id == user_id]
    # Without a user-level row, `.values[0]` below would raise IndexError.
    if user_products.empty or user_data.empty:
        return []

    # The inner merge drops products that have no product-level features;
    # scikit-learn transformers reject a zero-row frame, so stop here.
    user_products = user_products.merge(product_features, on='product_id')
    if user_products.empty:
        return []

    # Broadcast the user's single feature row onto each candidate product.
    user_constants = {
        col: user_data[col].values[0]
        for col in user_data.columns
        if col != 'user_id'
    }
    user_products = user_products.assign(**user_constants)

    X_user = user_products[feature_names].fillna(0)
    X_user_scaled = scaler.transform(X_user)
    probabilities = model.predict_proba(X_user_scaled)[:, 1]
    user_products['reorder_probability'] = probabilities
    top_products = user_products.sort_values('reorder_probability', ascending=False).head(top_n)
    return top_products[['product_id', 'reorder_probability']]

In [None]:
# Example: score user 1 with the model and scaler fitted above.
user_id_example = 1
top_reorders = predict_reorders(
    user_id_example,
    model=model,
    scaler=scaler,
    user_features=user_features,
    product_features=product_features,
    user_product_features=user_product_features,
    feature_names=feature_names,
)
print(top_reorders)

In [None]:
import joblib
# Persist everything needed to score users later: the fitted classifier,
# the fitted scaler, and the feature column order they were trained with.
# Files are written relative to the current working directory.
joblib.dump(model, 'reorder_prediction_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(feature_names, 'feature_names.pkl')

In [None]:
def load_and_predict(user_id, user_features, product_features, user_product_features,
                     model_path='reorder_prediction_model.pkl',
                     scaler_path='scaler.pkl',
                     feature_names_path='feature_names.pkl',
                     top_n=10):
    """Load the saved artifacts from disk and score one user.

    Args:
        user_id: User to score.
        user_features: Per-user feature table.
        product_features: Per-product feature table.
        user_product_features: Per (user, product) feature table.
        model_path: File holding the fitted classifier.
        scaler_path: File holding the fitted scaler.
        feature_names_path: File holding the feature column order.
        top_n: Maximum number of products to return.

    Returns:
        Whatever ``predict_reorders`` returns for this user.
    """
    # NOTE: joblib artifacts are pickles, and unpickling can execute
    # arbitrary code - only load files produced by a trusted source.
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    feature_names = joblib.load(feature_names_path)
    return predict_reorders(user_id, model, scaler, user_features, product_features,
                            user_product_features, feature_names, top_n=top_n)

In [None]:
# Example: score user 2 using the artifacts saved to disk.
user_id_example = 2
top_reorders = load_and_predict(
    user_id_example,
    user_features=user_features,
    product_features=product_features,
    user_product_features=user_product_features,
)
print(top_reorders)