## **Dependencies**

In [None]:
#Required Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings(action='ignore')

## **Data Loading**

In [None]:
# Read the raw CSV tables from the shared data directory:
# the two lookup tables first, then the product catalogue, then the order tables.
aisle_data = pd.read_csv("../data/aisles.csv")
departments = pd.read_csv("../data/departments.csv")
products = pd.read_csv("../data/products.csv")
orders = pd.read_csv("../data/orders.csv")
order_products = pd.read_csv("../data/order_products.csv")

## **Data Merging**

In [None]:
# Enrich the product catalogue with the aisle and department lookup tables.
products = (
    products
    .merge(aisle_data, how='left', on='aisle_id')
    .merge(departments, how='left', on='department_id')
)

# Remove descriptive columns from the line items (when present) so the join
# below does not create suffixed duplicates, then attach the enriched products.
order_products = (
    order_products
    .drop(columns=['product_name', 'aisle_id', 'department_id'], errors='ignore')
    .merge(products, how='left', on='product_id')
)

# Keep only the order-level columns that are needed downstream.
req_orders = orders[[
    'order_id',
    'user_id',
    'eval_set',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]]

# Attach the order details to every order_products row.
merged_df = order_products.merge(req_orders, how='left', on='order_id')

In [None]:
# Preview of the data
merged_df

## **Checking for Missing Values**

In [None]:
# Checking for null values
merged_df.isnull().sum()

In [None]:
# Missing-value treatment for days_since_prior_order: replace NaN with the sentinel -1
merged_df['days_since_prior_order'] = merged_df['days_since_prior_order'].fillna(-1)

In [None]:
# Shift every value up by one, so rows that were filled with the -1 sentinel become 0.
# NOTE(review): re-running this cell shifts the column again; run it exactly once.
merged_df['days_since_prior_order'] = merged_df['days_since_prior_order'] + 1

In [None]:
merged_df.isnull().sum()

## **Stratified Sampling**

In [None]:
# Sampling the data using stratified sampling
def stratified_sample(df, stratify_col, frac, random_state=42):
    """Draw the same fraction of rows from every group of ``stratify_col``.

    Each group is sampled independently with the same seed, so the result is
    reproducible. Groups are visited in sorted key order and the sampled rows
    are concatenated in that order.

    The groups are sampled explicitly rather than through ``groupby.apply``:
    ``apply`` operating on the grouping column is deprecated in pandas, and
    under the newer semantics the stratification column would be missing from
    the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    stratify_col : str or list of str
        Column(s) whose distinct values define the strata.
    frac : float
        Fraction of rows to draw from each stratum.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for every stratum.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all original columns (including
        ``stratify_col``) and a fresh 0..n-1 index.
    """
    sampled_groups = [
        group.sample(frac=frac, random_state=random_state)
        for _, group in df.groupby(stratify_col)
    ]

    # pd.concat refuses an empty list; an input without groups yields an empty frame.
    if not sampled_groups:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)

# Keep 30% of the rows from each value of the target column 'reordered'
sampled_data = stratified_sample(merged_df, stratify_col='reordered', frac=0.3)

In [None]:
sampled_data

In [None]:
sampled_data.isnull().sum()

## **Data Preprocessing**

In [None]:
sampled_data.columns

#### **Applying Target Encoding of aisle, department, product_name**

In [None]:
# Mean-target encoding: each category is replaced by the mean of 'reordered'
# over all rows of that category.
# NOTE(review): the means are computed on the same rows they are mapped back to,
# and before any train/test split, so each row's own label feeds its encoding
# (target leakage). Consider fitting the encoding on the training split only.
for source_col in ('aisle', 'department', 'product_name'):
    category_means = sampled_data.groupby(source_col)['reordered'].mean()
    sampled_data[f'{source_col}_target_enc'] = sampled_data[source_col].map(category_means)

sampled_data

#### **Applying One-hot Encoding on order_dow (day of week)**

In [None]:
# One-hot encode the day of week into integer indicator columns prefixed with 'dow'
day_of_week_dummies = pd.get_dummies(sampled_data['order_dow'], prefix='dow').astype(int)

# Drop indicator columns left behind by an earlier run of this cell before
# attaching the fresh ones; otherwise re-running the cell would append a second
# set of identically named 'dow_*' columns and break later column selection.
sampled_data = pd.concat(
    [
        sampled_data.drop(columns=day_of_week_dummies.columns, errors='ignore'),
        day_of_week_dummies,
    ],
    axis=1,
)

sampled_data

#### **Applying Cyclic Encoding (Sine-Cosine Transformation) on order_hour_of_day**

In [None]:
# Map the hour onto a 24-hour circle and store its sine and cosine components.
hour_angle = 2 * np.pi * sampled_data['order_hour_of_day'] / 24
sampled_data['order_hour_sin'] = np.sin(hour_angle)
sampled_data['order_hour_cos'] = np.cos(hour_angle)

sampled_data

#### **Applying Ordinal Encoding on add_to_cart_order**

In [None]:
# Fit an ordinal encoder on the cart position and store the encoded values
# in a new column.
ordinal_encoder = OrdinalEncoder()
cart_positions = sampled_data[['add_to_cart_order']]
ordinal_encoder.fit(cart_positions)

sampled_data['add_to_cart_order_encoded'] = ordinal_encoder.transform(cart_positions)

sampled_data

#### **Applying Binning on days_since_prior_order**

In [None]:
# Undo the +1 shift applied during missing-value treatment; rows whose value
# was originally missing come back as -1.
days_unshifted = sampled_data['days_since_prior_order'] - 1

# Bins are right-closed, so -1 lies outside the first interval (-1, 7] and is
# left unbinned; every unbinned row is then labelled 'Unknown'.
sampled_data['days_since_prior_order_binned'] = (
    pd.cut(
        days_unshifted,
        bins=[-1, 7, 15, 23, 31],
        labels=['0-7', '8-15', '16-23', '24-31'],
        right=True,
    )
    .cat.add_categories('Unknown')
    .fillna('Unknown')
)

sampled_data

## **Feature Engineering**

#### **Feature 1**: 
**average_days_between_purchases**: Average time between purchases of each product by each user. This gives insight into the typical frequency with which a user repurchases a specific product.

In [None]:
# Mean of days_since_prior_order for every (user, product) pair.
# NOTE(review): the column was shifted by +1 earlier in the notebook and missing
# values were mapped to 0, so this average is on the shifted scale - confirm
# that this is intended.
user_product_purchase_intervals = (
    sampled_data
    .groupby(['user_id', 'product_id'], as_index=False)
    .agg(average_days_between_purchases=('days_since_prior_order', 'mean'))
)

In [None]:
# Attach the per-(user, product) average to every row. A column left by an
# earlier run of this cell is dropped first, so re-running replaces the feature
# instead of producing suffixed '_x'/'_y' duplicates.
sampled_data = (
    sampled_data
    .drop(columns=['average_days_between_purchases'], errors='ignore')
    .merge(user_product_purchase_intervals, on=['user_id', 'product_id'], how='left')
)

In [None]:
sampled_data

#### **Feature 2**
**Product_purchase_frequency**: Count the total number of times each product has been purchased by a particular user.

In [None]:
# Number of rows for every (user, product) pair
product_purchase_frequency = (
    sampled_data
    .groupby(['user_id', 'product_id'])
    .size()
    .rename('product_purchase_frequency')
    .reset_index()
)

In [None]:
# Attach the per-(user, product) purchase count to every row. A column left by
# an earlier run of this cell is dropped first, so re-running replaces the
# feature instead of producing suffixed '_x'/'_y' duplicates.
sampled_data = (
    sampled_data
    .drop(columns=['product_purchase_frequency'], errors='ignore')
    .merge(product_purchase_frequency, on=['user_id', 'product_id'], how='left')
)

In [None]:
sampled_data

#### **Feature 3 & 4**

- **total_purchases**: Count of purchases for each user-product pair
- **interval_std_dev**: Standard deviation of the intervals between purchases for each user-product pair

In [None]:
# Per-(user, product) purchase count and spread of days_since_prior_order.
# NOTE(review): total_purchases counts non-null order_id values per pair, which
# equals the row count used for product_purchase_frequency whenever order_id has
# no nulls - the two features are then identical.
purchase_stats = (
    sampled_data
    .groupby(['user_id', 'product_id'])
    .agg(
        total_purchases  = ('order_id', 'count'),
        interval_std_dev = ('days_since_prior_order', 'std'),
    )
    .reset_index()
)

# The sample standard deviation is NaN for a pair with a single row. Those NaNs
# would reach the logistic regression further down, which rejects NaN input, so
# a single observation is treated as having no spread.
purchase_stats['interval_std_dev'] = purchase_stats['interval_std_dev'].fillna(0.0)

# Drop columns left by an earlier run of this cell so that re-running replaces
# the features instead of producing suffixed '_x'/'_y' duplicates.
sampled_data = (
    sampled_data
    .drop(columns=['total_purchases', 'interval_std_dev'], errors='ignore')
    .merge(purchase_stats, on=['user_id', 'product_id'], how='left')
)

In [None]:
sampled_data

#### **Feature 5**
**product_reorder_rate**: Reorder rate of each product, computed by dividing the number of times the product has been reordered by the total number of times it has been ordered.

In [None]:
# Per-product totals and the share of purchases that were reorders.
# NOTE(review): this rate is derived from the target column 'reordered' on the
# full sample, before any train/test split - check for target leakage.
product_reorders = (
    sampled_data
    .groupby('product_id', as_index=False)
    .agg(
        total_orders   = ('order_id', 'count'),
        total_reorders = ('reordered', 'sum'),
    )
    .assign(product_reorder_rate=lambda totals: totals['total_reorders'] / totals['total_orders'])
)

In [None]:
# Attach the per-product reorder rate to every row. A column left by an earlier
# run of this cell is dropped first, so re-running replaces the feature instead
# of producing suffixed '_x'/'_y' duplicates.
sampled_data = (
    sampled_data
    .drop(columns=['product_reorder_rate'], errors='ignore')
    .merge(product_reorders[['product_id', 'product_reorder_rate']], on='product_id', how='left')
)

In [None]:
sampled_data

#### **Feature 6**
**users_general_reorder_rate**: The ratio of reordered items to total items for each user, which captures the user's general tendency to reorder products.

In [None]:
# Per-user item totals and the share of items that were reorders.
# NOTE(review): this rate is derived from the target column 'reordered' on the
# full sample, before any train/test split - check for target leakage.
user_reorders = (
    sampled_data
    .groupby('user_id', as_index=False)
    .agg(
        total_items     = ('order_id', 'count'),
        reordered_items = ('reordered', 'sum'),
    )
    .assign(users_general_reorder_rate=lambda totals: totals['reordered_items'] / totals['total_items'])
)

In [None]:
# Attach the per-user reorder rate to every row. A column left by an earlier run
# of this cell is dropped first, so re-running replaces the feature instead of
# producing suffixed '_x'/'_y' duplicates.
sampled_data = (
    sampled_data
    .drop(columns=['users_general_reorder_rate'], errors='ignore')
    .merge(user_reorders[['user_id', 'users_general_reorder_rate']], on='user_id', how='left')
)

In [None]:
sampled_data

#### **Feature 7**
**avg_add_to_cart_order**: Average position of each product in the cart when it is purchased.

In [None]:
# Mean cart position of every product
avg_cart_order = (
    sampled_data
    .groupby('product_id', as_index=False)
    .agg(avg_add_to_cart_order=('add_to_cart_order', 'mean'))
)

In [None]:
# Attach the per-product average cart position to every row. A column left by an
# earlier run of this cell is dropped first, so re-running replaces the feature
# instead of producing suffixed '_x'/'_y' duplicates.
sampled_data = (
    sampled_data
    .drop(columns=['avg_add_to_cart_order'], errors='ignore')
    .merge(avg_cart_order, on='product_id', how='left')
)

In [None]:
sampled_data

In [None]:
sampled_data.columns

## **Dropping Unnecessary Columns**

In [None]:
# Raw columns that have been replaced by encoded or engineered features above
columns_to_drop = [
    'add_to_cart_order',
    'eval_set',
    'days_since_prior_order',
    'product_name',
    'aisle',
    'department',
    'aisle_id',
    'department_id',
    'order_hour_of_day',
    'order_dow',
]
sampled_data.drop(columns=columns_to_drop, inplace=True)

In [None]:
sampled_data

In [None]:
sampled_data.info()

In [None]:
sampled_data.columns

## **Correlation Matrix - Heatmap**

In [None]:
# Columns produced by the encoding steps
encoded_features = [
    'aisle_target_enc', 'department_target_enc', 'product_name_target_enc',
    *[f'dow_{day}' for day in range(7)],
    'order_hour_sin', 'order_hour_cos', 'add_to_cart_order_encoded',
]
# Columns produced by the feature-engineering steps
new_features = [
    'average_days_between_purchases', 'product_purchase_frequency',
    'total_purchases', 'interval_std_dev', 'product_reorder_rate',
    'users_general_reorder_rate', 'avg_add_to_cart_order',
]

features_to_include = encoded_features + new_features

# Pairwise correlations between all selected features
correlation_matrix = sampled_data[features_to_include].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar=True, ax=ax)
ax.set_title('Correlation Matrix for Encoded and New Features')
plt.show()

## **Feature Importance**

In [None]:
target = 'reordered'
features = [
    'aisle_target_enc', 'department_target_enc', 'product_name_target_enc',
    'dow_0', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6',
    'order_hour_sin', 'order_hour_cos', 'add_to_cart_order_encoded',
    'average_days_between_purchases', 'product_purchase_frequency',
    'total_purchases', 'interval_std_dev', 'product_reorder_rate',
    'users_general_reorder_rate', 'avg_add_to_cart_order'
]

# Preprocess data
# interval_std_dev is a per-(user, product) standard deviation, which is NaN for
# pairs that occur only once. LogisticRegression rejects NaN input, so a single
# observation is treated as having no spread before the data is split.
X = sampled_data[features].fillna({'interval_std_dev': 0.0})
y = sampled_data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only and reuse it for the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Plot feature importance: absolute value of each standardized coefficient
feature_importance = np.abs(model.coef_[0])
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(features, feature_importance, color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance from Logistic Regression')
ax.invert_yaxis()  # first feature in the list at the top
plt.show()