# Problem 2: Retail Customer Segmentation and Sales Prediction
This notebook is designed to execute each task step-by-step for faster and efficient processing.

## 1. Data Preprocessing and Exploratory Analysis
### Task: 
- Check for data quality issues
- Analyze correlations between variables
- Create meaningful features from the raw data (feature engineering)

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = './Datasets/Online Retail.xlsx'  # Update with the file path
data = pd.read_excel(file_path)

# Data cleaning
data.dropna  # Drop missing values
data = data[data['Quantity'] > 0]  # Remove negative quantities

# Feature Engineering
data['TotalPrice'] = data['Quantity'] * data['UnitPrice']
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data['Year'] = data['InvoiceDate'].dt.year
data['Month'] = data['InvoiceDate'].dt.month
customer_features = data.groupby('CustomerID').agg({
    'TotalPrice': ['sum', 'mean', 'count'],
    'Quantity': ['sum', 'mean'],
    'InvoiceNo': 'nunique'
}).reset_index()
customer_features.columns = ['CustomerID', 'TotalSpent', 'AvgSpent', 'PurchaseCount', 
                             'TotalQuantity', 'AvgQuantity', 'UniqueInvoices']
customer_features.head()

Unnamed: 0,CustomerID,TotalSpent,AvgSpent,PurchaseCount,TotalQuantity,AvgQuantity,UniqueInvoices
0,12346.0,77183.6,77183.6,1,74215,74215.0,1
1,12347.0,4310.0,23.681319,182,2458,13.505495,7
2,12348.0,1797.24,57.975484,31,2341,75.516129,4
3,12349.0,1757.55,24.076027,73,631,8.643836,1
4,12350.0,334.4,19.670588,17,197,11.588235,1


## 2. Customer Segmentation
### Task:
- Apply both K-means and Hierarchical Clustering
- Compare different distance measures (Euclidean, Manhattan, etc.)
- Determine the optimal number of clusters using silhouette score

In [2]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Prepare data for clustering
scaler = StandardScaler()
scaled_data = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# K-means clustering
kmeans = KMeans(n_clusters=5, random_state=42, n_init='auto')
kmeans_labels = kmeans.fit_predict(scaled_data)
silhouette_kmeans = silhouette_score(scaled_data, kmeans_labels)

# Hierarchical clustering
hierarchical = AgglomerativeClustering(n_clusters=5)
hierarchical_labels = hierarchical.fit_predict(scaled_data)
silhouette_hierarchical = silhouette_score(scaled_data, hierarchical_labels)

print(f"K-means Silhouette Score: {silhouette_kmeans}")
print(f"Hierarchical Silhouette Score: {silhouette_hierarchical}")

K-means Silhouette Score: 0.9467414220172158
Hierarchical Silhouette Score: 0.6495786986326426


## 3,4 Regression Models for Sales Prediction
### Task:
- For each customer segment, build regression models to predict future purchase amounts
- Regression Models: Decision Trees, Random Forest, Gradient Boosting, XGBoost

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import json
customer_features['Segment'] = kmeans_labels  # Use the best clustering result
models = {'DecisionTree': DecisionTreeRegressor,
          'RandomForest': RandomForestRegressor,
          'GradientBoosting': GradientBoostingRegressor,
          'XGBoost': xgb.XGBRegressor}

results = {}
for segment in customer_features['Segment'].unique():
    segment_data = customer_features[customer_features['Segment'] == segment]
    X = segment_data.drop(['CustomerID', 'Segment', 'TotalSpent'], axis=1)
    y = segment_data['TotalSpent']
    
    # Check if the segment has enough samples for splitting
    if len(X) < 2:
        print(f"Segment {segment} has less than 2 samples. Skipping train-test split.")
        continue  # Skip this segment
    
    # Perform train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    segment_results = {}
    for model_name, model_class in models.items():
        model = model_class()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred)**0.5
        r2 = r2_score(y_test, y_pred)
        segment_results[model_name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}
    results[segment] = segment_results

results

Segment 4 has less than 2 samples. Skipping train-test split.
Segment 1 has less than 2 samples. Skipping train-test split.




{0: {'DecisionTree': {'MAE': 296.21081228273465,
   'RMSE': 821.0681150209646,
   'R2': 0.8985859747671091},
  'RandomForest': {'MAE': 223.6607169640787,
   'RMSE': 802.2659066475717,
   'R2': 0.9031774938981155},
  'GradientBoosting': {'MAE': 235.41151942102516,
   'RMSE': 575.2008919582315,
   'R2': 0.9502287271179363},
  'XGBoost': {'MAE': 209.17373822550098,
   'RMSE': 1234.78765302481,
   'R2': 0.7706365552447628}},
 3: {'DecisionTree': {'MAE': 28599.2525,
   'RMSE': 36252.48159033082,
   'R2': -0.28921531531335454},
  'RandomForest': {'MAE': 36523.08274999997,
   'RMSE': 43639.7842748385,
   'R2': -0.868164714177758},
  'GradientBoosting': {'MAE': 43743.113226262045,
   'RMSE': 50678.25682140713,
   'R2': -1.5193778736090344},
  'XGBoost': {'MAE': 44050.397226562505,
   'RMSE': 56218.838972651654,
   'R2': -2.1003713817424785}},
 2: {'DecisionTree': {'MAE': 7271.8399999999965,
   'RMSE': 7271.8399999999965,
   'R2': nan},
  'RandomForest': {'MAE': 55603.580299999994,
   'RMSE': 5

## 5. Apply PCA for Dimensionality Reduction
### Task:
- Reduce dimensionality and improve model performance

In [4]:
from sklearn.decomposition import PCA

# Apply PCA
pca = PCA(n_components=0.95)
reduced_data = pca.fit_transform(scaled_data)
print(f"Explained Variance Ratio: {np.sum(pca.explained_variance_ratio_)}")

Explained Variance Ratio: 0.9775011628108644


## 6. Temporal Validation Strategy (Optional)
### Task:
- Design a model selection and validation strategy that accounts for temporal aspects of customer behavior

In [5]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

tscv = TimeSeriesSplit(n_splits=3)  # Default number of splits
temporal_results = {}

for segment in customer_features['Segment'].unique():
    segment_data = customer_features[customer_features['Segment'] == segment]
    X = segment_data.drop(['CustomerID', 'Segment', 'TotalSpent'], axis=1)
    y = segment_data['TotalSpent']
    
    # Check if the segment has enough samples for TimeSeriesSplit
    if len(X) <= tscv.n_splits:
        print(f"Segment {segment} has fewer samples ({len(X)}) than the required number of folds ({tscv.n_splits}). Skipping temporal validation.")
        continue  # Skip this segment
    
    segment_results = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model = xgb.XGBRegressor()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred)**0.5
        r2 = r2_score(y_test, y_pred)
        segment_results.append({'MAE': mae, 'RMSE': rmse, 'R2': r2})
    
    temporal_results[segment] = segment_results

temporal_results

Segment 4 has fewer samples (1) than the required number of folds (3). Skipping temporal validation.
Segment 2 has fewer samples (3) than the required number of folds (3). Skipping temporal validation.
Segment 1 has fewer samples (1) than the required number of folds (3). Skipping temporal validation.


{0: [{'MAE': 258.70257268008936,
   'RMSE': 1540.7882052314956,
   'R2': 0.7387679837008345},
  {'MAE': 353.43290990177417,
   'RMSE': 2365.5903895910524,
   'R2': 0.33100960949773983},
  {'MAE': 204.36657988337728,
   'RMSE': 978.95689688164,
   'R2': 0.8504514442992057}],
 3: [{'MAE': 71382.43894531249,
   'RMSE': 116516.33802385171,
   'R2': -0.24032438300237136},
  {'MAE': 20086.533515625,
   'RMSE': 21581.41209811239,
   'R2': -0.6331528810584097},
  {'MAE': 80677.770703125,
   'RMSE': 104309.92692489285,
   'R2': -0.6836524394515175}]}

# 7. Implement regularization techniques to prevent overfitting

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Preprocessing numeric and categorical columns
numeric_columns = ['Quantity', 'UnitPrice', 'Year', 'Month']
categorical_columns = ['CustomerID', 'Country']

# Define features (X) and target (y)
X = data.drop(['TotalPrice'], axis=1)  # Target variable
y = data['TotalPrice']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),  # Scale numeric features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)  # Encode categorical features
    ]
)

# Define models
models = {
    "Ridge (L2)": Ridge(),
    "Lasso (L1)": Lasso()
}

# Hyperparameter grids for RandomizedSearchCV
param_grids = {
    "Ridge (L2)": {'model__alpha': [0.01, 0.1, 1, 10, 100]},
    "Lasso (L1)": {'model__alpha': [0.01, 0.1, 1, 10, 100]}
}

# Train and evaluate models
results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Model pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Hyperparameter tuning with RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grids[model_name],
        n_iter=10,  # Number of random combinations
        cv=3,  # 3-fold cross-validation
        scoring='neg_mean_squared_error',
        random_state=42,
        n_jobs=-1  # Parallel processing
    )
    random_search.fit(X_train, y_train)
    
    # Best model and evaluation
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[model_name] = {
        "Best Params": random_search.best_params_,
        "MAE": mae,
        "RMSE": rmse,
        "R²": r2
    }

# Display results
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    print(f"  Best Params: {metrics['Best Params']}")
    print(f"  MAE: {metrics['MAE']:.4f}")
    print(f"  RMSE: {metrics['RMSE']:.4f}")
    print(f"  R²: {metrics['R²']:.4f}")

Training Ridge (L2)...




Training Lasso (L1)...





Ridge (L2):
  Best Params: {'model__alpha': 100}
  MAE: 10.6999
  RMSE: 52.3101
  R²: 0.3539

Lasso (L1):
  Best Params: {'model__alpha': 1}
  MAE: 10.5962
  RMSE: 54.0803
  R²: 0.3094
