In [None]:
#Module 1 
import pandas as pd
import numpy as np

print("‚öôÔ∏è Initiating Master Dataset Preparation...")

# 1. LOAD THE RAW DATASETS
# All three CSVs are expected in the current working directory.
try:
    materials = pd.read_csv('packaging_materials.csv')
    products = pd.read_csv('product_dataset.csv')
    shipping = pd.read_csv('shipping_dataset.csv')
except FileNotFoundError as e:
    print(f"‚ùå Error: Could not find dataset files. Ensure they are in the same folder.\n{e}")
    # Raise SystemExit directly instead of calling exit(): the exit() helper
    # is only injected by the `site` module, and calling it with no argument
    # reports success (status 0) even though loading failed.
    raise SystemExit(1) from e

# Normalise headers so later column lookups are case- and whitespace-insensitive.
for df in (materials, products, shipping):
    df.columns = df.columns.str.lower().str.strip()

# 2. CROSS-JOIN (Create every possible scenario)
# One row for every product x material x shipping-method combination.
# how='cross' builds the Cartesian product directly, so no temporary 'key'
# column is written into (or overwritten in) the source frames.
unified_df = products.merge(materials, how='cross').merge(shipping, how='cross')

print(f"‚úÖ Generated {len(unified_df)} unique scenarios.")

# ==========================================
# 3. REAL-WORLD LOGISTICS CALCULATIONS
# ==========================================
print("üßÆ Calculating realistic Cost & CO‚ÇÇ using dimensions and rules...")

# --- A. Dimensional Packaging Math ---
# Box surface area 2(LW + WH + HL) in cm^2, divided by 10,000 to get m^2,
# then scaled by 1.25 (25% extra for folds, overlaps and protective padding).
unified_df['surface_area_m2'] = (
    unified_df['length_cm'].mul(unified_df['width_cm'])
    .add(unified_df['width_cm'].mul(unified_df['height_cm']))
    .add(unified_df['height_cm'].mul(unified_df['length_cm']))
    .mul(2)
    .div(10000)
    .mul(1.25)
)

# Layers of material needed to carry the product weight: weight / capacity,
# rounded up, and never fewer than one layer.
unified_df['layers_needed'] = (
    unified_df['avg_weight']
    .div(unified_df['weight_capacity'])
    .pipe(np.ceil)
    .clip(lower=1)
)
unified_df['total_material_units'] = unified_df['surface_area_m2'].mul(unified_df['layers_needed'])


# --- B. Shipping Mode Physics ---
def calculate_shipping_rates(row):
    """Return the per-kg-per-km cost and CO2 rates for a row's shipping mode.

    The mode is read from ``row['shipping_type']``, converted to a lowercase
    string and matched by substring against keyword tiers in priority order;
    the first tier with a matching keyword wins.

    Returns:
        A two-element ``pd.Series``: ``[cost rate, CO2 rate]``.
    """
    shipping_mode = str(row['shipping_type']).lower()

    # (keywords, [cost per kg per km, CO2 per kg per km]) -- order matters.
    rate_tiers = (
        (("air", "express", "same day"), [0.12, 0.008]),       # Expensive & high CO2
        (("cold", "refrigerated", "frozen"), [0.08, 0.006]),   # High energy required
        (("city", "local"), [0.05, 0.003]),                    # Stop-and-go traffic
        (("bulk", "international"), [0.02, 0.0015]),           # Economies of scale
    )
    for keywords, rates in rate_tiers:
        if any(keyword in shipping_mode for keyword in keywords):
            return pd.Series(rates)

    # No keyword matched: standard national/regional transport.
    return pd.Series([0.04, 0.002])

# Look up the rates row by row; the per-row result is spread across the
# two rate columns.
unified_df[['ship_cost_rate', 'ship_co2_rate']] = unified_df.apply(
    func=calculate_shipping_rates,
    axis='columns',
)


# --- C. Industry Specific Rules ---
def apply_industry_rules(row):
    """Return the packaging-cost multiplier for a row's industry.

    The industry is read from ``row['industry_type']`` and compared as a
    lowercase string (exact match, no trimming). Unlisted industries get a
    neutral multiplier of 1.0.
    """
    multipliers = {
        "electronics": 1.4,  # Requires extra anti-static bubble wrap/foam
        "food": 1.3,         # Requires thermal/hygienic liners
        "pharma": 1.3,
        "cosmetics": 1.3,
    }
    return multipliers.get(str(row['industry_type']).lower(), 1.0)

# One multiplier per scenario row, chosen from the row's industry.
unified_df['industry_multiplier'] = unified_df.apply(
    func=apply_industry_rules,
    axis='columns',
)


# ==========================================
# 4. FINAL GROUND TRUTH GENERATION
# ==========================================

# Final Cost = packaging material (cost x units x industry multiplier)
#            + transport (weight x distance x cost rate)
#            + risk penalty (handling risk x fragility x 2.5), rounded to 2 decimals.
unified_df['target_cost'] = (
    unified_df['cost_per_unit']
    .mul(unified_df['total_material_units'])
    .mul(unified_df['industry_multiplier'])
    .add(
        unified_df['avg_weight']
        .mul(unified_df['distance_km'])
        .mul(unified_df['ship_cost_rate'])
    )
    .add(
        unified_df['handling_risk']
        .mul(unified_df['fragility_level'])
        .mul(2.5)  # Penalty for breaking fragile items
    )
    .round(2)
)

# Final CO2 = packaging manufacturing (emission score x units)
#           + transport emission (weight x distance x CO2 rate), rounded to 2 decimals.
unified_df['target_co2'] = (
    unified_df['co2_emission_score']
    .mul(unified_df['total_material_units'])
    .add(
        unified_df['avg_weight']
        .mul(unified_df['distance_km'])
        .mul(unified_df['ship_co2_rate'])
    )
    .round(2)
)

# Extreme penalty for highly unrealistic scenarios: the product weighs more
# than three times the material's capacity (e.g. a 10kg item in a 1kg bag).
unified_df.loc[
    unified_df['avg_weight'].gt(unified_df['weight_capacity'].mul(3)),
    'target_cost',
] += 5000

# 5. SAVE THE PERFECT DATASET
output_file = 'unified_scenarios_dataset.csv'
unified_df.to_csv(output_file, index=False)

print(f"\nüéâ SUCCESS! The master dataset has been saved to: {output_file}")

‚öôÔ∏è Initiating Master Dataset Preparation...
‚úÖ Generated 22500 unique scenarios.
üßÆ Calculating realistic Cost & CO‚ÇÇ using dimensions and rules...

üéâ SUCCESS! The master dataset has been saved to: unified_scenarios_dataset.csv


In [None]:
#Module 2
import pandas as pd
import numpy as np

def perform_eda(file_name='unified_scenarios_dataset.csv', output_name='eda_summary_statistics.csv'):
    """Print an exploratory summary of a CSV dataset and save its statistics.

    Reports the shape, a numeric/categorical column breakdown, missing values
    and duplicate rows, then writes ``DataFrame.describe()`` (rounded to two
    decimals) to ``output_name``.

    Args:
        file_name: Path of the CSV file to analyse.
        output_name: Path of the CSV file that receives the summary statistics.

    Returns:
        None. If ``file_name`` does not exist, an error message is printed and
        nothing is written.
    """
    print(f"üîç Initiating Exploratory Data Analysis (EDA) for '{file_name}'...\n")

    try:
        df = pd.read_csv(file_name)
    except FileNotFoundError:
        print(f"‚ùå Error: Could not find '{file_name}'. Make sure it's in the same folder!")
        return

    # ---- 1. Dataset shape & structure ----
    row_count, column_count = df.shape
    print("=== 1. DATASET SHAPE ===")
    print(f"‚û§ Total Records (Rows): {row_count:,}")
    print(f"‚û§ Total Features (Columns): {column_count}")

    # ---- 2. Data types & feature classification ----
    print("\n=== 2. FEATURE CLASSIFICATION ===")
    num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
    cat_cols = list(df.select_dtypes(include=['object', 'category']).columns)

    # Only the first five names of each group are previewed.
    print(f"‚û§ Numerical Features ({len(num_cols)}): {', '.join(num_cols[:5])} ...")
    print(f"‚û§ Categorical Features ({len(cat_cols)}): {', '.join(cat_cols[:5])} ...")

    # ---- 3. Missing values check ----
    print("\n=== 3. DATA CLEANLINESS (MISSING VALUES) ===")
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]

    if not columns_with_nulls.empty:
        print("‚ö†Ô∏è Missing values detected in the following columns:")
        for col, count in columns_with_nulls.items():
            print(f"   - {col}: {count} missing values")
    else:
        print("‚úÖ Outstanding! The dataset is perfectly clean with ZERO missing values.")

    # ---- 4. Duplicate check ----
    print("\n=== 4. DUPLICATE RECORDS ===")
    duplicates = df.duplicated().sum()
    if duplicates != 0:
        print(f"‚ö†Ô∏è Found {duplicates} duplicate rows.")
    else:
        print("‚úÖ No duplicate rows found.")

    # ---- 5. Generate & save summary statistics ----
    print("\n=== 5. STATISTICAL SUMMARY ===")
    summary_stats = df.describe().round(2)

    # Terminal preview is limited to the first four columns.
    print(summary_stats.iloc[:, :4])

    # The full table goes to disk.
    summary_stats.to_csv(output_name)
    print(f"\nüìÅ SUCCESS: Full EDA statistical report saved to '{output_name}'!")
    print("üéØ You can submit this CSV directly to your mentor.")


if __name__ == "__main__":
    perform_eda()

üîç Initiating Exploratory Data Analysis (EDA) for 'unified_scenarios_dataset.csv'...

=== 1. DATASET SHAPE ===
‚û§ Total Records (Rows): 22,500
‚û§ Total Features (Columns): 30

=== 2. FEATURE CLASSIFICATION ===
‚û§ Numerical Features (26): avg_weight, fragility_level, shelf_life_days, moisture_sensitivity, chemical_sensitivity ...
‚û§ Categorical Features (4): product_name, industry_type, material_type, shipping_type ...

=== 3. DATA CLEANLINESS (MISSING VALUES) ===
‚úÖ Outstanding! The dataset is perfectly clean with ZERO missing values.

=== 4. DUPLICATE RECORDS ===
‚úÖ No duplicate rows found.

=== 5. STATISTICAL SUMMARY ===
       avg_weight  fragility_level  shelf_life_days  moisture_sensitivity
count    22500.00         22500.00         22500.00              22500.00
mean         0.91             5.27          1603.23                  4.90
std          1.02             2.22          1498.86                  2.86
min          0.10             2.00             7.00              

In [None]:
#Module 3
import pandas as pd
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

print("‚öôÔ∏è Initiating Module 3: ML Dataset Preparation...")
os.makedirs('artifacts', exist_ok=True)

# ==========================================
# 1. LOAD THE TARGET-READY DATASET
# ==========================================
# The dataset is expected to already contain 'target_cost' and 'target_co2'.
df = pd.read_csv('unified_scenarios_dataset.csv')

# ==========================================
# 2. SELECT ML FEATURES FOR PREDICTION
# ==========================================
# The 13 physical and categorical features the models are allowed to learn from.
features = [
    'strength', 'cost_per_unit', 'co2_emission_score', 'weight_capacity',
    'avg_weight', 'fragility_level', 'distance_km', 'handling_risk',
    'length_cm', 'width_cm', 'height_cm', 'shipping_type', 'industry_type'
]

X = df[features]
y_cost = df['target_cost']
y_co2 = df['target_co2']

# ==========================================
# 3. SPLIT DATA INTO TRAINING (80%) AND TESTING (20%)
# ==========================================
print(f"üîÄ Splitting {len(df)} records into Training and Testing sets...")

# Split the features and BOTH targets in one call. train_test_split applies
# the same row selection to every array it is given, so the cost and CO2
# targets are aligned with X by construction rather than by relying on two
# separate calls happening to share a seed.
X_train, X_test, y_cost_train, y_cost_test, y_co2_train, y_co2_test = train_test_split(
    X, y_cost, y_co2, test_size=0.2, random_state=42
)

print(f"   ‚û§ Training Set: {len(X_train)} rows")
print(f"   ‚û§ Testing Set:  {len(X_test)} rows")

# ==========================================
# 4. PREPARE DATA PIPELINES & SCALING
# ==========================================
print("‚öñÔ∏è Building Scaling and Encoding Pipelines...")

# Categorical columns are named explicitly; every other selected feature is
# numerical. Deriving the numeric list from `features` keeps the two lists
# from drifting apart when a feature is added or removed.
cat_cols = ['shipping_type', 'industry_type']
num_cols = [col for col in features if col not in cat_cols]

# Build the ColumnTransformer Pipeline
# - StandardScaler: standardises the numerical columns (mean=0, std=1)
# - OneHotEncoder: converts categories (e.g., 'Air Cargo') into binary 1s and 0s;
#   categories unseen during fit are ignored at transform time
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

# FIT on the training data only, then TRANSFORM the test data (prevents data leakage)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# ==========================================
# 5. SAVE ARTIFACTS FOR MODULE 4
# ==========================================
# Persist the fitted preprocessor so new inputs can be scaled/encoded the same way.
joblib.dump(preprocessor, 'artifacts/preprocessor.pkl')

print("‚úÖ Module 3 Complete! Data is split, scaled, encoded, and ready for Model Training.")

‚öôÔ∏è Initiating Module 3: ML Dataset Preparation...
üîÄ Splitting 22500 records into Training and Testing sets...
   ‚û§ Training Set: 18000 rows
   ‚û§ Testing Set:  4500 rows
‚öñÔ∏è Building Scaling and Encoding Pipelines...
‚úÖ Module 3 Complete! Data is split, scaled, encoded, and ready for Model Training.


In [None]:
#Module 4
import pandas as pd
import numpy as np
import joblib
import os
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("‚öôÔ∏è Initiating Module 4: ML Model Training & Evaluation...")
os.makedirs('artifacts', exist_ok=True)

# ==========================================
# 1. LOAD & PREPARE DATA (From Module 3)
# ==========================================
print("üì¶ Loading Unified Scenario Dataset...")
df = pd.read_csv('unified_scenarios_dataset.csv')

features = [
    'strength', 'cost_per_unit', 'co2_emission_score', 'weight_capacity',
    'avg_weight', 'fragility_level', 'distance_km', 'handling_risk',
    'length_cm', 'width_cm', 'height_cm', 'shipping_type', 'industry_type'
]

X = df[features]
y_cost = df['target_cost']
y_co2 = df['target_co2']

# Split the features and both targets in a single call: train_test_split
# applies the same row selection to every array it receives, so the cost
# (yc_*) and CO2 (ye_*) targets stay aligned with X by construction instead
# of depending on two calls sharing a seed.
X_train, X_test, yc_train, yc_test, ye_train, ye_test = train_test_split(
    X, y_cost, y_co2, test_size=0.2, random_state=42
)

# Pipeline Definition
# Categorical columns are listed explicitly; the remaining selected features
# are numerical, derived from `features` so the lists cannot drift apart.
cat_cols = ['shipping_type', 'industry_type']
num_cols = [col for col in features if col not in cat_cols]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

# Fit on the training rows only; the test rows are transformed with the
# statistics learned from training.
X_train_p = preprocessor.fit_transform(X_train)
X_test_p = preprocessor.transform(X_test)

# ==========================================
# 2. TRAIN MODELS WITH TQDM PROGRESS BARS
# ==========================================
N_ESTIMATORS = 100

print("\nüß† Training Random Forest (Cost Prediction Model)...")
# warm_start lets each fit() call add trees on top of the existing ones, so
# the forest is grown one tree per iteration and the bar advances per tree.
cost_model = RandomForestRegressor(n_estimators=1, warm_start=True, random_state=42, n_jobs=-1)

with tqdm(total=N_ESTIMATORS, desc="Random Forest Epochs", unit="tree") as progress:
    for tree_count in range(1, N_ESTIMATORS + 1):
        cost_model.set_params(n_estimators=tree_count)
        cost_model.fit(X_train_p, yc_train)
        progress.update()

print("\nüå± Training XGBoost (CO‚ÇÇ Prediction Model)...")
# XGBoost is fitted in a single call, so its bar has exactly one step.
co2_model = XGBRegressor(
    n_estimators=N_ESTIMATORS,
    learning_rate=0.1,
    max_depth=8,
    random_state=42,
)

with tqdm(total=1, desc="XGBoost Training", unit="model") as progress:
    co2_model.fit(X_train_p, ye_train)
    progress.update()

# ==========================================
# 3. EVALUATION METRICS (RMSE, MAE, R¬≤)
# ==========================================
def evaluate(y_true, y_pred, name):
    """Print and return MAE, RMSE and R2 for one model's predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        name: Label used in the printed report header.

    Returns:
        dict with keys ``"MAE"``, ``"RMSE"`` and ``"R2"``, each rounded to
        four decimals.
    """
    scores = {
        "MAE": mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2": r2_score(y_true, y_pred),
    }

    print(f"\nüìä {name} Performance:")
    print(f"   ‚û§ MAE:  ¬±{scores['MAE']:.2f}")
    print(f"   ‚û§ RMSE: {scores['RMSE']:.2f}")
    print(f"   ‚û§ R¬≤:   {scores['R2']:.4f}")
    return {metric: round(value, 4) for metric, value in scores.items()}

# Score each model on the held-out test set (cost model first, then CO2).
rf_cost_metrics = evaluate(yc_test, cost_model.predict(X_test_p), "Random Forest (Cost)")
xgb_co2_metrics = evaluate(ye_test, co2_model.predict(X_test_p), "XGBoost (CO‚ÇÇ)")

metrics = {
    "Cost_Prediction_RF": rf_cost_metrics,
    "CO2_Prediction_XGB": xgb_co2_metrics,
}

# ==========================================
# 4. SAVE ARTIFACTS
# ==========================================
# The fitted preprocessor is saved together with both models.
for artifact_path, artifact in (
    ('artifacts/preprocessor.pkl', preprocessor),
    ('artifacts/cost_model.pkl', cost_model),
    ('artifacts/co2_model.pkl', co2_model),
):
    joblib.dump(artifact, artifact_path)

with open('artifacts/evaluation_metrics.json', 'w') as metrics_file:
    json.dump(metrics, metrics_file, indent=4)

print("\n‚úÖ Models and Metrics saved successfully to the 'artifacts' folder.")

# ==========================================
# 5. ML-BASED MATERIAL RANKING SYSTEM (Upgraded)
# ==========================================
print("\nüèÜ TESTING THE ML-BASED RANKING SYSTEM...")
def rank_materials_for_scenario(scenario_index=0):
    """Rank five candidate packaging materials for one test-set scenario.

    Takes the product/shipping configuration at ``scenario_index`` in the
    module-level ``X_test``, substitutes the properties of five hard-coded
    materials, predicts cost and CO2 with the module-level ``preprocessor``,
    ``cost_model`` and ``co2_model``, and combines the min-max normalised
    predictions with a waste penalty and the material's own emission score.

    Args:
        scenario_index: Positional index of the row in ``X_test`` to use.

    Returns:
        The ranked ``pd.DataFrame`` (best material first, index named
        ``'Rank'`` starting at 1). The table is also printed. Earlier
        versions only printed it and returned ``None``.
    """
    # 1. Grab a sample product & shipping configuration
    sample_df = X_test.iloc[[scenario_index]].copy()
    prod_weight = sample_df['avg_weight'].values[0]
    print(f"Scenario: Shipping a {prod_weight}kg item via {sample_df['shipping_type'].values[0]}")

    # 2. Simulate 5 different materials: same scenario row repeated, with
    #    only the material-specific columns overwritten.
    simulation_df = pd.concat([sample_df]*5, ignore_index=True)
    simulation_materials = ['Corrugated Box', 'Bamboo Crate', 'Recycled Mailer', 'Molded Pulp', 'Plastic Polybag']
    simulation_df['strength'] = [6, 9, 4, 5, 2]
    simulation_df['cost_per_unit'] = [8.0, 25.0, 3.5, 6.0, 1.0]
    simulation_df['co2_emission_score'] = [3, 1, 2, 2, 9]
    simulation_df['weight_capacity'] = [12, 30, 5, 8, 3]

    # 3. Predict Cost and CO2
    sim_processed = preprocessor.transform(simulation_df)
    pred_costs = cost_model.predict(sim_processed)
    pred_co2s = co2_model.predict(sim_processed)

    # 4. ADVANCED RANKING LOGIC
    def normalize(arr):
        # Min-max scale to [0, 1]; the epsilon avoids 0/0 when all values are equal.
        return (arr - np.min(arr)) / (np.max(arr) - np.min(arr) + 1e-9)

    cost_scores = normalize(pred_costs)
    co2_scores = normalize(pred_co2s)

    # Over-packaging / waste penalty: the larger the material's capacity
    # relative to the product weight, the larger the penalty.
    waste_ratio = simulation_df['weight_capacity'] / prod_weight
    waste_penalty = normalize(waste_ratio)

    # Material eco-friendliness (inherent CO2 score of the material itself)
    eco_scores = normalize(simulation_df['co2_emission_score'])

    # Final Smart Score (lower is better):
    # 30% route cost + 30% route CO2 + 20% waste penalty + 20% inherent material eco-score
    final_scores = (cost_scores * 0.30) + (co2_scores * 0.30) + (waste_penalty * 0.20) + (eco_scores * 0.20)

    # 5. Output the Ranked List
    results = pd.DataFrame({
        'Material': simulation_materials,
        'Pred Cost (‚Çπ)': np.round(pred_costs, 2),
        'Pred CO‚ÇÇ (kg)': np.round(pred_co2s, 2),
        'Waste Penalty': np.round(waste_penalty, 2), # Showing the penalty for clarity
        'AI Score': np.round(final_scores, 4)
    }).sort_values('AI Score', ascending=True).reset_index(drop=True)

    results.index += 1
    results.index.name = 'Rank'
    print("\n--- AI Recommended Material Rankings ---")
    print(results)

    # Hand the ranking back so callers can use it programmatically.
    return results

# Bind the result: as the last statement of a notebook cell, a bare call
# would echo the returned frame a second time after the printed table.
ranking = rank_materials_for_scenario(0)

‚öôÔ∏è Initiating Module 4: ML Model Training & Evaluation...
üì¶ Loading Unified Scenario Dataset...

üß† Training Random Forest (Cost Prediction Model)...


Random Forest Epochs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:10<00:00,  9.92tree/s]



üå± Training XGBoost (CO‚ÇÇ Prediction Model)...


XGBoost Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  2.57model/s]



üìä Random Forest (Cost) Performance:
   ‚û§ MAE:  ¬±0.09
   ‚û§ RMSE: 0.25
   ‚û§ R¬≤:   1.0000

üìä XGBoost (CO‚ÇÇ) Performance:
   ‚û§ MAE:  ¬±0.02
   ‚û§ RMSE: 0.03
   ‚û§ R¬≤:   1.0000

‚úÖ Models and Metrics saved successfully to the 'artifacts' folder.

üèÜ TESTING THE ML-BASED RANKING SYSTEM...
Scenario: Shipping a 0.4kg item via Regional Transport

--- AI Recommended Material Rankings ---
             Material  Pred Cost (‚Çπ)  Pred CO‚ÇÇ (kg)  Waste Penalty  AI Score
Rank                                                                        
1     Recycled Mailer          95.88           0.49           0.07    0.0398
2         Molded Pulp          95.89           0.49           0.19    0.0645
3      Corrugated Box          96.00           0.54           0.33    0.1911
4        Bamboo Crate          97.13           0.49           1.00    0.5000
5     Plastic Polybag          95.91           0.80           0.00    0.5088
