# 🧪 DejaBrew Forecasting - Upload Your Own CSV

**Use this notebook if you already have `coffee_shop_sales.csv` downloaded locally**

This simplified notebook allows you to:
1. ✅ Upload your local CSV file
2. ✅ Train Gradient Boosting models
3. ✅ Evaluate accuracy metrics
4. ✅ Download trained models for your DejaBrew system

**No Kaggle API setup needed!**

## 📦 Step 1: Install Dependencies

In [None]:
!pip install scikit-learn pandas numpy joblib matplotlib seaborn -q
print("‚úì Dependencies installed!")

## 📤 Step 2: Upload Your CSV File

Click the **Choose Files** button below and select your `coffee_shop_sales.csv` file.

In [None]:
from google.colab import files
import os

print("üì§ Please upload your coffee_shop_sales.csv file:")
print("   (Click 'Choose Files' below)\n")

uploaded = files.upload()

# Check what was uploaded
uploaded_files = list(uploaded.keys())
print(f"\n‚úÖ Uploaded: {uploaded_files}")

# Find the CSV file
csv_file = None
for filename in uploaded_files:
    if filename.endswith('.csv'):
        csv_file = filename
        break

if csv_file:
    print(f"‚úì Found CSV file: {csv_file}")
    print(f"  File size: {len(uploaded[csv_file]):,} bytes")
    
    # Rename to standard name if needed
    if csv_file != 'coffee_shop_sales.csv':
        os.rename(csv_file, 'coffee_shop_sales.csv')
        csv_file = 'coffee_shop_sales.csv'
        print(f"  Renamed to: coffee_shop_sales.csv")
    
    print("\n‚úÖ Ready to proceed!")
else:
    print("\n‚ùå Error: No CSV file found in upload.")
    print("   Please upload a .csv file and try again.")

## 🔍 Step 3: Load and Validate Data

In [None]:
import pandas as pd
import numpy as np

print("Loading coffee_shop_sales.csv...\n")

try:
    # Read the uploaded file into the raw frame used by the preprocessing step
    df_raw = pd.read_csv('coffee_shop_sales.csv')

    print("‚úì CSV loaded successfully!")
    print(f"  Shape: {df_raw.shape}")
    print("\nColumns found:")
    position = 1
    for column_name in df_raw.columns:
        print(f"  {position}. {column_name}")
        position += 1

    # Column names expected by the default column_mapping
    required_cols = ['transaction_date', 'product_detail', 'transaction_qty']
    missing_cols = []
    for expected in required_cols:
        if expected not in df_raw.columns:
            missing_cols.append(expected)

    if not missing_cols:
        print("\n‚úÖ All required columns found!")
    else:
        # Only a warning: the user can still fix things up via column_mapping
        print(f"\n‚ö†Ô∏è Warning: Missing expected columns: {missing_cols}")
        guidance = (
            "\nPlease verify your CSV has these columns:",
            "  - transaction_date (or similar date column)",
            "  - product_detail (or similar product name column)",
            "  - transaction_qty (or similar quantity column)",
            "\nYou may need to adjust the column_mapping in the next step.",
        )
        for hint in guidance:
            print(hint)

    print("\nFirst 3 rows:")
    display(df_raw.head(3))

except FileNotFoundError:
    print("‚ùå Error: coffee_shop_sales.csv not found.")
    print("   Please run Step 2 to upload your CSV file.")
except Exception as e:
    # Any other failure (parse errors, etc.) is reported rather than raised
    print(f"‚ùå Error loading CSV: {e}")

## 🔧 Step 4: Preprocess Data

**Important**: If your columns have different names, update the `column_mapping` dictionary below.

In [None]:
# Column mapping - UPDATE THIS if your CSV has different column names
column_mapping = {
    'transaction_date': 'date',      # Date column
    'product_detail': 'product',     # Product name column
    'transaction_qty': 'quantity'    # Quantity column
}

# Alternative examples (uncomment if needed):
# column_mapping = {'Date': 'date', 'Product': 'product', 'Qty': 'quantity'}
# column_mapping = {'date': 'date', 'item': 'product', 'qty': 'quantity'}

print("Preprocessing data...\n")

try:
    # Apply the mapping and keep only the three canonical columns. A column
    # that is still missing raises KeyError, which is reported below.
    df = df_raw.rename(columns=column_mapping)[['date', 'product', 'quantity']].copy()

    # Parse the dates and drop any time-of-day component, so the aggregation
    # below is truly per calendar day even when the CSV stores timestamps.
    df['date'] = pd.to_datetime(df['date']).dt.normalize()

    # Remove invalid quantities. The explicit copy makes the filtered frame
    # independent, so assigning the cleaned product column does not trigger
    # pandas' SettingWithCopyWarning.
    df = df[df['quantity'] > 0].copy()

    # Clean product names
    df['product'] = df['product'].str.strip()

    # Without any valid rows the summary below cannot be computed (min/max of
    # an empty date column is NaT); fail with a clear message instead.
    if df.empty:
        raise ValueError("no rows with a positive quantity remain after filtering")

    # Aggregate daily sales per product
    df_daily = df.groupby(['date', 'product'])['quantity'].sum().reset_index()

    first_day = df['date'].min()
    last_day = df['date'].max()

    print("‚úÖ Data preprocessed successfully!")
    print(f"\nDataset Summary:")
    print(f"  Total transactions: {len(df):,}")
    print(f"  Date range: {first_day.date()} to {last_day.date()}")
    # +1 so both the first and the last day are counted (a single-day file is 1 day)
    print(f"  Total days: {(last_day - first_day).days + 1}")
    print(f"  Unique products: {df['product'].nunique()}")

    print(f"\nTop 10 products by total sales:")
    top_products = df.groupby('product')['quantity'].sum().sort_values(ascending=False).head(10)
    for i, (product, qty) in enumerate(top_products.items(), 1):
        print(f"  {i}. {product}: {int(qty):,} units")

except KeyError as e:
    print(f"‚ùå Error: Column not found: {e}")
    print("\nPlease update the column_mapping dictionary above with the correct column names.")
    print(f"Available columns: {df_raw.columns.tolist()}")
except Exception as e:
    print(f"‚ùå Error preprocessing data: {e}")

## 🤖 Step 5: Train Models

This will train Gradient Boosting models for the top 30 products.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import os
import json

# Make sure the folder that receives the model files exists
os.makedirs('trained_models', exist_ok=True)

banner = "=" * 80
print(banner)
print("  TRAINING GRADIENT BOOSTING MODELS")
print(banner)

# Reshape the long daily table into one row per date and one column per
# product; date/product combinations without a value are filled with 0
daily = df_daily.pivot_table(
    values='quantity',
    index='date',
    columns='product',
    aggfunc='sum',
).fillna(0)

# Create date features
def create_date_features(df_index):
    """Build calendar features for every entry of a datetime index.

    Args:
        df_index: index whose entries expose the datetime accessors
            ``dayofweek``, ``month``, ``dayofyear`` and ``year``.

    Returns:
        DataFrame indexed by ``df_index`` with the columns ``day_of_week``,
        ``month``, ``day_of_year`` and ``year`` (in that order).
    """
    features = pd.DataFrame(index=df_index)
    # (output column, accessor on the index) pairs, in output column order
    column_sources = (
        ('day_of_week', 'dayofweek'),
        ('month', 'month'),
        ('day_of_year', 'dayofyear'),
        ('year', 'year'),
    )
    for column, accessor in column_sources:
        features[column] = getattr(features.index, accessor)
    return features

# Calendar features are the same for every product model
X = create_date_features(daily.index)

# Rank products by total units sold and keep the TOP_N best sellers
TOP_N = 30
all_sales = daily.sum().sort_values(ascending=False)
top_products = all_sales.index[:TOP_N].tolist()

n_products = len(top_products)
print(f"\nTraining models for top {n_products} products...\n")

trained_list = []
metrics_summary = []

# Hyper-parameters shared by all product models
model_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
)

# The first 80% of the rows are used for training, the remainder for testing.
# The feature split is identical for every product, so it is done once.
split = int(len(X) * 0.8)
X_train = X.iloc[:split]
X_test = X.iloc[split:]

for i, product in enumerate(top_products, 1):
    if product not in daily.columns:
        continue

    progress = f"[{i}/{n_products}]"
    y = daily[product]

    # Fewer than 30 rows of history: skip
    if len(X) < 30:
        print(f"{progress} ‚äò {product} - Not enough data")
        continue

    y_train = y.iloc[:split]
    y_test = y.iloc[split:]

    try:
        model = GradientBoostingRegressor(**model_params)
        model.fit(X_train, y_train)

        # Score on the held-out rows
        y_pred_test = model.predict(X_test)
        test_mae = mean_absolute_error(y_test, y_pred_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        test_r2 = r2_score(y_test, y_pred_test)
        # Percentage error with +1 in the denominator so zero-sales days do not divide by zero
        relative_error = (y_test - y_pred_test) / (y_test + 1)
        test_mape = np.mean(np.abs(relative_error)) * 100
        test_accuracy = max(0, 100 - test_mape)

        # File name: lower-cased product name with spaces and slashes as underscores
        safe_name = product.lower().translate(str.maketrans({' ': '_', '/': '_'}))
        model_file = f"model_{safe_name}.joblib"
        joblib.dump(model, os.path.join('trained_models', model_file))

        trained_list.append(product)
        metrics_summary.append({
            'product': product,
            'accuracy': round(test_accuracy, 2),
            'r2': round(test_r2, 4),
            'mae': round(test_mae, 2),
            'rmse': round(test_rmse, 2)
        })

        print(f"{progress} ‚úì {product}")
        print(f"           Accuracy: {test_accuracy:.2f}% | R¬≤: {test_r2:.4f} | MAE: {test_mae:.2f}")

    except Exception as e:
        # A failure for one product is reported and the loop moves on
        print(f"{progress} ‚úó {product} - Error: {e}")

closing_rule = "=" * 80
print("\n" + closing_rule)
print(f"‚úì Training complete! Successfully trained {len(trained_list)} models.")
print(closing_rule)

## 📊 Step 6: View Accuracy Summary

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _plot_metric_hist(axis, values, color, xlabel, title, mean_label):
    """Draw a 15-bin histogram of `values` on `axis` with a dashed red line at the mean."""
    axis.hist(values, bins=15, color=color, edgecolor='black')
    axis.axvline(values.mean(), color='red', linestyle='--', linewidth=2, label=mean_label)
    axis.set_xlabel(xlabel)
    axis.set_ylabel('Number of Models')
    axis.set_title(title, fontweight='bold')
    axis.legend()
    axis.grid(alpha=0.3)


metrics_df = pd.DataFrame(metrics_summary)

# With no trained models the frame has no columns and every lookup below would
# fail with an opaque KeyError; stop with an actionable message instead.
if metrics_df.empty:
    raise RuntimeError(
        "No models were trained in Step 5, so there are no metrics to summarise. "
        "Check the Step 5 output for errors and re-run it."
    )

# Averages are computed once and reused for printing, plotting and the JSON file
avg_accuracy = metrics_df['accuracy'].mean()
avg_r2 = metrics_df['r2'].mean()
avg_mae = metrics_df['mae'].mean()
avg_rmse = metrics_df['rmse'].mean()

print("="*80)
print("MODEL PERFORMANCE SUMMARY")
print("="*80)
print(f"\nTotal models trained: {len(metrics_df)}")
print(f"Average Accuracy: {avg_accuracy:.2f}%")
print(f"Average R¬≤ Score: {avg_r2:.4f}")
print(f"Average MAE: {avg_mae:.2f}")
print(f"Average RMSE: {avg_rmse:.2f}")
print("\n" + "="*80)

display(metrics_df.sort_values('accuracy', ascending=False))

# Visualize
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

_plot_metric_hist(
    ax[0], metrics_df['accuracy'], 'skyblue',
    'Accuracy (%)', 'Model Accuracy Distribution',
    f"Mean: {avg_accuracy:.2f}%",
)
_plot_metric_hist(
    ax[1], metrics_df['r2'], 'lightgreen',
    'R¬≤ Score', 'R¬≤ Score Distribution',
    f"Mean: {avg_r2:.4f}",
)

plt.tight_layout()
plt.show()

# Save metrics. 'avg_rmse' is included so the file carries every average that
# is printed above; the existing keys are unchanged.
with open('trained_models/model_metrics.json', 'w') as f:
    json.dump({
        'summary': {
            'total_models': len(metrics_df),
            'avg_accuracy': float(avg_accuracy),
            'avg_r2': float(avg_r2),
            'avg_mae': float(avg_mae),
            'avg_rmse': float(avg_rmse)
        },
        'models': metrics_df.to_dict('records')
    }, f, indent=2)

print("\n‚úì Metrics saved to model_metrics.json")

## 💾 Step 7: Save Trained Products List

In [None]:
# Write the names of all products that have a trained model to a JSON file
with open('trained_models/trained_articles.json', 'w') as f:
    f.write(json.dumps(trained_list, indent=2))

print(f"‚úì Saved {len(trained_list)} product names to trained_articles.json")
print("\nTrained products:")
# One numbered line per product, numbering starts at 1
numbered_lines = (
    f"  {number}. {name}" for number, name in enumerate(trained_list, start=1)
)
for line in numbered_lines:
    print(line)

## 📦 Step 8: Download Models

This will create a ZIP file with all trained models and download it.

In [None]:
import shutil

# Create ZIP
shutil.make_archive('dejabrew_trained_models', 'zip', 'trained_models')

print("‚úì Created dejabrew_trained_models.zip")
print("\nZIP contents:")
!unzip -l dejabrew_trained_models.zip | head -20

# Download
print("\nüì• Downloading...")
files.download('dejabrew_trained_models.zip')

print("\n" + "="*80)
print("üéâ SUCCESS! Training complete!")
print("="*80)
print("\nNext steps:")
print("1. Extract dejabrew_trained_models.zip")
print("2. Copy all .joblib files to: dejabrew/forecasting/forecasting_data/")
print("3. Copy trained_articles.json to: dejabrew/forecasting/forecasting_data/")
print("4. Copy model_metrics.json to: dejabrew/forecasting/forecasting_data/ (optional)")
print("5. Restart your Django server")
print("6. Test with: python test_forecasting.py")
print("\nModel Performance:")
print(f"  Average Accuracy: {metrics_df['accuracy'].mean():.2f}%")
print(f"  Average R¬≤ Score: {metrics_df['r2'].mean():.4f}")
print(f"  Total Models: {len(trained_list)}")
print("\n" + "="*80)