In [1]:
!pip install nixtlats



In [6]:
# pandas must be imported before its first use so this cell also works on a
# freshly restarted kernel.
import pandas as pd

# Raw billing data, one row per billing record.
df = pd.read_csv("data-8013-trends.csv")

In [7]:
# Convert the billing date column to datetime64 so it can be sorted and
# resampled chronologically.
df['BILLING_DATE'] = pd.to_datetime(df['BILLING_DATE'])

# index=False keeps the row index out of the file. Without it every run of
# this cell prepends another unnamed index column to the CSV it overwrites.
df.to_csv("data-8013-trends.csv", index=False)

In [8]:
import pandas as pd
from io import StringIO

# `df` is expected to have been loaded from CSV by an earlier cell.

# Order the rows chronologically by billing date.
df_sorted = df.sort_values('BILLING_DATE')


In [9]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from nixtlats import TimeGPT
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Initialize TimeGPT client.
# The API token is read from the environment instead of being written into the
# notebook, so it cannot leak through version control or shared copies.
# NOTE(review): a token that was previously committed in this cell should be
# treated as compromised and revoked.
import os

_timegpt_token = os.environ.get("TIMEGPT_TOKEN")
if not _timegpt_token:
    raise RuntimeError(
        "Set the TIMEGPT_TOKEN environment variable to your Nixtla API token "
        "before running this notebook."
    )
timegpt = TimeGPT(token=_timegpt_token)

def complete_timegpt_pipeline(df_sorted, top_n_products=10, test_days=14,
                              test_periods=7, future_horizon=14):
    """
    Complete end-to-end TimeGPT forecasting pipeline.

    Runs, in order: data preparation, daily-frequency repair, train/test
    split, back-test forecasting, evaluation, visualization and a final
    forecast beyond the end of the data.

    Parameters
    ----------
    df_sorted : pandas.DataFrame
        Raw billing rows, passed straight to ``clean_and_prepare_data``.
    top_n_products : int, default 10
        Number of best-selling products to keep.
    test_days : int, default 14
        Rows per product held out by ``create_train_test_split``.
    test_periods : int, default 7
        Horizon requested from ``run_timegpt_forecasting`` for the back-test
        (that function caps it at 14). Evaluation inner-joins forecasts with
        the hold-out on date, so when ``test_periods < test_days`` only the
        overlapping part of the hold-out is scored.
    future_horizon : int, default 14
        Horizon for the forecast beyond the end of the data.

    Returns
    -------
    dict
        Keys ``cleaned_data`` (the frequency-fixed frame), ``train_data``,
        ``test_data``, ``forecasts``, ``evaluation`` and ``future_forecasts``.
    """
    print("🚀 STARTING COMPLETE TIMEGPT PIPELINE")
    print("=" * 60)

    # Step 1: Data Cleaning and Preparation
    print("\n📋 STEP 1: DATA CLEANING AND PREPARATION")
    cleaned_data = clean_and_prepare_data(df_sorted, top_n_products)

    # Step 2: Fix Frequency Issues
    print("\n🔧 STEP 2: FIXING FREQUENCY ISSUES")
    frequency_fixed_data = fix_frequency_issues_complete(cleaned_data)

    # Step 3: Train-Test Split
    print("\n✂️ STEP 3: TRAIN-TEST SPLIT")
    train_data, test_data = create_train_test_split(frequency_fixed_data, test_days=test_days)

    # Step 4: TimeGPT Forecasting
    print("\n🤖 STEP 4: TIMEGPT FORECASTING")
    forecasts = run_timegpt_forecasting(train_data, test_periods=test_periods)

    # Step 5: Evaluation
    print("\n📊 STEP 5: MODEL EVALUATION")
    evaluation_results = evaluate_forecasts(forecasts, test_data)

    # Step 6: Visualization
    print("\n📈 STEP 6: CREATING VISUALIZATIONS")
    create_comprehensive_visualizations(frequency_fixed_data, forecasts, test_data, evaluation_results)

    # Step 7: Future Forecasting (uses the full series, including the hold-out)
    print("\n🔮 STEP 7: FUTURE FORECASTING")
    future_forecasts = generate_future_forecasts(frequency_fixed_data, horizon=future_horizon)

    return {
        'cleaned_data': frequency_fixed_data,
        'train_data': train_data,
        'test_data': test_data,
        'forecasts': forecasts,
        'evaluation': evaluation_results,
        'future_forecasts': future_forecasts
    }

def clean_and_prepare_data(df_sorted, top_n_products=10):
    """
    Clean and prepare data for TimeGPT.

    The input frame is not modified; all cleaning happens on a copy.

    Parameters
    ----------
    df_sorted : pandas.DataFrame
        Must contain ``BILLING_DATE``, ``BRAND``, ``TOTAL_NET_SALES`` and
        ``TOTAL_BILLING_QTY``. If ``PRODUCT_KEY`` is absent it is built from
        ``BRAND``, ``MH_SEGMENT``, ``MH_FAMILY``, ``MH_CLASS`` and ``MH_BRICK``.
    top_n_products : int, default 10
        Number of products, ranked by summed ``TOTAL_NET_SALES``, to keep.

    Returns
    -------
    pandas.DataFrame
        One row per (date, product) with columns ``ds``, ``unique_id``, ``y``
        (summed net sales) and ``TOTAL_BILLING_QTY`` (summed quantity). Rows
        whose ``y`` is not numeric are dropped.
    """
    print("   • Handling null values...")

    # Work on a copy so re-running the cell (or reusing df_sorted elsewhere)
    # always starts from the caller's untouched data.
    sales = df_sorted.copy()

    # Handle null values in BRAND column
    if sales['BRAND'].isnull().any():
        if isinstance(sales['BRAND'].dtype, pd.CategoricalDtype):
            # A categorical column rejects fillna with an unknown category,
            # and add_categories raises if the category is already present.
            if 'UNKNOWN_BRAND' not in sales['BRAND'].cat.categories:
                sales['BRAND'] = sales['BRAND'].cat.add_categories(['UNKNOWN_BRAND'])
        sales['BRAND'] = sales['BRAND'].fillna('UNKNOWN_BRAND')

    # Create PRODUCT_KEY if not exists
    if 'PRODUCT_KEY' not in sales.columns:
        sales['PRODUCT_KEY'] = (
            sales['BRAND'].astype(str) + ' | ' +
            sales['MH_SEGMENT'].astype(str) + ' | ' +
            sales['MH_FAMILY'].astype(str) + ' | ' +
            sales['MH_CLASS'].astype(str) + ' | ' +
            sales['MH_BRICK'].astype(str)
        )

    print("   • Selecting top performing products...")

    # Select top products by total sales
    top_products = (sales.groupby('PRODUCT_KEY')['TOTAL_NET_SALES'].sum()
                    .nlargest(top_n_products).index.tolist())

    # Filter data for selected products
    filtered_df = sales[sales['PRODUCT_KEY'].isin(top_products)]

    print("   • Aggregating daily sales...")

    # Aggregate daily sales by product
    daily_sales = filtered_df.groupby(['BILLING_DATE', 'PRODUCT_KEY']).agg({
        'TOTAL_NET_SALES': 'sum',
        'TOTAL_BILLING_QTY': 'sum'
    }).reset_index()

    # Convert to TimeGPT format
    timegpt_data = daily_sales.rename(columns={
        'BILLING_DATE': 'ds',
        'TOTAL_NET_SALES': 'y',
        'PRODUCT_KEY': 'unique_id'
    })

    # Ensure proper data types
    timegpt_data['ds'] = pd.to_datetime(timegpt_data['ds'])
    timegpt_data['y'] = pd.to_numeric(timegpt_data['y'], errors='coerce')

    # Remove any rows with null sales values
    timegpt_data = timegpt_data.dropna(subset=['y'])

    print(f"   ✅ Data prepared: {len(timegpt_data)} rows, {timegpt_data['unique_id'].nunique()} products")

    return timegpt_data

def fix_frequency_issues_complete(data, min_observations=30):
    """
    Rebuild every product series on a gap-free daily calendar.

    For each ``unique_id`` the rows are sorted by ``ds``, duplicate dates are
    dropped (first kept), and the series is re-indexed onto every calendar
    day between its first and last observation. Missing ``y`` values are
    linearly interpolated, any remaining NaN becomes 0, and negatives are
    clipped to 0. Only ``y`` is filled: other columns carried through the
    left merge stay NaN on the inserted dates.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with ``unique_id``, ``ds`` (datetime) and ``y``.
    min_observations : int, default 30
        Products with fewer distinct observed dates than this are skipped.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the repaired series, with a fresh integer index.

    Raises
    ------
    ValueError
        If no product yields a series whose inferred frequency is daily.
    """
    print("   • Diagnosing frequency issues...")

    cleaned_data = []
    successful_products = []

    for product_id, group in data.groupby('unique_id'):
        # Sort and remove duplicates
        group = group.sort_values('ds').drop_duplicates(subset=['ds'], keep='first')

        # Skip products with insufficient data
        if len(group) < min_observations:
            continue

        # Complete daily calendar spanning the observed period
        date_range = pd.date_range(
            start=group['ds'].min(),
            end=group['ds'].max(),
            freq='D'
        )
        complete_ts = pd.DataFrame({
            'ds': date_range,
            'unique_id': product_id
        })

        # Left merge keeps every calendar day; unobserved days get NaN
        merged = pd.merge(complete_ts, group, on=['ds', 'unique_id'], how='left')

        # Fill missing values with interpolation
        merged['y'] = merged['y'].interpolate(method='linear').fillna(0)
        merged['y'] = merged['y'].clip(lower=0)  # Ensure no negative sales

        # Keep the series only if pandas confirms it is strictly daily
        if pd.infer_freq(merged['ds']) == 'D':
            cleaned_data.append(merged)
            successful_products.append(product_id)

    if not cleaned_data:
        raise ValueError("No products could be frequency-corrected")

    final_data = pd.concat(cleaned_data, ignore_index=True)
    print(f"   ✅ Fixed frequency for {len(successful_products)} products")
    print(f"   📊 Final dataset: {len(final_data)} rows")
    return final_data

def create_train_test_split(data, test_days=14):
    """
    Create train-test split by holding out the last rows of every series.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with a ``unique_id`` column; rows of each product
        are assumed to already be in chronological order.
    test_days : int, default 14
        Number of trailing rows per ``unique_id`` placed in the test set.
        ``0`` keeps everything for training.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        ``train_data`` is ordered by ``unique_id`` (original order within each
        product) with a fresh integer index; ``test_data`` keeps the original
        row order and index.

    Raises
    ------
    ValueError
        If ``test_days`` is negative.
    """
    if test_days < 0:
        raise ValueError("test_days must be non-negative")

    print(f"   • Splitting data with {test_days} test days...")

    # Position of each row counted from the end of its own series (0 = last).
    # A boolean mask avoids groupby.apply and also avoids `iloc[:-0]`, which
    # returns an empty frame when test_days is 0.
    rows_from_end = data.groupby("unique_id").cumcount(ascending=False)
    in_test = rows_from_end < test_days

    test_data = data[in_test].copy()
    train_data = (data[~in_test]
                  .sort_values("unique_id", kind="mergesort")  # stable: keeps time order
                  .reset_index(drop=True))

    print(f"   ✅ Train: {len(train_data)} rows, Test: {len(test_data)} rows")

    return train_data, test_data

def run_timegpt_forecasting(train_data, test_periods=7):
    """
    Run TimeGPT forecasting on the training series.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Long-format frame with ``unique_id``, ``ds`` and ``y``.
    test_periods : int, default 7
        Requested forecast horizon in days; capped at 14.

    Returns
    -------
    The object returned by ``timegpt.forecast``, either for all series or,
    if that call raised, for the five ``unique_id`` values with the most rows.

    Raises
    ------
    Exception
        Whatever the fallback ``timegpt.forecast`` call raises; only the
        first attempt is guarded.
    """
    print(f"   • Running TimeGPT forecast for {test_periods} periods...")

    # The same settings are used for the primary and the fallback call, so
    # both produce forecasts of identical shape per series.
    forecast_options = dict(
        h=min(test_periods, 14),
        freq='D',
        level=[80, 90],  # Confidence intervals
        time_col='ds',
        target_col='y',
        id_col='unique_id',
    )

    try:
        forecasts = timegpt.forecast(df=train_data, **forecast_options)

        print(f"   ✅ Forecasts generated: {len(forecasts)} rows")
        return forecasts

    except Exception as e:
        print(f"   ❌ TimeGPT error: {e}")

        # Fallback: Try with fewer products
        print("   🔄 Trying with top 5 products...")
        top_5_products = train_data['unique_id'].value_counts().head(5).index.tolist()
        small_train = train_data[train_data['unique_id'].isin(top_5_products)]

        # No X_df is passed: this notebook builds no exogenous frame, and the
        # previous reference to an undefined `x_data` raised NameError here.
        forecasts = timegpt.forecast(df=small_train, **forecast_options)

        print(f"   ✅ Fallback forecasts generated: {len(forecasts)} rows")
        return forecasts

def _product_error_metrics(product_id, actual_values, predicted_values):
    """Compute the error metrics of one product from aligned float arrays."""
    abs_errors = np.abs(actual_values - predicted_values)

    # Standard metrics
    mse = np.mean(abs_errors ** 2)
    mae = np.mean(abs_errors)
    rmse = np.sqrt(mse)

    # MAPE is computed only over points whose actual value is non-zero
    non_zero_mask = actual_values != 0
    effective_points = non_zero_mask.sum()
    if effective_points > 0:
        mape = np.mean(abs_errors[non_zero_mask] / np.abs(actual_values[non_zero_mask])) * 100
    else:
        mape = np.inf  # undefined when every actual is zero

    # sMAPE: a point where actual and forecast are both zero is a perfect
    # prediction and contributes 0 instead of 0/0 = NaN.
    denominators = np.abs(actual_values) + np.abs(predicted_values)
    smape_terms = np.divide(
        2 * abs_errors,
        denominators,
        out=np.zeros(denominators.shape, dtype=float),
        where=denominators != 0,
    )
    smape = np.mean(smape_terms) * 100

    # WAPE: guard on the same quantity that is used as the denominator
    total_abs_actual = np.sum(np.abs(actual_values))
    if total_abs_actual != 0:
        wape = np.sum(abs_errors) / total_abs_actual * 100
    else:
        wape = np.inf

    return {
        'product': product_id,
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        'mape': mape,
        'smape': smape,  # Symmetric MAPE - better for zeros
        'wape': wape,    # Weighted APE - alternative metric
        'data_points': len(actual_values),
        'effective_points': effective_points,  # Points used in MAPE calculation
        'zero_values': (actual_values == 0).sum()
    }


def evaluate_forecasts(forecasts, test_data):
    """
    Evaluate forecast performance against held-out actuals.

    Forecasts and actuals are inner-joined on ``ds`` and ``unique_id``, so
    only dates present in both frames are scored.

    Parameters
    ----------
    forecasts : pandas.DataFrame
        Must contain ``ds``, ``unique_id`` and ``TimeGPT``.
    test_data : pandas.DataFrame
        Must contain ``ds``, ``unique_id`` and ``y``.

    Side effects
    ------------
    If a frame's ``ds`` column is not already datetime64 it is converted in
    place, so later consumers of the same frame see real dates.

    Returns
    -------
    dict or None
        ``None`` when the join is empty. Otherwise a dict with
        ``product_metrics`` (one row per product: mse, mae, rmse, mape, smape,
        wape and point counts), ``overall_stats`` (means across products;
        MAPE and WAPE averaged over finite values only) and
        ``evaluation_data`` (the joined frame).
    """
    print("   • Calculating performance metrics...")

    # Fix datetime types (only when needed, to avoid pointless writes)
    for frame in (forecasts, test_data):
        if not pd.api.types.is_datetime64_any_dtype(frame['ds']):
            frame['ds'] = pd.to_datetime(frame['ds'])

    # Merge forecasts with test data
    evaluation_data = pd.merge(
        test_data,
        forecasts[['ds', 'unique_id', 'TimeGPT']],
        on=['ds', 'unique_id'],
        how='inner'
    )

    if len(evaluation_data) == 0:
        print("   ❌ No matching data for evaluation")
        return None

    # Check for zero values in actual data
    zero_count = (evaluation_data['y'] == 0).sum()
    near_zero_count = (np.abs(evaluation_data['y']) < 1e-8).sum()

    print(f"   📊 Data quality check:")
    print(f"      • Zero values in actual data: {zero_count}")
    print(f"      • Near-zero values: {near_zero_count}")

    # Calculate metrics by product, in order of first appearance
    product_metrics = [
        _product_error_metrics(
            product_id,
            product_eval['y'].to_numpy(dtype=float),
            product_eval['TimeGPT'].to_numpy(dtype=float),
        )
        for product_id, product_eval in evaluation_data.groupby('unique_id', sort=False)
    ]
    metrics_df = pd.DataFrame(product_metrics)

    # Average MAPE only over products where it is defined
    finite_mape_mask = np.isfinite(metrics_df['mape'])

    if finite_mape_mask.sum() > 0:
        avg_mape = metrics_df.loc[finite_mape_mask, 'mape'].mean()
        mape_note = f"(calculated from {finite_mape_mask.sum()}/{len(metrics_df)} products)"
    else:
        avg_mape = np.inf
        mape_note = "(all products have zero values - use sMAPE instead)"

    overall_stats = {
        'avg_mse': metrics_df['mse'].mean(),
        'avg_mae': metrics_df['mae'].mean(),
        'avg_rmse': metrics_df['rmse'].mean(),
        'avg_mape': avg_mape,
        'avg_smape': metrics_df['smape'].mean(),  # More reliable alternative
        'avg_wape': metrics_df.loc[np.isfinite(metrics_df['wape']), 'wape'].mean(),
        'total_products': len(metrics_df),
        'total_points': len(evaluation_data),
        'products_with_zeros': (metrics_df['zero_values'] > 0).sum()
    }

    print(f"   ✅ Evaluation complete: {len(metrics_df)} products evaluated")
    print(f"   📊 Average MAE: {overall_stats['avg_mae']:.2f}")
    print(f"   📊 Average RMSE: {overall_stats['avg_rmse']:.2f}")
    print(f"   📊 Average sMAPE: {overall_stats['avg_smape']:.2f}% (recommended)")
    print(f"   📊 Average MAPE: {overall_stats['avg_mape']:.2f}% {mape_note}")
    print(f"   ⚠️  Products with zero values: {overall_stats['products_with_zeros']}")

    return {
        'product_metrics': metrics_df,
        'overall_stats': overall_stats,
        'evaluation_data': evaluation_data
    }


def create_comprehensive_visualizations(full_data, forecasts, test_data, evaluation_results):
    """
    Plot forecasts against the hold-out, preferring the TimeGPT client's plot.

    Does nothing beyond printing a notice when ``evaluation_results`` is
    ``None``. Otherwise ``timegpt.plot`` is tried first; if it raises, the
    error is reported and ``create_custom_visualization`` is used instead.
    """
    print("   • Creating TimeGPT visualization...")

    if evaluation_results is None:
        print("   ❌ No evaluation data for visualization")
        return

    # Options for the client's own plotting routine.
    native_plot_options = {
        "models": ["TimeGPT"],
        "level": [90],
        "time_col": "ds",
        "target_col": "y",
        "id_col": "unique_id",
        # presumably limits the displayed history to the last 60 rows — confirm
        "max_insample_length": 60,
    }

    try:
        timegpt.plot(test_data, forecasts, **native_plot_options)
        print("   ✅ TimeGPT native plot created")
    except Exception as plot_error:
        print(f"   ⚠️ TimeGPT plot failed: {plot_error}")
        print("   🔄 Creating custom matplotlib visualization...")

        # Fall back to the hand-built matplotlib figure
        create_custom_visualization(full_data, forecasts, test_data, evaluation_results)

def create_custom_visualization(full_data, forecasts, test_data, evaluation_results):
    """
    Create custom matplotlib visualization for the best-forecast products.

    Draws one panel for each of the (up to) three products with the lowest
    MAPE: the 30 days of history before the test window, the actual test
    values, the TimeGPT predictions and, when present, the 90% interval.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Complete series (``unique_id``, ``ds``, ``y``), including the test window.
    forecasts : pandas.DataFrame
        Forecast rows with ``unique_id``, ``ds``, ``TimeGPT`` and optionally
        ``TimeGPT-lo-90`` / ``TimeGPT-hi-90``.
    test_data : pandas.DataFrame
        Unused here; kept so the signature matches the caller.
    evaluation_results : dict
        Output of ``evaluate_forecasts`` (keys ``evaluation_data`` and
        ``product_metrics``).
    """
    evaluation_data = evaluation_results['evaluation_data']
    product_metrics = evaluation_results['product_metrics']

    # Get top 3 products for plotting
    top_products = product_metrics.nsmallest(3, 'mape')['product'].tolist()
    if not top_products:
        # plt.subplots(0, 1) would raise; there is simply nothing to draw.
        print("   ❌ No products available for custom visualization")
        return

    # squeeze=False always yields a 2-D array, so one panel needs no special case
    fig, axes = plt.subplots(len(top_products), 1,
                             figsize=(15, 5 * len(top_products)), squeeze=False)

    for ax, product_id in zip(axes[:, 0], top_products):
        # Get data for this product
        product_history = full_data[full_data['unique_id'] == product_id]
        product_test = evaluation_data[evaluation_data['unique_id'] == product_id]
        product_pred = forecasts[forecasts['unique_id'] == product_id]

        # Get metrics
        metrics = product_metrics[product_metrics['product'] == product_id].iloc[0]

        # Find split point; without test rows fall back to the end of history
        # so the context window is always defined for this product.
        if len(product_test) > 0:
            test_start = product_test['ds'].min()
        else:
            test_start = product_history['ds'].max() + pd.Timedelta(days=1)

        # Only the 30 days strictly before the test window count as training
        # context; full_data also contains the test period itself.
        context_start = test_start - pd.Timedelta(days=30)
        train_plot = product_history[
            (product_history['ds'] >= context_start) & (product_history['ds'] < test_start)
        ]

        # Plot training data (last 30 days)
        ax.plot(train_plot['ds'], train_plot['y'], 'b-', linewidth=2,
                label='Training Data', alpha=0.7)

        # Plot actual test data
        ax.plot(product_test['ds'], product_test['y'], 'go-', linewidth=3,
                markersize=8, label='Actual Test Data')

        # Plot predictions
        ax.plot(product_pred['ds'], product_pred['TimeGPT'], 'r--', linewidth=3,
                marker='s', markersize=8, label='TimeGPT Predictions')

        # Add confidence intervals if available
        if 'TimeGPT-lo-90' in product_pred.columns:
            ax.fill_between(product_pred['ds'],
                            product_pred['TimeGPT-lo-90'],
                            product_pred['TimeGPT-hi-90'],
                            color='red', alpha=0.2, label='90% Confidence')

        # Formatting
        title = f'Product: {product_id[:50]}...\n'
        title += f'MAPE: {metrics["mape"]:.1f}% | MAE: {metrics["mae"]:.1f}'

        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('Date')
        ax.set_ylabel('Sales')
        ax.legend()
        ax.grid(True, alpha=0.3)

        # Format dates
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d'))
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    plt.suptitle('🤖 TimeGPT Forecasting Results', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    print("   ✅ Custom visualization created")

def generate_future_forecasts(data, horizon=14):
    """
    Forecast ``horizon`` days beyond the end of ``data`` and print a summary.

    Returns the frame produced by ``timegpt.forecast``. Any exception is
    reported on stdout and ``None`` is returned instead.
    """
    print(f"   • Generating {horizon}-day future forecasts...")

    try:
        future_forecasts = timegpt.forecast(
            df=data, h=horizon, freq='D', level=[80, 90],
            time_col='ds', target_col='y', id_col='unique_id',
        )

        # Per-product mean and total of the point forecast, two decimals
        per_product = (
            future_forecasts.groupby('unique_id')['TimeGPT']
            .agg(['mean', 'sum'])
            .round(2)
        )
        per_product.columns = ['avg_daily', 'total_forecast']
        ranked = per_product.sort_values('total_forecast', ascending=False)

        print(f"   ✅ Future forecasts generated")
        print(f"   📊 Top 5 products by projected sales:")
        print(ranked.head())

        return future_forecasts

    except Exception as e:
        print(f"   ❌ Future forecast error: {e}")
        return None

# Main execution function
def run_complete_pipeline(df_sorted):
    """
    Run the complete TimeGPT pipeline and print a short summary.

    Parameters
    ----------
    df_sorted : pandas.DataFrame
        Raw billing rows, forwarded to ``complete_timegpt_pipeline`` with
        ``top_n_products=10``.

    Returns
    -------
    dict or None
        The pipeline's result dict, or ``None`` if any step raised. On
        failure the message is printed and the full traceback is written to
        stderr so the failing step can be located.
    """
    import traceback

    try:
        results = complete_timegpt_pipeline(df_sorted, top_n_products=10)

        print("\n🎉 PIPELINE COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        print("📋 Results Summary:")
        print(f"   • Products processed: {results['cleaned_data']['unique_id'].nunique()}")
        print(f"   • Training data points: {len(results['train_data'])}")
        print(f"   • Test data points: {len(results['test_data'])}")

        # 'evaluation' is None when no forecast row matched the test data
        if results['evaluation']:
            print(f"   • Average MAPE: {results['evaluation']['overall_stats']['avg_mape']:.2f}%")

        return results

    except Exception as e:
        print(f"❌ Pipeline failed: {e}")
        # The message alone does not say which step failed; keep the traceback.
        traceback.print_exc()
        return None

# Execute the pipeline on the date-sorted frame; `results` is None if it failed.
results = run_complete_pipeline(df_sorted)


🚀 STARTING COMPLETE TIMEGPT PIPELINE

📋 STEP 1: DATA CLEANING AND PREPARATION
   • Handling null values...
   • Selecting top performing products...
   • Aggregating daily sales...


INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...


   ✅ Data prepared: 7793 rows, 10 products

🔧 STEP 2: FIXING FREQUENCY ISSUES
   • Diagnosing frequency issues...
   ✅ Fixed frequency for 10 products
   📊 Final dataset: 7918 rows

✂️ STEP 3: TRAIN-TEST SPLIT
   • Splitting data with 14 test days...
   ✅ Train: 7778 rows, Test: 140 rows

🤖 STEP 4: TIMEGPT FORECASTING
   • Running TimeGPT forecast for 7 periods...


INFO:nixtlats.nixtla_client:Restricting input...
INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...
INFO:nixtlats.nixtla_client:Validating inputs...
INFO:nixtlats.nixtla_client:Preprocessing dataframes...


   ✅ Forecasts generated: 70 rows

📊 STEP 5: MODEL EVALUATION
   • Calculating performance metrics...
   📊 Data quality check:
      • Zero values in actual data: 1
      • Near-zero values: 1
   ✅ Evaluation complete: 10 products evaluated
   📊 Average MAE: 5055.10
   📊 Average RMSE: 6131.99
   📊 Average sMAPE: 32.00% (recommended)
   📊 Average MAPE: 41.50% (calculated from 10/10 products)
   ⚠️  Products with zero values: 1

📈 STEP 6: CREATING VISUALIZATIONS
   • Creating TimeGPT visualization...
   ✅ TimeGPT native plot created

🔮 STEP 7: FUTURE FORECASTING
   • Generating 14-day future forecasts...


INFO:nixtlats.nixtla_client:Restricting input...
INFO:nixtlats.nixtla_client:Calling Forecast Endpoint...


   ✅ Future forecasts generated
   📊 Top 5 products by projected sales:
                                                    avg_daily  total_forecast
unique_id                                                                    
F085 | WOMENS WEAR | ETHNIC WEAR | TOPWEAR | KU...   69931.85       979045.94
04042 | MENS WEAR | SMART CASUALS | TOPS | SHIRTS    37320.84       522491.72
04063 | MENS CASUAL | ACTIVE WEAR | BOTTOMS | T...   19227.32       269182.53
04042 | MENS WEAR | SMART CASUALS | BOTTOMS | T...   14836.29       207708.04
04063 | MENS CASUAL | ACTIVE WEAR | TOPS | T SH...   14433.36       202067.07

🎉 PIPELINE COMPLETED SUCCESSFULLY!
📋 Results Summary:
   • Products processed: 10
   • Training data points: 7778
   • Test data points: 140
   • Average MAPE: 41.50%


In [3]:
if results:
    # Unpack the pipeline outputs into notebook-level names for later cells.
    cleaned_data, forecasts, evaluation, future_forecasts = (
        results[key]
        for key in ('cleaned_data', 'forecasts', 'evaluation', 'future_forecasts')
    )

    # Report the headline accuracy figure when an evaluation was produced.
    if evaluation:
        print(f"Model Performance: {evaluation['overall_stats']['avg_mape']:.1f}% MAPE")

NameError: name 'results' is not defined