# **Drive Access**

In [None]:
# Attach Google Drive to the Colab runtime, then derive the project paths from it.
from google.colab import drive

drive.mount('/content/drive')

# Project root on the mounted drive; outputs written later in the notebook go here.
FOLDERNAME = '/content/drive/MyDrive/ML_final_project'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# The trailing slash matters: other cells build file paths as DATAPATH + '<name>.csv'.
DATAPATH = FOLDERNAME + '/data/'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import shift
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")
!pip install dagshub mlflow prophet -q

In [None]:
import dagshub
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature
from prophet import Prophet

# **MLflow Setup**

In [None]:
# Point MLflow at the DagsHub-hosted tracking server for this repository.
dagshub.init(repo_owner='kechik21', repo_name='ML_Final_Project', mlflow=True)
mlflow.set_tracking_uri('https://dagshub.com/kechik21/ML_Final_Project.mlflow')

experiment_name = "Prophet_Training"

# Look the experiment up first and create it only when it is missing.
# Treating *any* MlflowException from create_experiment() as "already exists"
# would hide authentication/network failures behind an AttributeError on None.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    print(f"Created new experiment: {experiment_name}")
else:
    experiment_id = experiment.experiment_id
    print(f"Using existing experiment: {experiment_name}")

# Make it the active experiment for every mlflow.start_run() that follows.
mlflow.set_experiment(experiment_name)

print(" MLflow setup complete!")
print(" Your experiments will be visible at:")
print("   https://dagshub.com/kechik21/ML_Final_Project")
print(f" Current experiment: {experiment_name}")

# **Data Loading and Exploration**

In [None]:
with mlflow.start_run(run_name="Prophet_Data_Loading"):
    print("Starting Data Loading and Initial Exploration...")

    # Read the four raw CSV tables from the data folder, in a fixed order.
    print("Loading datasets...")
    train_df, test_df, features_df, stores_df = (
        pd.read_csv(DATAPATH + csv_name)
        for csv_name in ('train.csv', 'test.csv', 'features.csv', 'stores.csv')
    )

    # Record table shapes and basic coverage of the data as run parameters.
    loading_params = {
        "train_shape": train_df.shape,
        "test_shape": test_df.shape,
        "features_shape": features_df.shape,
        "stores_shape": stores_df.shape,
        "unique_stores": train_df['Store'].nunique(),
        "unique_departments": train_df['Dept'].nunique(),
        "date_range_train": f"{train_df['Date'].min()} to {train_df['Date'].max()}",
        "date_range_test": f"{test_df['Date'].min()} to {test_df['Date'].max()}",
    }
    for param_name, param_value in loading_params.items():
        mlflow.log_param(param_name, param_value)

    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    print(f"Number of stores: {train_df['Store'].nunique()}")
    print(f"Number of departments: {train_df['Dept'].nunique()}")

    # Small text artifact marking that this stage finished.
    mlflow.log_text("Data loading completed successfully", "data_loading_status.txt")

# **Data Processing**

In [None]:
with mlflow.start_run(run_name="Prophet_Data_Preprocessing"):
    print("\nStarting Data Preprocessing...")

    # Enrich the sales rows with store metadata, then with the per-date features.
    # validate='many_to_one' makes pandas raise if a lookup table has duplicate
    # keys, instead of silently multiplying rows (which would break the row
    # correspondence between `test` and the predictions generated from it).
    merged_1 = train_df.merge(stores_df, on=['Store'], how='left', validate='many_to_one')
    train = merged_1.merge(
        features_df, on=['Store', 'Date', 'IsHoliday'], how='left', validate='many_to_one'
    )

    merged_test = test_df.merge(stores_df, on=['Store'], how='left', validate='many_to_one')
    test = merged_test.merge(
        features_df, on=['Store', 'Date', 'IsHoliday'], how='left', validate='many_to_one'
    )

    # Expose the target under the name `sales` (appended as the last column).
    train['sales'] = train['Weekly_Sales']
    train = train.drop('Weekly_Sales', axis=1)

    mlflow.log_param("merge_strategy", "left_join")
    mlflow.log_param("sales_column_renamed", True)
    mlflow.log_param("missing_values_before", train.isnull().sum().sum())

    # Forward-fill, then back-fill whatever is still missing at the top.
    # DataFrame.ffill()/bfill() replace fillna(method=...), which is deprecated
    # in recent pandas releases.
    # NOTE(review): the fill runs over the whole frame in row order, so values
    # can carry across store/department boundaries - confirm this is intended.
    train = train.ffill().bfill()
    test = test.ffill().bfill()

    mlflow.log_param("missing_values_after", train.isnull().sum().sum())
    mlflow.log_param("preprocessing_complete", True)

    print("Data preprocessing completed")
    print(f"Final train shape: {train.shape}")
    print(f"Final test shape: {test.shape}")

# **Model Development and Validation**

In [None]:
with mlflow.start_run(run_name="Prophet_Model_Development"):
    print("\nStarting Prophet Model Development...")

    # A single store/department series is used to prototype the model.
    SAMPLE_STORE = 4
    SAMPLE_DEPT = 14

    mlflow.log_param("sample_store", SAMPLE_STORE)
    mlflow.log_param("sample_department", SAMPLE_DEPT)

    sample_data = train.loc[(train['Store'] == SAMPLE_STORE) & (train['Dept'] == SAMPLE_DEPT)]

    # Prophet reads the timestamp from `ds` and the target from `y`.
    prophet_data = pd.DataFrame({
        'ds': pd.to_datetime(sample_data['Date']),
        'y': sample_data['sales'],
    }).sort_values('ds')

    mlflow.log_param("sample_data_points", len(prophet_data))
    mlflow.log_param("date_range_sample", f"{prophet_data['ds'].min()} to {prophet_data['ds'].max()}")

    print(f"Sample data shape: {prophet_data.shape}")
    print(f"Date range: {prophet_data['ds'].min()} to {prophet_data['ds'].max()}")

    # Chronological 80/20 split: the first 80% of the weeks train, the rest validate.
    split_idx = int(len(prophet_data) * 0.8)
    train_prophet = prophet_data.iloc[:split_idx].copy()
    val_prophet = prophet_data.iloc[split_idx:].copy()

    mlflow.log_param("train_split_ratio", 0.8)
    mlflow.log_param("train_points", len(train_prophet))
    mlflow.log_param("validation_points", len(val_prophet))

    # Baseline Prophet settings for the prototype model.
    prophet_params = {
        'weekly_seasonality': True,
        'daily_seasonality': True,
        'yearly_seasonality': 'auto',
        'seasonality_mode': 'additive',
        'interval_width': 0.95
    }

    mlflow.log_params(prophet_params)

    print("Training Prophet model...")
    model = Prophet(**prophet_params)
    model.fit(train_prophet)

    # Forecast exactly the held-out dates.
    val_dates = val_prophet[['ds']].copy()
    val_forecast = model.predict(val_dates)

    # Validation errors on the held-out tail.
    val_rmse = np.sqrt(mse(val_prophet['y'], val_forecast['yhat']))
    val_mae = mae(val_prophet['y'], val_forecast['yhat'])

    # MAPE divides by the actual value, so it is computed on non-zero actuals only.
    actual_values = val_prophet['y'].values
    predicted_values = val_forecast['yhat'].values

    non_zero_mask = actual_values != 0
    if non_zero_mask.any():
        relative_errors = (
            (actual_values[non_zero_mask] - predicted_values[non_zero_mask])
            / actual_values[non_zero_mask]
        )
        val_mape = np.mean(np.abs(relative_errors)) * 100
    else:
        # No usable denominator at all: report an infinite error.
        val_mape = float('inf')

    for metric_name, metric_value in (
        ("validation_rmse", val_rmse),
        ("validation_mae", val_mae),
        ("validation_mape", val_mape),
    ):
        mlflow.log_metric(metric_name, metric_value)

    print(f"Validation RMSE: {val_rmse:.2f}")
    print(f"Validation MAE: {val_mae:.2f}")
    print(f"Validation MAPE: {val_mape:.2f}%")

# **CV & Hyperparams**

In [None]:
with mlflow.start_run(run_name="Prophet_Cross_Validation"):
    print("\nStarting Cross-Validation and Hyperparameter Tuning...")

    # Candidate seasonality settings. Each one is fitted on `train_prophet` and
    # scored on the same held-out `val_prophet` window.
    configurations = [
        {'weekly_seasonality': True, 'daily_seasonality': False, 'yearly_seasonality': 'auto'},
        {'weekly_seasonality': True, 'daily_seasonality': True, 'yearly_seasonality': 'auto'},
        {'weekly_seasonality': 15, 'daily_seasonality': False, 'yearly_seasonality': 'auto'},
        {'weekly_seasonality': True, 'daily_seasonality': True, 'yearly_seasonality': 15}
    ]

    best_rmse = float('inf')
    best_config = None
    cv_results = []

    total_configs = len(configurations)
    for config_number, config in enumerate(configurations, start=1):
        print(f"Testing configuration {config_number}/{total_configs}: {config}")

        # One nested MLflow run per candidate keeps its params and metrics together.
        with mlflow.start_run(run_name=f"Prophet_Config_{config_number}", nested=True):
            mlflow.log_params(config)

            model_cv = Prophet(**config)
            model_cv.fit(train_prophet)
            forecast_cv = model_cv.predict(val_dates)

            rmse_cv = np.sqrt(mse(val_prophet['y'], forecast_cv['yhat']))
            mae_cv = mae(val_prophet['y'], forecast_cv['yhat'])

            for metric_name, metric_value in (("cv_rmse", rmse_cv), ("cv_mae", mae_cv)):
                mlflow.log_metric(metric_name, metric_value)

            cv_results.append(dict(config=config, rmse=rmse_cv, mae=mae_cv))

            # Strictly-lower RMSE wins, so the earliest candidate is kept on ties.
            if rmse_cv < best_rmse:
                best_rmse, best_config = rmse_cv, config

            print(f"  RMSE: {rmse_cv:.2f}, MAE: {mae_cv:.2f}")

    # Log the winning configuration on the parent run.
    best_params = {}
    for key, value in best_config.items():
        best_params["best_" + key] = value
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_rmse", best_rmse)

    print(f"\nBest configuration: {best_config}")
    print(f"Best RMSE: {best_rmse:.2f}")

# **Training and full Pipeline**

In [None]:

banner = "=" * 60
print("\n" + banner)
print(" FULL PROPHET TRAINING - ALL STORE-DEPARTMENT COMBINATIONS")
print(banner)

with mlflow.start_run(run_name="Prophet_Full_Real_Training"):
    print("Starting Full Prophet Training for ALL store-department combinations...")

    # Training scope: the full cross product of stores and departments seen in `train`.
    all_stores = sorted(train['Store'].unique())
    all_depts = sorted(train['Dept'].unique())
    total_combinations = len(all_stores) * len(all_depts)

    print(f"Training scope: {len(all_stores)} stores × {len(all_depts)} departments")
    print(f"Total models to train: {total_combinations}")
    print("This will train Prophet models for ALL store-department combinations")
    print(banner)

    # Describe the run before the long training loop starts.
    for param_name, param_value in (
        ("training_mode", "FULL"),
        ("num_stores", len(all_stores)),
        ("num_departments", len(all_depts)),
        ("total_models", total_combinations),
        ("prophet_config", best_config),
    ):
        mlflow.log_param(param_name, param_value)



    # P i p e l i n e
    class RealProphetPipeline:
        """Train one Prophet model per (store, department) pair and forecast with them.

        ``fit`` walks the cross product of the requested stores and departments,
        trains a model for every pair with enough rows, and records mean sales per
        pair / department / store / overall. ``predict`` forecasts every test row
        with the model of its pair and uses those means when no usable model exists.

        Attributes:
            prophet_params: keyword arguments passed to every ``Prophet(...)``.
            models: ``{(store, dept): {'model', 'training_data_points', 'date_range'}}``.
            training_stats: counters, elapsed seconds and the list of failed pairs.
            fallback_stats: mean-sales lookup built by ``fit`` (``None`` before it).
        """

        # Pairs with fewer training rows than this are skipped.
        MIN_TRAINING_POINTS = 10
        # A progress line is printed each time this many test rows are predicted.
        PROGRESS_EVERY = 10000

        def __init__(self, prophet_params):
            """Store the Prophet keyword arguments and start with empty state."""
            self.prophet_params = prophet_params
            self.models = {}
            self.training_stats = {
                'models_trained': 0,
                'models_failed': 0,
                'total_training_time': 0,
                'failed_combinations': []
            }
            self.fallback_stats = None

        def fit(self, train_data, stores_to_train, depts_to_train):
            """Train a Prophet model for every store/department pair.

            Args:
                train_data: DataFrame with ``Store``, ``Dept``, ``Date`` and ``sales``.
                stores_to_train: iterable of store identifiers.
                depts_to_train: iterable of department identifiers.

            Returns:
                ``self``. A pair that is skipped or raises is recorded in
                ``training_stats['failed_combinations']`` and does not stop the loop.
            """
            import logging
            import time

            # Silence per-fit chatter once, up front. Prophet >= 1.1 fits through
            # cmdstanpy, which logs each chain at INFO level.
            logging.getLogger('prophet').setLevel(logging.WARNING)
            logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

            # Mean sales per level, used by predict() when a pair has no usable model.
            self.fallback_stats = self._compute_fallback_stats(train_data)

            total_combinations = len(stores_to_train) * len(depts_to_train)
            start_time = time.time()

            print(f" Training {total_combinations} Prophet models...")

            for i, store in enumerate(stores_to_train):
                store_start_time = time.time()
                store_models_trained = 0
                store_models_failed = 0

                for dept in depts_to_train:
                    try:
                        # Rows of this store-department combination.
                        mask = (train_data['Store'] == store) & (train_data['Dept'] == dept)
                        data = train_data.loc[mask, ['sales', 'Date']]

                        # Too few points: skip the pair and count it as failed,
                        # in both the global and the per-store counter.
                        if len(data) < self.MIN_TRAINING_POINTS:
                            print(f"XXXXXXX  Skipping Store {store}, Dept {dept}: insufficient data ({len(data)} points)")
                            self.training_stats['failed_combinations'].append((store, dept, "insufficient_data"))
                            self.training_stats['models_failed'] += 1
                            store_models_failed += 1
                            continue

                        # Prophet input format: `ds` timestamps and `y` targets.
                        prophet_df = pd.DataFrame({
                            'ds': pd.to_datetime(data['Date']),
                            'y': data['sales']
                        }).sort_values('ds').reset_index(drop=True)

                        # Keep a single row per date.
                        prophet_df = prophet_df.drop_duplicates('ds').reset_index(drop=True)

                        model = Prophet(**self.prophet_params)
                        model.fit(prophet_df)

                        self.models[(store, dept)] = {
                            'model': model,
                            'training_data_points': len(prophet_df),
                            'date_range': (prophet_df['ds'].min(), prophet_df['ds'].max())
                        }

                        self.training_stats['models_trained'] += 1
                        store_models_trained += 1

                    except Exception as e:
                        # Best effort: one bad series must not abort the whole run.
                        print(f"X Failed Store {store}, Dept {dept}: {str(e)}")
                        self.training_stats['failed_combinations'].append((store, dept, str(e)))
                        self.training_stats['models_failed'] += 1
                        store_models_failed += 1

                store_time = time.time() - store_start_time
                elapsed_total = time.time() - start_time
                progress = ((i + 1) / len(stores_to_train)) * 100

                print(f" Store {store} complete: {store_models_trained} trained, {store_models_failed} failed")
                print(f"   Progress: {progress:.1f}% | Store time: {store_time:.1f}s | Total elapsed: {elapsed_total/60:.1f}min")

                # The ETA needs at least two finished stores to be meaningful.
                if i > 0:
                    avg_time_per_store = elapsed_total / (i + 1)
                    remaining_stores = len(stores_to_train) - (i + 1)
                    eta_minutes = (remaining_stores * avg_time_per_store) / 60
                    print(f"   ETA: {eta_minutes:.1f} minutes remaining")

                print("-" * 50)

            self.training_stats['total_training_time'] = time.time() - start_time

            # Guard the ratio so an empty store/department list does not raise.
            if total_combinations:
                success_rate = (self.training_stats['models_trained'] / total_combinations) * 100
            else:
                success_rate = 0.0

            print(f"\n TRAINING COMPLETE!")
            print(f" Models trained: {self.training_stats['models_trained']}")
            print(f" Models failed: {self.training_stats['models_failed']}")
            print(f" Total training time: {self.training_stats['total_training_time']/60:.1f} minutes")
            print(f" Success rate: {success_rate:.1f}%")

            return self

        def predict(self, test_data):
            """Forecast every row of ``test_data``.

            Rows are grouped by (store, department) so each trained model is
            called once for all of its dates instead of once per row.

            Args:
                test_data: DataFrame with ``Store``, ``Dept`` and ``Date`` columns.

            Returns:
                Float ``numpy`` array with one non-negative prediction per row,
                in the row order of ``test_data``.
            """
            print("\n Generating predictions...")

            total_rows = len(test_data)
            predictions = np.zeros(total_rows, dtype=float)
            prediction_stats = {'used_model': 0, 'used_fallback': 0}

            dates = test_data['Date'].to_numpy()

            # Row positions (not index labels) of every pair, in first-seen order.
            rows_by_pair = {}
            pairs = zip(test_data['Store'].tolist(), test_data['Dept'].tolist())
            for position, pair in enumerate(pairs):
                rows_by_pair.setdefault(pair, []).append(position)

            rows_done = 0
            next_report = self.PROGRESS_EVERY

            for (store, dept), positions in rows_by_pair.items():
                pair_dates = dates[positions]
                pair_predictions = None

                model_info = self.models.get((store, dept))
                if model_info is not None:
                    try:
                        pair_predictions = self._predict_dates(model_info['model'], pair_dates)
                    except Exception as e:
                        # Report the failure instead of hiding it, then fall back.
                        print(f"X Prediction failed for Store {store}, Dept {dept}: {str(e)} - using fallback")

                if pair_predictions is None:
                    pair_predictions = np.array(
                        [self._fallback_prediction(store, dept, date, test_data) for date in pair_dates],
                        dtype=float,
                    )
                    prediction_stats['used_fallback'] += len(positions)
                else:
                    prediction_stats['used_model'] += len(positions)

                # Same clamp as max(0, pred): negatives and NaN become 0.
                predictions[positions] = np.where(pair_predictions > 0, pair_predictions, 0.0)

                rows_done += len(positions)
                while rows_done >= next_report:
                    print(f"   Predicted {next_report}/{total_rows} samples...")
                    next_report += self.PROGRESS_EVERY

            print(f"   Predictions complete!")
            print(f"   Used trained models: {prediction_stats['used_model']}")
            print(f"   Used fallback: {prediction_stats['used_fallback']}")

            return predictions

        def _predict_dates(self, model, dates):
            """Return ``yhat`` for each entry of ``dates``, in the given order."""
            ds = pd.DatetimeIndex(pd.to_datetime(dates))
            unique_ds = ds.unique().sort_values()
            forecast = model.predict(pd.DataFrame({'ds': unique_ds}))
            # Map results back through the returned `ds` column, so the answer
            # does not depend on the row order of the forecast frame.
            yhat_by_date = pd.Series(
                forecast['yhat'].to_numpy(dtype=float),
                index=pd.DatetimeIndex(forecast['ds']),
            )
            return yhat_by_date.reindex(ds).to_numpy(dtype=float)

        def _compute_fallback_stats(self, train_data):
            """Build mean-sales lookups per pair, department, store and overall."""
            sales = train_data['sales']
            return {
                'pair': sales.groupby([train_data['Store'], train_data['Dept']]).mean().to_dict(),
                'dept': sales.groupby(train_data['Dept']).mean().to_dict(),
                'store': sales.groupby(train_data['Store']).mean().to_dict(),
                'global': sales.mean(),
            }

        def _fallback_prediction(self, store, dept, date, test_data):
            """Return a prediction for a row that has no usable model.

            Uses the first finite training mean among: the (store, dept) pair,
            the department, the store, all training rows. ``date`` is accepted
            for interface compatibility and is not used.

            When no training statistics exist (``fit`` has not run), the fixed
            constants of the previous implementation are returned. The membership
            tests previously ran against ``test_data`` itself, so for any row taken
            from ``test_data`` the first branch always matched.
            """
            stats = getattr(self, 'fallback_stats', None)
            if stats:
                candidates = (
                    stats.get('pair', {}).get((store, dept)),
                    stats.get('dept', {}).get(dept),
                    stats.get('store', {}).get(store),
                    stats.get('global'),
                )
                for value in candidates:
                    if value is not None and np.isfinite(value):
                        return float(value)

            # Last resort: original constant fallback.
            if (test_data['Store'] == store).any():
                return 1000.0
            elif (test_data['Dept'] == dept).any():
                return 800.0
            else:
                return 500.0




   # ACTUAL TRAINING HAPPENS HERE
    pipeline = RealProphetPipeline(best_config)

    print(" Starting real Prophet training...")
    pipeline.fit(train, all_stores, all_depts)

    # Training summary; the success rate is relative to the full cross product.
    training_stats = pipeline.training_stats
    success_rate = (training_stats['models_trained'] / total_combinations) * 100

    for metric_name, metric_value in (
        ("models_trained", training_stats['models_trained']),
        ("models_failed", training_stats['models_failed']),
        ("training_time_minutes", training_stats['total_training_time'] / 60),
        ("success_rate", success_rate),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Predictions: only the three key columns are handed to the pipeline.
    test_subset = test[['Date', 'Store', 'Dept']].copy()
    final_predictions = pipeline.predict(test_subset)

    # Persist the raw prediction vector and attach it to the run.
    predictions_path = f'{FOLDERNAME}/prophet_real_predictions.npy'
    np.save(predictions_path, final_predictions)
    mlflow.log_artifact(predictions_path, "predictions")

    pred_mean = np.mean(final_predictions)
    pred_std = np.std(final_predictions)
    pred_min = np.min(final_predictions)
    pred_max = np.max(final_predictions)

    for metric_name, metric_value in (
        ("predictions_mean", pred_mean),
        ("predictions_std", pred_std),
        ("predictions_min", pred_min),
        ("predictions_max", pred_max),
        ("total_predictions", len(final_predictions)),
    ):
        mlflow.log_metric(metric_name, metric_value)

    print(f"\n PREDICTION STATISTICS:")
    print(f"   Mean prediction: ${pred_mean:.2f}")
    print(f"   Std deviation: ${pred_std:.2f}")
    print(f"   Min prediction: ${pred_min:.2f}")
    print(f"   Max prediction: ${pred_max:.2f}")
    print(f"   Total predictions: {len(final_predictions)}")

    # Submission: fill the sample file's target column with the predictions.
    # NOTE(review): values are assigned by position, which presumes the sample
    # submission lists rows in the same order as `test` - confirm.
    submission = pd.read_csv(DATAPATH + 'sampleSubmission.csv')
    submission['Weekly_Sales'] = final_predictions

    submission_path = f'{FOLDERNAME}/prophet_real_submission.csv'
    submission.to_csv(submission_path, index=False)
    mlflow.log_artifact(submission_path, "submission")

    print(f"\n FILES SAVED:")
    print(f"   Predictions: {predictions_path}")
    print(f"   Submission: {submission_path}")

    closing_rule = "=" * 60
    print("\n" + closing_rule)
    print(" FULL PROPHET TRAINING COMPLETED SUCCESSFULLY!")
    print(closing_rule)
    print(f" Trained models: {training_stats['models_trained']} out of {total_combinations}")
    print(f" Training time: {training_stats['total_training_time']/60:.1f} minutes")
    print(f" Success rate: {success_rate:.1f}%")
    print(f" Predictions generated: {len(final_predictions)}")
    print(f" Submission file ready for Kaggle")
    print(closing_rule)