# Electricity Theft Detection - Training with Real SGCC Data

This notebook trains the electricity theft detection model on the real SGCC dataset.
**IMPORTANT**: Upload your `datasetsmall.csv` file to Colab before running this notebook.


## 1. Setup Environment

In [None]:
# Install required packages
# Every package below is pinned to an exact version except joblib, which is
# installed at whatever version pip resolves.
!pip install --upgrade pip
!pip install scikit-learn==1.3.2 xgboost==2.0.2 pandas==2.1.4 numpy==1.24.4
!pip install tsfresh==0.20.2 imbalanced-learn==0.11.0 shap==0.43.0
!pip install scipy==1.11.4 statsmodels==0.14.1 loguru==0.7.2
!pip install joblib

In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings
from pathlib import Path
from typing import Tuple, Dict, Optional, List
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from scipy import stats
import xgboost as xgb
from google.colab import files
import os

# Silence library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Make sure the output folders exist before any cell writes into them.
for folder_name in ('models', 'data'):
    os.makedirs(folder_name, exist_ok=True)

print("Environment setup complete!")

## 2. Upload Dataset
**Please upload your 'datasetsmall.csv' file using the file upload button below**

In [None]:
# Ask for the dataset through the Colab upload widget.
print("Please upload your SGCC dataset file (datasetsmall.csv)")
uploaded = files.upload()

# Report whether anything was actually uploaded.
if not uploaded:
    print("No file uploaded. Please upload datasetsmall.csv and re-run this cell.")
else:
    uploaded_names = list(uploaded.keys())
    print(f"Successfully uploaded: {uploaded_names}")

## 3. Data Loading and Preprocessing Classes

In [None]:
class SGCCDataLoader:
    """
    SGCC Dataset Loader for Electricity Theft Detection

    Reads a wide-format CSV (one row per meter, one column per day, plus the
    'CONS_NO' and 'FLAG' columns) and reshapes it into a long table with one
    row per meter and day.
    """

    # Identifier columns; every other column is treated as a daily reading.
    ID_COLUMNS = ('CONS_NO', 'FLAG')
    # File names tried, in this order, relative to the working directory.
    CANDIDATE_FILES = ('datasetsmall.csv', 'dataset.csv', 'sgcc_data.csv')
    # First day used when column labels cannot be parsed as dates.
    FALLBACK_START_DATE = '2014-01-01'

    def __init__(self, data_path: Optional[str] = None):
        """Remember the data directory (default "data") and create it if missing."""
        self.data_path = Path(data_path) if data_path else Path("data")
        self.data_path.mkdir(parents=True, exist_ok=True)

    def load_real_sgcc_data(self) -> Optional[pd.DataFrame]:
        """
        Load real SGCC dataset from uploaded file

        Returns:
            The raw wide-format DataFrame, or None when no candidate file is
            found, the required columns are missing, or reading fails. The
            reason is printed in each case.
        """
        try:
            print("Loading real SGCC dataset...")

            # Use the first candidate file that exists.
            df_raw = None
            for filename in self.CANDIDATE_FILES:
                try:
                    df_raw = pd.read_csv(filename, low_memory=False)
                except FileNotFoundError:
                    continue
                print(f"Successfully loaded dataset from {filename}")
                break

            if df_raw is None:
                print("Dataset file not found. Please upload your SGCC dataset file.")
                print("Expected format: Wide format with date columns and CONS_NO, FLAG columns")
                return None

            # Validate dataset format
            if any(col not in df_raw.columns for col in self.ID_COLUMNS):
                raise ValueError("Dataset must contain 'CONS_NO' and 'FLAG' columns")

            # Get dataset statistics
            total_meters = len(df_raw)
            theft_meters = df_raw['FLAG'].sum()
            normal_meters = total_meters - theft_meters
            theft_rate = (theft_meters / total_meters) * 100

            print("Dataset loaded successfully:")
            print(f"  - Total meters: {total_meters:,}")
            print(f"  - Normal meters: {normal_meters:,} ({100-theft_rate:.1f}%)")
            print(f"  - Theft meters: {theft_meters:,} ({theft_rate:.1f}%)")
            print(f"  - Date columns: {len(df_raw.columns) - 2}")

            return df_raw

        except Exception as e:
            # Best-effort loader: report the problem and let the caller decide.
            print(f"Failed to load real SGCC dataset: {e}")
            return None

    @staticmethod
    def _parse_date_label(label) -> pd.Timestamp:
        """Parse one column label into a Timestamp, or NaT when it is not a date."""
        text = str(label)
        parsed = pd.NaT
        try:
            if '/' in text:
                parsed = pd.to_datetime(text, format='%m/%d/%Y', errors='coerce')
            if pd.isna(parsed):
                # Covers labels without slashes and slash labels that are not
                # month/day/year (for example year/month/day).
                parsed = pd.to_datetime(text, errors='coerce')
        except (ValueError, TypeError, OverflowError):
            return pd.NaT
        return parsed

    @classmethod
    def _build_date_lookup(cls, date_columns: List, rows_per_column: int) -> Dict:
        """
        Map every date column label to a Timestamp.

        Labels that cannot be parsed get a sequential date based on the
        column's position, counted from FALLBACK_START_DATE.
        """
        start_date = pd.Timestamp(cls.FALLBACK_START_DATE)
        lookup = {}
        unparsed_columns = 0
        for position, label in enumerate(date_columns):
            parsed = cls._parse_date_label(label)
            if pd.isna(parsed):
                parsed = start_date + pd.Timedelta(days=position)
                unparsed_columns += 1
            lookup[label] = parsed

        if unparsed_columns:
            # Each unparsed column contributes one melted row per meter.
            failed_entries = unparsed_columns * rows_per_column
            print(f"Warning: Could not parse {failed_entries} date entries, creating sequential dates")
        return lookup

    def convert_wide_to_long(self, df_raw: pd.DataFrame) -> pd.DataFrame:
        """
        Convert SGCC dataset from wide format to long format

        Args:
            df_raw: Wide table containing 'CONS_NO', 'FLAG' and one column per
                day. The identifier columns may appear at any position.

        Returns:
            DataFrame with columns meter_id, label, date, consumption, sorted
            by meter_id and date. Rows whose consumption is missing or not
            numeric are dropped; zero readings are kept.
        """
        print("Converting wide format to long format...")

        # Select date columns by name so the identifier columns can sit
        # anywhere in the file, not only in the last two positions.
        date_columns = [col for col in df_raw.columns if col not in self.ID_COLUMNS]
        print(f"Found {len(date_columns)} date columns")

        # rename() returns a new frame, so the caller's data is not modified.
        df_work = df_raw.rename(columns={'CONS_NO': 'meter_id', 'FLAG': 'label'})

        # Melt the dataframe to convert from wide to long format
        df_long = pd.melt(
            df_work,
            id_vars=['meter_id', 'label'],
            value_vars=date_columns,
            var_name='date',
            value_name='consumption'
        )

        print(f"Melted to {len(df_long):,} records")

        # Parse each distinct column label once and map the result onto the
        # melted rows, instead of parsing the same label once per row.
        print("Parsing date columns...")
        date_lookup = self._build_date_lookup(date_columns, rows_per_column=len(df_raw))
        df_long['date'] = pd.to_datetime(df_long['date'].map(date_lookup))

        # Convert consumption to numeric
        print("Converting consumption values to numeric...")
        df_long['consumption'] = pd.to_numeric(df_long['consumption'], errors='coerce')

        # Count zero vs missing consumption
        zero_consumption = (df_long['consumption'] == 0).sum()
        missing_consumption = df_long['consumption'].isna().sum()
        print(f"Zero consumption readings: {zero_consumption:,}")
        print(f"Missing consumption readings: {missing_consumption:,}")

        # Remove rows with missing (NaN) consumption values, but keep zeros
        initial_len = len(df_long)
        df_long = df_long.dropna(subset=['consumption'])
        removed = initial_len - len(df_long)
        if removed > 0:
            print(f"Removed {removed:,} rows with missing consumption values")

        # Sort by meter_id and date
        df_long = df_long.sort_values(['meter_id', 'date']).reset_index(drop=True)

        print(f"Final dataset: {len(df_long):,} records")
        print(f"Date range: {df_long['date'].min()} to {df_long['date'].max()}")
        print(f"Unique meters: {df_long['meter_id'].nunique():,}")
        print(f"Consumption range: {df_long['consumption'].min():.2f} to {df_long['consumption'].max():.2f}")

        return df_long

    def load_dataset(self) -> pd.DataFrame:
        """
        Load dataset - uses real SGCC data

        Raises:
            ValueError: when the raw dataset could not be loaded.
        """
        # Load real SGCC data
        df_raw = self.load_real_sgcc_data()
        if df_raw is None:
            raise ValueError("Could not load SGCC dataset. Please upload datasetsmall.csv to Colab.")

        # Convert to long format
        df_long = self.convert_wide_to_long(df_raw)

        print(f"Dataset ready: {len(df_long)} records, {df_long['meter_id'].nunique()} unique meters")
        return df_long

In [None]:
class ElectricityDataPreprocessor:
    """
    Data preprocessing pipeline for electricity consumption data

    Works on the long-format table (meter_id, date, consumption): fills
    missing readings per meter, then caps extreme readings per meter.
    """

    def __init__(self):
        """Create the placeholder attributes; neither is written by the methods below."""
        self.scaler = None
        self.preprocessing_stats = {}

    def handle_missing_values(self, df: pd.DataFrame, method: str = 'linear') -> pd.DataFrame:
        """
        Handle missing values in consumption data

        Args:
            df: Long-format frame with meter_id, date and consumption columns.
            method: Only 'linear' is implemented (per-meter linear
                interpolation, extended to both ends of each series). Any
                other value leaves the data unchanged and prints a warning.

        Returns:
            A copy of df. With 'linear' and at least one missing value the
            copy is sorted by meter_id and date.
        """
        print(f"Handling missing values using {method} method...")

        df_processed = df.copy()
        initial_missing = df_processed['consumption'].isnull().sum()

        if initial_missing == 0:
            print("No missing values found")
            return df_processed

        print(f"Found {initial_missing} missing values ({initial_missing/len(df)*100:.2f}%)")

        if method == 'linear':
            # Interpolation follows row order, so each meter must be in
            # chronological order first.
            df_processed = df_processed.sort_values(['meter_id', 'date'])
            df_processed['consumption'] = df_processed.groupby('meter_id')['consumption'].transform(
                lambda x: x.interpolate(method='linear', limit_direction='both')
            )
        else:
            print(f"Warning: unsupported method '{method}'; missing values left unchanged")

        final_missing = df_processed['consumption'].isnull().sum()
        print(f"Missing values reduced from {initial_missing} to {final_missing}")

        return df_processed

    def detect_and_remove_outliers(self, df: pd.DataFrame, method: str = 'zscore') -> pd.DataFrame:
        """
        Detect and handle outliers

        Despite the name, readings are capped rather than removed. For each
        meter with more than 10 rows, if any reading has an absolute z-score
        above 3, all of that meter's readings are clipped to
        [max(0, mean - 3*std), mean + 3*std].

        Args:
            df: Long-format frame with meter_id and consumption columns.
            method: Only 'zscore' is implemented; other values change nothing.

        Returns:
            A copy of df with capped consumption values.
        """
        print(f"Detecting outliers using {method} method...")

        df_processed = df.copy()
        outliers_processed = 0

        if method == 'zscore':
            threshold = 3.0
            capped = df_processed['consumption'].to_numpy(dtype=float, copy=True)
            any_capped = False

            # One grouping pass yields the row positions of every meter; this
            # avoids building a full-length boolean mask per meter.
            meter_positions = df_processed.groupby('meter_id', sort=False).indices

            for positions in meter_positions.values():
                if len(positions) <= 10:  # Need sufficient data points
                    continue

                readings = pd.Series(capped[positions])
                z_scores = np.abs(stats.zscore(readings.dropna().to_numpy()))
                outlier_mask = z_scores > threshold
                if not outlier_mask.any():
                    continue

                mean_val = readings.mean()
                std_val = readings.std()
                lower_bound = mean_val - threshold * std_val
                upper_bound = mean_val + threshold * std_val

                # Cap outliers instead of removing them; consumption cannot
                # be negative, so the lower bound never goes below zero.
                capped[positions] = readings.clip(
                    lower=max(0, lower_bound), upper=upper_bound
                ).to_numpy()

                outliers_processed += int(outlier_mask.sum())
                any_capped = True

            # Only write back when something changed, so an untouched column
            # keeps its original dtype.
            if any_capped:
                df_processed['consumption'] = capped

        print(f"Processed {outliers_processed} outliers")
        return df_processed

    def preprocess_pipeline(self, df: pd.DataFrame) -> pd.DataFrame:
        """Complete preprocessing pipeline: missing values first, then outlier capping."""
        print("Starting preprocessing pipeline...")

        # Handle missing values
        df_processed = self.handle_missing_values(df)

        # Handle outliers
        df_processed = self.detect_and_remove_outliers(df_processed)

        print("Preprocessing pipeline completed")
        return df_processed

## 4. Feature Engineering

In [None]:
class FeatureEngineer:
    """
    Feature engineering for electricity theft detection

    Adds calendar features, per-meter rolling/lag features and per-meter
    aggregate statistics to the long-format table (meter_id, date,
    consumption).
    """

    def __init__(self):
        pass

    def create_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create time-based features

        Returns a copy of df with year, month, day_of_week, day_of_year,
        is_weekend and season columns derived from 'date'. The input frame
        is left unchanged.
        """
        print("Creating time-based features...")

        # Work on a copy so the caller's frame does not gain columns.
        df = df.copy()
        df['date'] = pd.to_datetime(df['date'])
        df['year'] = df['date'].dt.year
        df['month'] = df['date'].dt.month
        df['day_of_week'] = df['date'].dt.dayofweek
        df['day_of_year'] = df['date'].dt.dayofyear
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

        # Season mapping
        df['season'] = df['month'].map({
            12: 'winter', 1: 'winter', 2: 'winter',
            3: 'spring', 4: 'spring', 5: 'spring',
            6: 'summer', 7: 'summer', 8: 'summer',
            9: 'autumn', 10: 'autumn', 11: 'autumn'
        })

        return df

    def create_statistical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create statistical features per meter

        Adds 7-day and 30-day rolling statistics and 1- and 7-step lags of
        consumption, all computed within each meter. Gaps at the start of a
        meter's series are filled from that meter's own values (forward fill,
        then backward fill). A feature that is empty for a whole meter, such
        as the 7-step lag of a meter with fewer than 8 readings, stays NaN.

        Returns:
            A new frame sorted by meter_id and date.
        """
        print("Creating statistical features...")

        # Rolling windows and lags follow row order, so sort first.
        # sort_values returns a new frame; the caller's data is untouched.
        df = df.sort_values(['meter_id', 'date'])
        per_meter = df.groupby('meter_id')['consumption']

        # Grouped rolling results come back ordered by meter and then by the
        # original row order, which matches the sorted frame, so they are
        # assigned by position.
        # Rolling statistics (7-day window)
        print("  - 7-day rolling statistics...")
        weekly = per_meter.rolling(window=7, min_periods=1)
        df['consumption_7d_mean'] = weekly.mean().to_numpy()
        df['consumption_7d_std'] = weekly.std().to_numpy()
        df['consumption_7d_max'] = weekly.max().to_numpy()
        df['consumption_7d_min'] = weekly.min().to_numpy()

        # Rolling statistics (30-day window)
        print("  - 30-day rolling statistics...")
        monthly = per_meter.rolling(window=30, min_periods=1)
        df['consumption_30d_mean'] = monthly.mean().to_numpy()
        df['consumption_30d_std'] = monthly.std().to_numpy()

        # Lag features
        print("  - Lag features...")
        df['consumption_lag1'] = per_meter.shift(1)
        df['consumption_lag7'] = per_meter.shift(7)

        # Fill gaps per meter. A frame-wide fill would copy the last values
        # of one meter into the first rows of the next one.
        print("  - Filling missing values in features...")
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        gap_cols = [col for col in numeric_cols
                    if col != 'meter_id' and df[col].isna().any()]
        if gap_cols:
            # Group by a plain array so the fill is positional and does not
            # depend on the frame having a unique index.
            meter_keys = df['meter_id'].to_numpy()
            filled = df[gap_cols].groupby(meter_keys).ffill()
            filled = filled.groupby(meter_keys).bfill()
            for col in gap_cols:
                df[col] = filled[col].to_numpy()

        return df

    def create_aggregate_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Create aggregate features per meter

        Computes one set of summary statistics per meter (rounded to 4
        decimals) and merges them onto every row of that meter as meter_*
        columns. NaN and infinite aggregates are replaced by 0.
        """
        print("Creating aggregate features...")

        per_meter = df.groupby('meter_id')['consumption']

        # Per meter aggregates
        meter_aggs = pd.DataFrame({
            'meter_mean': per_meter.mean(),
            'meter_std': per_meter.std(),
            'meter_min': per_meter.min(),
            'meter_max': per_meter.max(),
            'meter_median': per_meter.median(),
            'meter_q1': per_meter.quantile(0.25),
            'meter_q3': per_meter.quantile(0.75),
        }).round(4)

        # Add skewness and kurtosis safely: fall back to 0 for both if either
        # cannot be computed.
        try:
            skew_values = per_meter.skew().round(4)
            kurt_values = per_meter.apply(lambda x: x.kurtosis()).round(4)
        except Exception:
            skew_values = 0
            kurt_values = 0
        meter_aggs['meter_skew'] = skew_values
        meter_aggs['meter_kurt'] = kurt_values

        # Calculate additional features
        meter_aggs['meter_range'] = meter_aggs['meter_max'] - meter_aggs['meter_min']
        meter_aggs['meter_iqr'] = meter_aggs['meter_q3'] - meter_aggs['meter_q1']
        # Coefficient of variation; the small epsilon avoids division by zero.
        meter_aggs['meter_cv'] = meter_aggs['meter_std'] / (meter_aggs['meter_mean'] + 1e-8)

        # Handle any infinite values
        meter_aggs = meter_aggs.replace([np.inf, -np.inf], 0)
        meter_aggs = meter_aggs.fillna(0)

        # Merge back to main dataframe
        df = df.merge(meter_aggs, left_on='meter_id', right_index=True, how='left')

        return df

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Complete feature engineering pipeline: time, statistical, then aggregate features."""
        print("Starting feature engineering...")

        # Create time features
        df = self.create_time_features(df)

        # Create statistical features
        df = self.create_statistical_features(df)

        # Create aggregate features
        df = self.create_aggregate_features(df)

        print(f"Feature engineering completed. Total features: {df.shape[1]}")
        return df

## 5. Load and Process Data

In [None]:
# Stage 1: read the uploaded CSV and reshape it to long format.
loader = SGCCDataLoader()
df = loader.load_dataset()

# Stage 2: fill gaps and cap outliers.
preprocessor = ElectricityDataPreprocessor()
df = preprocessor.preprocess_pipeline(df)

# Stage 3: derive the model features.
engineer = FeatureEngineer()
df = engineer.engineer_features(df)

# Summarise the resulting table; note that these counts are per row
# (one row per meter and day), not per meter.
class_counts = df['label'].value_counts()
theft_share = df['label'].sum() / len(df) * 100

print(f"Dataset shape after preprocessing and feature engineering: {df.shape}")
print(f"Class distribution:\n{class_counts}")
print(f"Theft rate: {theft_share:.2f}%")

## 6. Prepare Features for Training

In [None]:
# Numeric features produced by the feature engineering step.
base_feature_columns = [
    'consumption', 'year', 'month', 'day_of_week', 'day_of_year', 'is_weekend',
    'consumption_7d_mean', 'consumption_7d_std', 'consumption_7d_max', 'consumption_7d_min',
    'consumption_30d_mean', 'consumption_30d_std', 'consumption_lag1', 'consumption_lag7',
    'meter_mean', 'meter_std', 'meter_min', 'meter_max', 'meter_median',
    'meter_q1', 'meter_q3', 'meter_skew', 'meter_kurt', 'meter_range', 'meter_iqr', 'meter_cv'
]

# One-hot encode the season label.
df_encoded = pd.get_dummies(df, columns=['season'], prefix='season')
season_cols = [col for col in df_encoded.columns if col.startswith('season_')]

# Final feature list: base features followed by the season dummies,
# keeping only names that actually exist in the encoded frame.
available_columns = set(df_encoded.columns)
feature_columns = [
    col for col in base_feature_columns + season_cols if col in available_columns
]

print(f"Selected {len(feature_columns)} features for training")
print(f"Features: {feature_columns}")

X = df_encoded[feature_columns]
y = df_encoded['label']

missing_in_features = X.isnull().sum().sum()
print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Missing values in features: {missing_in_features}")

In [None]:
# Split by meter rather than by row. The label and the meter_* aggregate
# features are constant for a meter, so a row-level split would put the same
# meter in both sets and inflate the test scores.
meter_labels = df_encoded.groupby('meter_id')['label'].max()
train_meters, test_meters = train_test_split(
    meter_labels.index.to_numpy(),
    test_size=0.2,
    random_state=42,
    stratify=meter_labels.to_numpy(),
)

# X, y and df_encoded share the same row order, so a positional mask is safe.
is_test_row = df_encoded['meter_id'].isin(test_meters).to_numpy()
X_train, X_test = X[~is_test_row].copy(), X[is_test_row].copy()
y_train, y_test = y[~is_test_row].copy(), y[is_test_row].copy()

# Handle any remaining missing values using medians learned from the
# training rows only, so no test statistics reach the model.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
X = X.fillna(train_medians)
print(f"After filling missing values: {X.isnull().sum().sum()} missing values remain")

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training class distribution:\n{y_train.value_counts()}")
print(f"Test class distribution:\n{y_test.value_counts()}")

In [None]:
# Oversample the minority class on the training rows only.
# sampling_strategy=0.5 asks for a minority/majority ratio of 0.5.
# NOTE(review): SMOTE runs here on unscaled features; scaling happens in the
# following cell. Confirm that ordering is intended.
print("Applying SMOTE for class balancing...")
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

balanced_labels = pd.Series(y_train_balanced)
balanced_theft_pct = balanced_labels.sum() / len(y_train_balanced) * 100

print(f"After SMOTE - Training set: {X_train_balanced.shape[0]} samples")
print(f"After SMOTE - Class distribution:\n{balanced_labels.value_counts()}")
print(f"New theft rate in training: {balanced_theft_pct:.2f}%")

In [None]:
# Standardise features: statistics are learned from the balanced training
# data and then applied unchanged to the test data.
scaler = StandardScaler()
scaler.fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

scaled_mean = np.mean(X_train_scaled)
scaled_std = np.std(X_train_scaled)

print("Feature scaling completed")
print(f"Training features mean: {scaled_mean:.4f}")
print(f"Training features std: {scaled_std:.4f}")

## 7. Model Training

In [None]:
# Fit the gradient-boosted classifier on the balanced, scaled training data.
print("Training XGBoost model...")

# Hyper-parameters for the classifier; the same dict is stored in the model
# metadata later on.
xgb_params = dict(
    objective='binary:logistic',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    eval_metric='auc',
)

model = xgb.XGBClassifier(**xgb_params)

# The test split is passed as eval_set so the AUC is reported while training
# (every 50 rounds).
evaluation_sets = [(X_test_scaled, y_test)]
model.fit(
    X_train_scaled,
    y_train_balanced,
    eval_set=evaluation_sets,
    verbose=50,
)

print("Model training completed!")

## 8. Model Evaluation

In [None]:
# Score the held-out rows: hard labels plus the probability of class 1.
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

test_auc_value = roc_auc_score(y_test, y_pred_proba)
cm = confusion_matrix(y_test, y_pred)
# Rows of the matrix are actual classes, columns are predicted classes.
true_neg, false_pos, false_neg, true_pos = cm.ravel()

print("Model Performance:")
print("="*50)
print(f"AUC Score: {test_auc_value:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(cm)
print(f"\nTrue Negatives: {true_neg}")
print(f"False Positives: {false_pos}")
print(f"False Negatives: {false_neg}")
print(f"True Positives: {true_pos}")

In [None]:
# Rank the features by the importance scores reported by the model.
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
print(feature_importance.head(15))

# Summary of how the importance is distributed across features.
importance_values = feature_importance['importance']
top5_share = feature_importance.head(5)['importance'].sum()

print("\nFeature importance statistics:")
print(f"Mean importance: {importance_values.mean():.4f}")
print(f"Max importance: {importance_values.max():.4f}")
print(f"Top 5 features account for {top5_share:.1%} of total importance")

## 9. Save Model and Components

In [None]:
# Persist the model, the scaler, the feature list and a JSON summary.
print("Saving model and components...")

# Pickled artefacts, written in this order: model, scaler, feature names.
pickled_artifacts = (
    ('models/xgb_theft_detection_model.pkl', model),
    ('models/feature_scaler.pkl', scaler),
    ('models/feature_columns.pkl', feature_columns),
)
for artifact_path, artifact in pickled_artifacts:
    joblib.dump(artifact, artifact_path)

# Describe the training run and the dataset it was trained on.
dataset_info = {
    'total_meters': df['meter_id'].nunique(),
    'total_records': len(df),
    'theft_rate': float((df['label'].sum() / len(df)) * 100),
    'date_range': f"{df['date'].min()} to {df['date'].max()}"
}
model_metadata = {
    'model_type': 'XGBoost',
    'features': feature_columns,
    'n_features': len(feature_columns),
    'training_samples': X_train_balanced.shape[0],
    'test_samples': X_test.shape[0],
    'test_auc': float(roc_auc_score(y_test, y_pred_proba)),
    'model_params': xgb_params,
    # Only the 20 highest-ranked features are stored.
    'feature_importance': feature_importance.head(20).to_dict('records'),
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'dataset_info': dataset_info
}

# Save metadata as JSON. The metadata dict is written as the single element
# of a list.
import json
with open('models/model_metadata.json', 'w') as f:
    json.dump([model_metadata], f, indent=2)

print("Model and components saved successfully!")
print("\nSaved files:")
for saved_path in ('models/xgb_theft_detection_model.pkl',
                   'models/feature_scaler.pkl',
                   'models/feature_columns.pkl',
                   'models/model_metadata.json'):
    print(f"- {saved_path}")

print("\nModel Performance Summary:")
print(f"AUC Score: {model_metadata['test_auc']:.4f}")
print(f"Training samples: {model_metadata['training_samples']:,}")
print(f"Test samples: {model_metadata['test_samples']:,}")

## 10. Download Models for Local Use

In [None]:
# Bundle the saved artefacts into one archive and offer it for download.
import zipfile
import os

zip_filename = 'electricity_theft_detection_model.zip'
packaged_files = (
    'models/xgb_theft_detection_model.pkl',
    'models/feature_scaler.pkl',
    'models/feature_columns.pkl',
    'models/model_metadata.json',
)

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for file_path in packaged_files:
        # Files that were never written are skipped silently.
        if not os.path.exists(file_path):
            continue
        # Store under the bare file name so the archive has no folders.
        zipf.write(file_path, os.path.basename(file_path))
        print(f"Added {file_path} to zip")

print(f"\nModel package created: {zip_filename}")
print("This file contains everything needed for local deployment.")

# List what ended up in the archive.
with zipfile.ZipFile(zip_filename, 'r') as zipf:
    print("\nZip file contents:")
    for member in zipf.infolist():
        print(f"  - {member.filename} ({member.file_size:,} bytes)")

# Hand the archive to the browser through the Colab download helper.
print("\nDownloading model package...")
files.download(zip_filename)

## 11. Test Model Prediction Function

In [None]:
# Test the model with sample data
def predict_sample(model, scaler, feature_columns, sample_data):
    """Predict labels and class-1 probabilities for a frame of samples.

    Args:
        model: Fitted classifier exposing predict() and predict_proba().
        scaler: Fitted transformer exposing transform().
        feature_columns: Feature names, in the order the model was trained on.
        sample_data: DataFrame of samples. Columns missing from it are
            treated as 0 and extra columns are ignored. The frame itself is
            not modified.

    Returns:
        Tuple (prediction, probability): the predicted labels and the
        second column of predict_proba().
    """
    # reindex selects and orders the features and fills absent ones with 0 on
    # a new frame, so the caller's data (often a slice) is never written to.
    X_sample = sample_data.reindex(columns=list(feature_columns), fill_value=0)

    # Scale features
    X_sample_scaled = scaler.transform(X_sample)

    # Make prediction
    prediction = model.predict(X_sample_scaled)
    probability = model.predict_proba(X_sample_scaled)[:, 1]

    return prediction, probability

# Run the helper on the first ten held-out rows and compare with the labels.
test_samples = X_test.iloc[:10]
test_labels = y_test.iloc[:10]

predictions, probabilities = predict_sample(model, scaler, feature_columns, test_samples)

separator = "-" * 70

print("Sample Predictions:")
print("="*70)
print(f"{'Sample':<8} {'Prediction':<12} {'Probability':<12} {'Actual':<8} {'Result':<10}")
print(separator)

for sample_no, (pred, prob, actual) in enumerate(zip(predictions, probabilities, test_labels), start=1):
    if pred == actual:
        result = "✓ Correct"
    else:
        result = "✗ Wrong"
    print(f"{sample_no:<8} {pred:<12} {prob:<12.3f} {actual:<8} {result:<10}")

# Share of the displayed samples that were predicted correctly.
accuracy = sum(predictions == test_labels) / len(test_labels)
print(separator)
print(f"Sample accuracy: {accuracy:.1%}")

print("\n🎉 Model is ready for deployment!")
print("Next steps:")
for step_text in ("1. Download the model package zip file",
                  "2. Extract it in your local project",
                  "3. Use the deployment scripts to make predictions"):
    print(step_text)