In [None]:
# STEP 1: Install necessary libraries
!pip install prophet --quiet
!pip install openpyxl --quiet

# STEP 2: Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet
from google.colab import files
from tqdm import tqdm

# STEP 3: Upload the files
# Each prompt opens an upload dialog; the first key of the mapping returned by
# files.upload() is taken as the file name for that prompt.
chosen = []
for prompt in ("📁 Upload Train.csv file", "📁 Upload Test.csv file"):
    print(prompt)
    uploaded = files.upload()
    chosen.append(next(iter(uploaded)))
train_file, test_file = chosen

# STEP 4: Load datasets
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# STEP 5: Preprocess Train data
# Parse dates (unparseable values become NaT), discard rows without a valid
# date, then tag every remaining row with the first day of its month.
train_df = (
    train_df
    .assign(Date=pd.to_datetime(train_df['Date'], errors='coerce'))
    .dropna(subset=['Date'])
)
train_df = train_df.assign(
    MonthStart=train_df['Date'].dt.to_period('M').dt.to_timestamp()
)

# Aggregate monthly crime counts by TYPE (one row per month/type pair present in the data)
monthly_grouped = (
    train_df
    .groupby(['MonthStart', 'TYPE'])
    .size()
    .rename('Crime_Count')
    .reset_index()
)

# STEP 6: Prepare Test data
# Build a "YEAR-MONTH-01" label per row and parse it into a month-start timestamp.
month_labels = test_df['YEAR'].astype(str) + '-' + test_df['MONTH'].astype(str) + '-01'
test_df['MonthStart'] = pd.to_datetime(month_labels)

# Initialize predictions; types that are never forecast keep this default
test_df['Incident_Counts'] = 0

# STEP 7: Forecast using Prophet for each TYPE
print("🔮 Forecasting incident counts per TYPE...")
for crime_type in tqdm(test_df['TYPE'].unique()):
    # Monthly history of this crime type, already reduced to the two columns Prophet needs
    history = monthly_grouped.loc[monthly_grouped['TYPE'] == crime_type, ['MonthStart', 'Crime_Count']]
    if len(history) < 2:
        continue  # Not enough data to train Prophet

    prophet_df = history.rename(columns={'MonthStart': 'ds', 'Crime_Count': 'y'})

    # Build and fit model (monthly data: yearly seasonality only)
    model = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
    model.fit(prophet_df)

    # Forecast every distinct month requested for this type in the test set
    type_rows = test_df['TYPE'] == crime_type
    future_df = pd.DataFrame({'ds': test_df.loc[type_rows, 'MonthStart'].sort_values().unique()})
    forecast = model.predict(future_df)

    # Round to whole incidents, never below zero, and look each test row up by its month
    count_by_month = {
        ds: max(0, int(round(yhat)))
        for ds, yhat in zip(forecast['ds'], forecast['yhat'])
    }
    test_df.loc[type_rows, 'Incident_Counts'] = (
        test_df.loc[type_rows, 'MonthStart'].map(count_by_month).to_numpy()
    )

# STEP 8: Export results
output_file = "Predicted_Test.csv"
del test_df['MonthStart']  # helper column only; removed in place before writing
test_df.to_csv(output_file, index=False)
print(f"✅ Forecasting complete. File saved as: {output_file}")

# STEP 9: Download result
files.download(output_file)


📁 Upload Train.csv file


Saving Train.xlsx - Train.csv to Train.xlsx - Train (1).csv
📁 Upload Test.csv file


Saving Test (2).csv to Test (2) (1).csv
🔮 Forecasting incident counts per TYPE...


  0%|          | 0/9 [00:00<?, ?it/s]DEBUG:cmdstanpy:input tempfile: /tmp/tmpv4ototpi/4920vysc.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpv4ototpi/4sw9btv6.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=57161', 'data', 'file=/tmp/tmpv4ototpi/4920vysc.json', 'init=/tmp/tmpv4ototpi/4sw9btv6.json', 'output', 'file=/tmp/tmpv4ototpi/prophet_modelshz8_wtl/prophet_model-20250606034926.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
03:49:26 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
03:49:27 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
 11%|█         | 1/9 [00:00<00:06,  1.17it/s]DEBUG:cmdstanpy:input tempfile: /tmp/tmpv4ototpi/ax_9izqg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpv4ototpi/4g9m2454.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdst

✅ Forecasting complete. File saved as: Predicted_Test.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install prophet --quiet
!pip install pmdarima --quiet


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.2/2.2 MB[0m [31m106.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Rebuild the numpy/pmdarima pairing: remove both packages, pin numpy to
# 1.24.4, then reinstall pmdarima with pip's cache disabled.
# NOTE(review): presumably a workaround for pmdarima failing against the
# preinstalled numpy 2.x — confirm. The pin conflicts with other preinstalled
# packages (see pip's resolver warning in this cell's output), and the runtime
# likely has to be restarted before the downgraded numpy is used — TODO confirm.
!pip uninstall -y pmdarima numpy
!pip install numpy==1.24.4
!pip install pmdarima --no-cache-dir


Found existing installation: pmdarima 2.0.4
Uninstalling pmdarima-2.0.4:
  Successfully uninstalled pmdarima-2.0.4
Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.24.4 which is incompatible.
blosc2 3.3.4 requires numpy>=1.26, but you have numpy 1.24.4 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you h

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (7.8 kB)
Downloading pmdarima-2.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m133.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

# Load datasets (both CSV files are expected in the current working directory)
train_df = pd.read_csv('Train.xlsx - Train.csv')
test_df = pd.read_csv('Test (2).csv')

# --- EDA and Null Handling ---
# Coerce unparseable dates to NaT, drop those rows, and tag each remaining
# record with the first day of its month.
train_df = (
    train_df
    .assign(Date=pd.to_datetime(train_df['Date'], errors='coerce'))
    .dropna(subset=['Date'])
)
train_df = train_df.assign(
    MonthStart=train_df['Date'].dt.to_period('M').dt.to_timestamp()
)

# Grouping by month and type: one row per (month, type) pair with its record count
monthly_grouped = (
    train_df
    .groupby(['MonthStart', 'TYPE'])
    .size()
    .rename('Crime_Count')
    .reset_index()
)

# Preprocess test data: month-start timestamp per row plus a zeroed prediction column
month_labels = test_df['YEAR'].astype(str) + '-' + test_df['MONTH'].astype(str) + '-01'
test_df['MonthStart'] = pd.to_datetime(month_labels)
test_df['Incident_Counts'] = 0

# Unique crime types (in order of first appearance in the test set)
types = test_df['TYPE'].unique()

# Model performance storage: one [TYPE, Prophet RMSE, ARIMA RMSE, winner] row per type
model_perf = []

print("🔁 Running per TYPE...")

VALIDATION_MONTHS = 12   # hold-out window used to compare the two models
MIN_HISTORY_MONTHS = 24  # minimum number of observed months required to model a type


def _prophet_forecast(history, months):
    """Fit Prophet on `history` and return its predictions as a Series indexed by month.

    history: count series indexed by month-start timestamps.
    months:  DatetimeIndex of the month-starts to predict.
    """
    fitted = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
    fitted.fit(pd.DataFrame({'ds': history.index, 'y': history.to_numpy()}))
    forecast = fitted.predict(pd.DataFrame({'ds': months}))
    return pd.Series(forecast['yhat'].to_numpy(), index=pd.DatetimeIndex(forecast['ds']))


def _arima_forecast(history, months):
    """Fit seasonal auto-ARIMA on `history` and return predictions as a Series indexed by `months`.

    ARIMA forecasts step by step starting at the month after `history` ends, so
    every month up to the latest requested one is predicted and the requested
    months are then looked up by date. Requested months that are not after the
    end of `history` come back as NaN.
    """
    steps = pd.date_range(history.index[-1] + pd.offsets.MonthBegin(1), months.max(), freq='MS')
    if len(steps) == 0:
        return pd.Series(np.nan, index=months)
    fitted = auto_arima(history, seasonal=True, m=12, suppress_warnings=True, error_action='ignore')
    stepwise = pd.Series(np.asarray(fitted.predict(n_periods=len(steps))), index=steps)
    return stepwise.reindex(months)


for crime_type in tqdm(types):
    data = monthly_grouped[monthly_grouped['TYPE'] == crime_type]
    if len(data) < MIN_HISTORY_MONTHS:
        print(f"Skipping {crime_type} (insufficient data: {len(data)} months)")
        continue

    # Regular monthly series; months without any record inside the observed span count as 0.
    # After asfreq the series is at least as long as `data`, so the split below is never empty.
    y = data.set_index('MonthStart')['Crime_Count'].asfreq('MS').fillna(0)
    train_y, valid_y = y.iloc[:-VALIDATION_MONTHS], y.iloc[-VALIDATION_MONTHS:]

    # Score both candidates on the hold-out window; a failing model scores inf.
    scores = {}
    for model_name, forecaster in (('ARIMA', _arima_forecast), ('Prophet', _prophet_forecast)):
        try:
            valid_preds = forecaster(train_y, valid_y.index)
            scores[model_name] = np.sqrt(mean_squared_error(valid_y, valid_preds))
        except Exception as e:
            print(f"{model_name} failed for {crime_type}: {e}")
            scores[model_name] = float('inf')
    prophet_rmse, arima_rmse = scores['Prophet'], scores['ARIMA']

    prophet_ok, arima_ok = np.isfinite(prophet_rmse), np.isfinite(arima_rmse)
    if not (prophet_ok or arima_ok):
        # Neither model could be validated: record it and keep the default prediction of 0.
        model_perf.append([crime_type, prophet_rmse, arima_rmse, 'None'])
        print(f"Skipping {crime_type} (both models failed validation)")
        continue

    # Ties go to ARIMA, as before; a model that failed validation is never selected.
    better_model = 'Prophet' if prophet_ok and (not arima_ok or prophet_rmse < arima_rmse) else 'ARIMA'
    model_perf.append([crime_type, prophet_rmse, arima_rmse, better_model])

    # Forecast future months using best model, refitted on the full history
    type_rows = test_df['TYPE'] == crime_type
    future_months = pd.DatetimeIndex(test_df.loc[type_rows, 'MonthStart'].unique()).sort_values()
    forecaster = _prophet_forecast if better_model == 'Prophet' else _arima_forecast
    try:
        preds = forecaster(y, future_months)
    except Exception as e:
        print(f"{better_model} final fit failed for {crime_type}: {e}")
        continue

    # Assign to test by month, not by position: rows of a type may be unsorted
    # or share a month, so a positional list would misalign or fail to fit.
    month_counts = test_df['MonthStart'].map(preds.round().clip(lower=0))
    assignable = type_rows & month_counts.notna()
    unassigned = int((type_rows & ~assignable).sum())
    if unassigned:
        print(f"{crime_type}: no forecast for {unassigned} row(s); Incident_Counts left at 0")
    test_df.loc[assignable, 'Incident_Counts'] = month_counts[assignable].astype(int).to_numpy()

# Save and show performance: one row per crime type with both validation RMSEs and the winner
perf_df = pd.DataFrame(model_perf, columns=['TYPE', 'Prophet_RMSE', 'ARIMA_RMSE', 'Best_Model'])
print(perf_df)
perf_df.to_csv('model_performance.csv', index=False)

# MonthStart is only an internal helper for matching forecasts to rows, so it is
# left out of the exported predictions (the first forecasting cell drops it too).
# The drop is not in place, so test_df stays usable if the forecasting loop is rerun.
test_df.drop(columns=['MonthStart']).to_csv('final_test_predictions.csv', index=False)


🔁 Running per TYPE...


INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp23durjrn/baszz47o.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp23durjrn/rpmj8suh.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=90509', 'data', 'file=/tmp/tmp23durjrn/baszz47o.json', 'init=/tmp/tmp23durjrn/rpmj8suh.json', 'output', 'file=/tmp/tmp23durjrn/prophet_modelewuj3s1r/prophet_model-20250606045234.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
04:52:34 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
04:52:34 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/t

                                                TYPE  Prophet_RMSE  \
0  Vehicle Collision or Pedestrian Struck (with I...     16.756686   
1                                   Theft of Vehicle     37.951372   
2                                   Theft of Bicycle     94.593325   
3                                 Theft from Vehicle    153.515575   
4                                        Other Theft     52.041744   
5                           Offence Against a Person     23.889873   
6                                           Mischief     43.526594   
7                  Break and Enter Residential/Other     43.227522   
8                         Break and Enter Commercial     43.216079   

   ARIMA_RMSE Best_Model  
0   14.417311      ARIMA  
1   48.677744    Prophet  
2  103.232232    Prophet  
3  154.232439    Prophet  
4   49.427588      ARIMA  
5   35.667761    Prophet  
6   28.689209      ARIMA  
7   45.932322    Prophet  
8   24.613720      ARIMA  



