# MSME Forecasting - Full Notebook
This notebook is part of a 4-notebook package: Linear Regression, SARIMA, Random Forest, and Model Comparison.
Set `DATASET_INDEX` (1-based: `1` selects the first CSV in sorted order) to choose which CSV from `final_msme_datasets` to use. The forecast horizon is measured in months and is controlled by `HORIZON` (default: 6).


In [1]:
# Shared imports and helper functions
import os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Progress message for the notebook log. It is emitted as soon as the imports
# above succeed, i.e. before find_csv_files is defined below.
print('helpers imported')
def find_csv_files(base='final_msme_datasets'):
    """Return the sorted paths of the CSV files found under ``base``.

    CSV files sitting directly in ``base`` take precedence. Only when there
    are none are the immediate sub-directories of ``base`` searched (one
    level deep).

    Parameters
    ----------
    base : str
        Directory to search.

    Returns
    -------
    list of str
        Sorted CSV paths; an empty list when ``base`` is not a directory or
        no CSV file is found.
    """
    if not os.path.isdir(base):
        return []

    csv_paths = glob.glob(os.path.join(base, '*.csv'))
    if csv_paths:
        return sorted(csv_paths)

    # Nothing directly under base: collect CSVs from each immediate sub-directory.
    for entry in os.listdir(base):
        subdir = os.path.join(base, entry)
        if os.path.isdir(subdir):
            csv_paths.extend(glob.glob(os.path.join(subdir, '*.csv')))
    return sorted(csv_paths)


helpers imported


In [10]:
# Use statsmodels SARIMAX instead of pmdarima
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Configuration: which dataset to load and how far ahead to forecast.
DATASET_INDEX = 1  # 1-based position in the sorted list returned by find_csv_files()
HORIZON = 6        # number of future periods to forecast (months, per the notebook intro)

files = find_csv_files()
print('Found', len(files), 'files')

# Validate the selection up front. Plain list indexing would turn DATASET_INDEX = 0
# into files[-1] and silently load the last dataset, and an empty folder would only
# surface as a bare "list index out of range".
if not files:
    raise FileNotFoundError('No CSV files found; expected them under final_msme_datasets/')
if not 1 <= DATASET_INDEX <= len(files):
    raise IndexError(
        'DATASET_INDEX must be between 1 and {}, got {}'.format(len(files), DATASET_INDEX)
    )

dataset_path = files[DATASET_INDEX - 1]
df = pd.read_csv(dataset_path)
print('Loaded', dataset_path)

# Derived targets: total income, total expenses and the resulting net cash flow.
expense_cols = [
    'Rent (₹)',
    'Utilities (₹)',
    'Salaries & Wages (₹)',
    'Raw Materials / Inventory (₹)',
    'Transportation / Logistics (₹)',
    'Loan Repayments & Interest (₹)',
]
df['Total Income (₹)'] = df['Sales Revenue (₹)'] + df['Service Fees (₹)']
df['Total Expenses (₹)'] = df[expense_cols].sum(axis=1)
df['Net Cash Flow (₹)'] = df['Total Income (₹)'] - df['Total Expenses (₹)']

# Positional 80/20 split without shuffling: the first 80% of rows train the
# models, the remaining rows are held out for evaluation.
# NOTE(review): this assumes the rows are already in chronological order — confirm.
train_size = int(0.8 * len(df))
train_inc, test_inc = df['Total Income (₹)'][:train_size], df['Total Income (₹)'][train_size:]
train_exp, test_exp = df['Total Expenses (₹)'][:train_size], df['Total Expenses (₹)'][train_size:]
print('Train/test sizes', len(train_inc), len(test_inc))

Found 5 files
Loaded final_msme_datasets\final_msme_dataset_1.csv
Train/test sizes 800 200


In [12]:
# SARIMA via statsmodels on the training split only.
# Seasonal period 12 (yearly seasonality); the (1,1,1) orders are an example
# starting point, tune as needed.
sarima_inc = SARIMAX(
    train_inc,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
).fit(disp=False)
sarima_exp = SARIMAX(
    train_exp,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
).fit(disp=False)

# Predict the positions occupied by the held-out rows: the window starts right
# after the last training observation and spans the length of each test split.
inc_last_step = train_size + len(test_inc) - 1
exp_last_step = train_size + len(test_exp) - 1
y_inc_pred = sarima_inc.predict(start=train_size, end=inc_last_step)
y_exp_pred = sarima_exp.predict(start=train_size, end=exp_last_step)
def metrics(y_true,y_pred):
    """Score predictions against the observed values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R² score.
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    root_sq_err = np.sqrt(mean_squared_error(y_true, y_pred))
    return abs_err, root_sq_err, r2_score(y_true, y_pred)
# Score both hold-out forecasts; the metric names are kept in the notebook
# namespace because the export step writes them to metrics_sarima.csv.
inc_mae, inc_rmse, inc_r2 = metrics(y_true=test_inc, y_pred=y_inc_pred)
exp_mae, exp_rmse, exp_r2 = metrics(y_true=test_exp, y_pred=y_exp_pred)

# MAE/RMSE are shown with 2 decimals, R2 with 3.
print(
    'SARIMA Income -> MAE:', round(inc_mae,2),
    'RMSE:', round(inc_rmse,2),
    'R2:', round(inc_r2,3),
)
print(
    'SARIMA Expenses-> MAE:', round(exp_mae,2),
    'RMSE:', round(exp_rmse,2),
    'R2:', round(exp_r2,3),
)

SARIMA Income -> MAE: 103251.64 RMSE: 121322.31 R2: -0.08
SARIMA Expenses-> MAE: 71130.99 RMSE: 85871.12 R2: -0.559


In [14]:
# Forecast the next HORIZON months with SARIMAX models refitted on the full series.
sarima_inc_full = SARIMAX(
    df['Total Income (₹)'],
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
).fit(disp=False)
sarima_exp_full = SARIMAX(
    df['Total Expenses (₹)'],
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
).fit(disp=False)

# The forecast window starts right after the last observed row.
first_step = len(df)
last_step = first_step + HORIZON - 1
future_inc = sarima_inc_full.predict(start=first_step, end=last_step)
future_exp = sarima_exp_full.predict(start=first_step, end=last_step)
future_cf = future_inc - future_exp

forecast = pd.DataFrame({
    'Predicted_Income': future_inc.round(2),
    'Predicted_Expenses': future_exp.round(2),
    'Predicted_CashFlow': future_cf.round(2),
})
# Flag every period whose (rounded) predicted expenses exceed predicted income.
expenses_exceed_income = forecast['Predicted_Expenses'] > forecast['Predicted_Income']
forecast['Alert'] = np.where(expenses_exceed_income, 'Yes', 'No')
display(forecast)

# Persist the forecast and the hold-out metrics computed in the evaluation cell.
os.makedirs('sarima_outputs', exist_ok=True)
forecast.to_csv('sarima_outputs/forecast_sarima.csv', index=False)
metrics_table = pd.DataFrame(
    [
        ['SARIMA', 'Income', inc_mae, inc_rmse, inc_r2],
        ['SARIMA', 'Expenses', exp_mae, exp_rmse, exp_r2],
    ],
    columns=['Model', 'Target', 'MAE', 'RMSE', 'R2'],
)
metrics_table.to_csv('sarima_outputs/metrics_sarima.csv', index=False)
print('Saved SARIMA outputs to sarima_outputs/')

Unnamed: 0,Predicted_Income,Predicted_Expenses,Predicted_CashFlow,Alert
1000,297477.9,332602.88,-35124.98,Yes
1001,313622.76,355321.05,-41698.29,Yes
1002,288703.77,336523.69,-47819.92,Yes
1003,275607.36,343536.62,-67929.27,Yes
1004,305110.42,373125.87,-68015.45,Yes
1005,293929.24,354670.23,-60740.99,Yes


Saved SARIMA outputs to sarima_outputs/
