# MSME Forecasting - Full Notebook
This notebook is part of a 4-notebook package: Linear Regression, SARIMA, Random Forest, and Model Comparison.
Set `DATASET_INDEX` (1-based) to choose which CSV from `final_msme_datasets` to use. Forecasts are monthly; the horizon is set by `HORIZON` and defaults to 6 months.


In [1]:
# Shared imports and helper functions
import os, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print('helpers imported')
def find_csv_files(base='final_msme_datasets'):
    """Return the sorted list of CSV paths found under ``base``.

    CSVs sitting directly in ``base`` take precedence. Only when there are
    none are the immediate sub-folders of ``base`` searched (one level deep).
    A missing or non-directory ``base`` yields an empty list.
    """
    if not os.path.isdir(base):
        return []
    top_level = glob.glob(os.path.join(base, '*.csv'))
    if top_level:
        return sorted(top_level)
    # Nothing at the top level: fall back to CSVs one folder below.
    nested = [
        csv_path
        for entry in os.listdir(base)
        if os.path.isdir(os.path.join(base, entry))
        for csv_path in glob.glob(os.path.join(base, entry, '*.csv'))
    ]
    return sorted(nested)


helpers imported


In [2]:
from sklearn.ensemble import RandomForestRegressor
# Notebook configuration.
DATASET_INDEX = 1  # 1-based position of the CSV to use in the sorted list of discovered files
HORIZON = 6        # number of future months to forecast
files = find_csv_files()
print('Found', len(files), 'files')
# Fail early with a clear message: an empty list would otherwise surface as a bare
# IndexError, and 0 or negative values would silently wrap around to the end of the
# list through Python's negative indexing.
if not files:
    raise FileNotFoundError("No CSV files found; expected them under 'final_msme_datasets' (or one folder below it)")
if not 1 <= DATASET_INDEX <= len(files):
    raise IndexError(f'DATASET_INDEX must be between 1 and {len(files)}, got {DATASET_INDEX}')
dataset_path = files[DATASET_INDEX-1]
df = pd.read_csv(dataset_path)
print('Using', dataset_path)


Found 5 files
Using final_msme_datasets\final_msme_dataset_1.csv


In [3]:
# Preprocess: derive totals, build the feature matrix and split into train/test.
df['Total Income (₹)'] = df['Sales Revenue (₹)'] + df['Service Fees (₹)']
df['Total Expenses (₹)'] = df[['Rent (₹)','Utilities (₹)','Salaries & Wages (₹)','Raw Materials / Inventory (₹)','Transportation / Logistics (₹)','Loan Repayments & Interest (₹)']].sum(axis=1)
df['Net Cash Flow (₹)'] = df['Total Income (₹)'] - df['Total Expenses (₹)']
# Month_Index is a 1-based running counter over the rows in file order.
df['Month_Index'] = np.arange(1,len(df)+1)
feature_cols = ['Month_Index','Number of Orders','Customers','Average Order Value','Seasonality_Flag','Fuel Price Index']
# Features absent from the CSV are filled with a constant 0 so the notebook still runs.
# Report them explicitly: a constant column carries no signal, and training on it
# without notice makes the resulting metrics and importances easy to misread.
missing_features = [c for c in feature_cols if c not in df.columns]
if missing_features:
    print('Warning: feature columns missing from dataset, filled with 0:', missing_features)
for c in missing_features:
    df[c]=0
X = df[feature_cols]
y_inc = df['Total Income (₹)']
y_exp = df['Total Expenses (₹)']
# Ordered split without shuffling: the first 80% of rows train, the remainder tests.
# NOTE(review): this is only a proper out-of-time split if rows are in chronological order — confirm.
train_size = int(0.8 * len(df))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_inc_train, y_inc_test = y_inc.iloc[:train_size], y_inc.iloc[train_size:]
y_exp_train, y_exp_test = y_exp.iloc[:train_size], y_exp.iloc[train_size:]
print('Train/test shapes', X_train.shape, X_test.shape)


Train/test shapes (800, 6) (200, 6)


In [5]:
# Fit one forest per target on the training rows (same hyper-parameters and seed
# for both), then predict the held-out rows.
rf_inc = RandomForestRegressor(n_estimators=200, random_state=42)
rf_inc.fit(X_train, y_inc_train)
rf_exp = RandomForestRegressor(n_estimators=200, random_state=42)
rf_exp.fit(X_train, y_exp_train)
y_inc_pred, y_exp_pred = rf_inc.predict(X_test), rf_exp.predict(X_test)
def metrics(y_true, y_pred):
    """Return ``(MAE, RMSE, R2)`` for predictions ``y_pred`` against ``y_true``.

    RMSE is computed as the square root of the mean squared error.
    """
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )
# Score each model on the held-out rows; the unpacked values are also written to
# metrics_rf.csv by the export step later in the notebook.
inc_mae, inc_rmse, inc_r2 = metrics(y_true=y_inc_test, y_pred=y_inc_pred)
exp_mae, exp_rmse, exp_r2 = metrics(y_true=y_exp_test, y_pred=y_exp_pred)
print(
    'RF Income -> MAE:', round(inc_mae,2),
    'RMSE:', round(inc_rmse,2),
    'R2:', round(inc_r2,3),
)
print(
    'RF Expenses-> MAE:', round(exp_mae,2),
    'RMSE:', round(exp_rmse,2),
    'R2:', round(exp_r2,3),
)


RF Income -> MAE: 73478.41 RMSE: 93069.37 R2: 0.364
RF Expenses-> MAE: 61431.75 RMSE: 74377.8 R2: -0.17


In [6]:
# Forecast next HORIZON months using simple future-feature simulation (can be improved)
# Positions within a 12-month cycle that are flagged as seasonal for the simulated rows.
# NOTE(review): this assumes Month_Index 1 corresponds to calendar month 1 (so that
# (i-1)%12+1 is a month number) and that these months match how Seasonality_Flag was
# built in the dataset — confirm against the data.
SEASONAL_MONTHS = (3, 4, 8, 9, 10, 12)
last_idx = int(df['Month_Index'].iloc[-1])
future_idx = np.arange(last_idx+1, last_idx+1+HORIZON)
# Every feature except Month_Index and Seasonality_Flag is held constant (historical
# mean, or the last observed fuel index), so the forecast can only vary with those two.
future_features = pd.DataFrame({
    'Month_Index': future_idx,
    'Number of Orders': int(df['Number of Orders'].mean()),
    'Customers': int(df['Customers'].mean()),
    'Average Order Value': df['Average Order Value'].mean(),
    'Seasonality_Flag': [1 if ((i-1)%12+1 in SEASONAL_MONTHS) else 0 for i in future_idx],
    'Fuel Price Index': df['Fuel Price Index'].iloc[-1]
})
# Align the columns with the layout used for training instead of relying on the dict
# literal above staying in sync with feature_cols; a feature added to feature_cols but
# not simulated here now fails with a KeyError naming the missing column.
future_features = future_features[feature_cols]
future_inc = rf_inc.predict(future_features)
future_exp = rf_exp.predict(future_features)
future_cf = future_inc - future_exp
forecast = pd.DataFrame({'Predicted_Income': future_inc.round(2), 'Predicted_Expenses': future_exp.round(2), 'Predicted_CashFlow': future_cf.round(2)})
# Alert marks months where predicted expenses exceed predicted income.
forecast['Alert'] = np.where(forecast['Predicted_Expenses']>forecast['Predicted_Income'], 'Yes','No')
display(forecast)
# Persist the forecast and the hold-out metrics as CSV files under rf_outputs/.
os.makedirs('rf_outputs', exist_ok=True)
forecast.to_csv('rf_outputs/forecast_rf.csv', index=False)
pd.DataFrame([['RandomForest','Income',inc_mae,inc_rmse,inc_r2], ['RandomForest','Expenses',exp_mae,exp_rmse,exp_r2]], columns=['Model','Target','MAE','RMSE','R2']).to_csv('rf_outputs/metrics_rf.csv', index=False)
print('Saved RF outputs to rf_outputs/')


Unnamed: 0,Predicted_Income,Predicted_Expenses,Predicted_CashFlow,Alert
0,381645.74,352829.34,28816.4,No
1,381645.74,352829.34,28816.4,No
2,381645.74,352829.34,28816.4,No
3,399747.26,341771.41,57975.86,No
4,399747.26,341771.41,57975.86,No
5,399747.26,341771.41,57975.86,No


Saved RF outputs to rf_outputs/


In [7]:
# Feature importance and recommendations
# Rank the income model's features by importance, highest first.
importances = pd.Series(rf_inc.feature_importances_, index=feature_cols).sort_values(ascending=False)
print('Feature importances (income):')
print(importances)
# Each rule pairs a trigger condition with the advice to emit when it holds.
rules = [
    (importances.get('Average Order Value',0) > 0.15,
     'Increase AOV: bundling, upsell, premium sku'),
    (importances.get('Number of Orders',0) > 0.12,
     'Increase order frequency via loyalty programs and push notifications'),
    (df['Seasonality_Flag'].sum()>0,
     'Plan promotions & inventory for seasonal months'),
]
recs = [advice for triggered, advice in rules if triggered]
print('\nRecommendations:')
for rec in recs:
    print('-', rec)


Feature importances (income):
Average Order Value    0.461367
Number of Orders       0.246715
Fuel Price Index       0.097494
Customers              0.093401
Month_Index            0.084816
Seasonality_Flag       0.016207
dtype: float64

Recommendations:
- Increase AOV: bundling, upsell, premium sku
- Increase order frequency via loyalty programs and push notifications
- Plan promotions & inventory for seasonal months
