In [1]:
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# =====================================
# 1. DEFINE PATHS
# =====================================

# Working folder: the input JSON is read from here and the save steps in this
# notebook join their output file names onto it.
# Set the DISINVESTMENT_BASE_PATH environment variable to run the notebook on
# another machine; when it is unset the original location is used unchanged.
base_path = os.environ.get(
    "DISINVESTMENT_BASE_PATH",
    r"C:\Users\NXTWAVE\Downloads\Strategic Disinvestment Revenue Prediction",
)

# Raw input file for the analysis.
file_path = os.path.join(base_path, "fetchOGDReceiptStrategicSale.json")

# =====================================
# 2. LOAD DATA
# =====================================

# Read the JSON file as UTF-8 text and decode it into Python objects.
with open(file_path, encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

# Wrap the decoded records in a DataFrame for the cleaning steps.
df = pd.DataFrame(data=data)

print("Original Shape:", df.shape)

# =====================================
# 3. DATA CLEANING
# =====================================

# Remove comma and asterisk characters from the amount text before parsing it.
# regex=False makes both patterns literal: the default for `regex` differs
# across pandas versions, and "*" on its own is not a valid regular expression.
amount_text = df['amountRealised'].astype(str)
amount_text = amount_text.str.replace(",", "", regex=False)
amount_text = amount_text.str.replace("*", "", regex=False)
# Anything that still is not a number becomes NaN and is dropped below.
df['amountRealised'] = pd.to_numeric(amount_text, errors='coerce')

df['equitySold'] = pd.to_numeric(df['equitySold'], errors='coerce')

# Take the first run of four digits found in 'year'.
# astype(str) keeps the .str accessor usable if the column is not string-typed;
# expand=False returns a Series rather than a one-column DataFrame.
year_text = df['year'].astype(str).str.extract(r'(\d{4})', expand=False)
df['year_clean'] = pd.to_numeric(year_text, errors='coerce')

# Keep only rows where the target and both model features are usable numbers.
df = df.dropna(subset=['amountRealised', 'equitySold', 'year_clean'])

print("Cleaned Shape:", df.shape)

# =====================================
# 4. FEATURE SELECTION
# =====================================

X = df[['year_clean', 'equitySold']]
y = df['amountRealised']

# =====================================
# 5. TRAIN / TEST SPLIT, THEN SCALE
# =====================================

# Split the raw features BEFORE fitting the scaler. Fitting StandardScaler on
# the full dataset would let the test rows influence the mean/std used to
# transform the training data (data leakage) and bias the evaluation.
# The partition depends only on the number of rows and random_state, so the
# same rows land in train and test as when the scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted transform, kept so that any code
# still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# NOTE(review): the fitted scaler is not persisted anywhere in the visible
# code; anything that reloads the saved models will need the same transform.

# =====================================
# 6. RANDOM FOREST MODEL
# =====================================

# 200-tree forest with a fixed seed, fitted on the training split.
rf_model = RandomForestRegressor(random_state=42, n_estimators=200).fit(X_train, y_train)

# Predictions for the held-out rows; reused by the later export/plot steps.
y_pred = rf_model.predict(X_test)

rf_r2 = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

print("\nRandom Forest Performance")
print("R2 Score:", rf_r2)
print("MAE:", rf_mae)

# Persist the fitted forest next to the input data.
rf_model_path = os.path.join(base_path, "disinvestment_model.pkl")
joblib.dump(rf_model, rf_model_path)
print("Saved: disinvestment_model.pkl")

# =====================================
# 7. NEURAL NETWORK MODEL (.h5)
# =====================================

# Seed Python, NumPy and TensorFlow before the network is built so that weight
# initialisation and batch shuffling repeat across runs; without this the
# saved .h5 file is different on every execution of the notebook.
set_random_seed(42)

# Small fully connected regressor: two ReLU hidden layers and one linear output.
nn_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

nn_model.compile(optimizer='adam', loss='mse')
# NOTE(review): y_train is passed as-is (no target scaling is visible) and the
# network is not evaluated on X_test in this cell -- confirm that is intended.
nn_model.fit(X_train, y_train, epochs=100, verbose=0)

nn_model.save(os.path.join(base_path, "disinvestment_model.h5"))
print("Saved: disinvestment_model.h5")

# =====================================
# 8. SAVE YAML CONFIG
# =====================================

# Summary of what was trained: input columns, target column, model families
# and the number of rows left after cleaning.
config_data = dict(
    features=["year_clean", "equitySold"],
    target="amountRealised",
    model_type="RandomForest + NeuralNetwork",
    dataset_size=int(len(df)),
)

config_path = os.path.join(base_path, "model_config.yaml")
with open(config_path, "w") as yaml_file:
    yaml.dump(config_data, yaml_file)

print("Saved: model_config.yaml")

# =====================================
# 9. SAVE PROCESSED CSV
# =====================================

# Write the cleaned frame without its index column.
processed_csv_path = os.path.join(base_path, "processed_disinvestment_data.csv")
df.to_csv(processed_csv_path, index=False)
print("Saved: processed_disinvestment_data.csv")

# =====================================
# 10. SAVE PREDICTION CSV
# =====================================

# Pair each held-out target value with the random forest's prediction for it.
prediction_columns = {"Actual": y_test.values, "Predicted": y_pred}
pred_df = pd.DataFrame(prediction_columns)

prediction_csv_path = os.path.join(base_path, "prediction_results.csv")
pred_df.to_csv(prediction_csv_path, index=False)
print("Saved: prediction_results.csv")

# =====================================
# 11. SAVE PLOT
# =====================================

# Scatter of actual against predicted values for the held-out rows, drawn
# through the explicit figure/axes interface and written straight to disk.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Revenue")
ax.set_ylabel("Predicted Revenue")
ax.set_title("Actual vs Predicted Disinvestment Revenue")

plot_path = os.path.join(base_path, "prediction_plot.png")
fig.savefig(plot_path)
# Release the figure so it is not kept open after saving.
plt.close(fig)

print("Saved: prediction_plot.png")

print("\n‚úÖ ALL FILES SAVED SUCCESSFULLY IN YOUR FOLDER!")


Original Shape: (43, 9)
Cleaned Shape: (43, 10)

Random Forest Performance
R2 Score: 0.06043356866258931
MAE: 5964.74442816748
Saved: disinvestment_model.pkl



Saved: disinvestment_model.h5
Saved: model_config.yaml
Saved: processed_disinvestment_data.csv
Saved: prediction_results.csv
Saved: prediction_plot.png

‚úÖ ALL FILES SAVED SUCCESSFULLY IN YOUR FOLDER!


  saving_api.save_model(
