In [1]:
import os
import json
import yaml
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import save_model

# ==============================
# 1. LOAD DATA
# ==============================

# Location of the raw JSON export. Set the DISINVESTMENT_DATA_PATH environment
# variable to run against a copy stored elsewhere; when it is unset the
# original hard-coded location is used, so existing runs behave as before.
file_path = os.environ.get(
    "DISINVESTMENT_DATA_PATH",
    r"C:\Users\NXTWAVE\Downloads\Strategic Disinvestment Revenue Prediction\fetchOGDReceiptStrategicSale.json",
)

# Parse the whole file up front; a missing file raises FileNotFoundError here.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Hand the parsed JSON to pandas as-is (no reshaping is done in this script).
df = pd.DataFrame(data)

print("Original Shape:", df.shape)

# ==============================
# 2. DATA CLEANING
# ==============================

# Clean amountRealised: cast to text, strip ',' and '*' characters, then
# convert to numbers. regex=False is explicit because the default of `regex`
# differs between pandas versions and '*' is not a valid regular expression on
# its own; it must always be treated as a literal character.
df['amountRealised'] = (
    df['amountRealised']
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace("*", "", regex=False)
)
# Anything that still is not numeric becomes NaN and is dropped below.
df['amountRealised'] = pd.to_numeric(df['amountRealised'], errors='coerce')

# Clean equitySold
df['equitySold'] = pd.to_numeric(df['equitySold'], errors='coerce')

# Extract the first 4-digit run from `year`. Casting to str first keeps the
# .str accessor usable even if the column was loaded as numbers, and
# expand=False yields a Series (not a one-column DataFrame) for the assignment.
df['year_clean'] = df['year'].astype(str).str.extract(r'(\d{4})', expand=False)
df['year_clean'] = pd.to_numeric(df['year_clean'], errors='coerce')

# Keep only rows where the target and both model inputs are usable.
df = df.dropna(subset=['amountRealised', 'equitySold', 'year_clean'])

print("Cleaned Shape:", df.shape)

# ==============================
# 3. FEATURE ENGINEERING
# ==============================

# Model inputs and regression target.
X = df[['year_clean', 'equitySold']]
y = df['amountRealised']

# ==============================
# 4. TRAIN TEST SPLIT
# ==============================

# Split BEFORE scaling: fitting the scaler on every row would let the held-out
# rows influence the mean/variance used for training (data leakage) and make
# the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn scaling statistics from the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaler; kept so the name
# `X_scaled` remains defined for any code that still reads it.
X_scaled = scaler.transform(X)

# ==============================
# 5. RANDOM FOREST MODEL
# ==============================

# Fit a 200-tree forest on the scaled training rows; random_state pins the
# bootstrap/feature sampling so reruns give the same model.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Held-out predictions; reused further down for the predictions CSV and plot.
y_pred = rf_model.predict(X_test)

print("\nRandom Forest Performance")
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))

# Save .pkl
joblib.dump(rf_model, "disinvestment_model.pkl")
print("Saved: disinvestment_model.pkl")

# Persist the fitted scaler as well: the saved models were trained on scaled
# inputs, so new data must go through this exact scaler before prediction.
joblib.dump(scaler, "disinvestment_scaler.pkl")
print("Saved: disinvestment_scaler.pkl")

# ==============================
# 6. KERAS NEURAL NETWORK MODEL
# ==============================

# Small fully connected regressor: 64 -> 32 -> 1, built layer by layer.
nn_model = Sequential()
# The first layer declares the per-sample input shape (one entry per feature).
nn_model.add(Dense(64, activation='relu', input_shape=X_train.shape[1:]))
nn_model.add(Dense(32, activation='relu'))
# Single linear output unit for the regression target.
nn_model.add(Dense(1))

nn_model.compile(loss='mse', optimizer='adam')

# Train silently for 100 epochs on the training split.
nn_model.fit(X_train, y_train, verbose=0, epochs=100)

# Save .h5
nn_model.save("disinvestment_model.h5")
print("Saved: disinvestment_model.h5")

# ==============================
# 7. SAVE YAML CONFIG
# ==============================

# Run metadata written next to the model artifacts.
config_data = dict(
    features=["year_clean", "equitySold"],
    target="amountRealised",
    model_type="RandomForest + NeuralNetwork",
    # Row count of the cleaned frame, as a plain Python int.
    dataset_size=len(df.index),
)

with open("model_config.yaml", "w") as yaml_file:
    yaml.dump(config_data, stream=yaml_file)

print("Saved: model_config.yaml")

# ==============================
# 8. SAVE PROCESSED CSV
# ==============================

# Cleaned frame, written without the index column.
df.to_csv(path_or_buf="processed_disinvestment_data.csv", index=False)
print("Saved: processed_disinvestment_data.csv")

# ==============================
# 9. SAVE PREDICTIONS CSV
# ==============================

# Pair each held-out target value with the random-forest prediction for it.
pred_df = pd.DataFrame(dict(Actual=y_test.values, Predicted=y_pred))

pred_df.to_csv(path_or_buf="prediction_results.csv", index=False)
print("Saved: prediction_results.csv")

# ==============================
# 10. PLOT GRAPH
# ==============================

# Scatter of held-out actual values against the random-forest predictions,
# drawn through an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Revenue")
ax.set_ylabel("Predicted Revenue")
ax.set_title("Actual vs Predicted Disinvestment Revenue")
fig.savefig("prediction_plot.png")
# Release the figure so it is not also rendered or kept in memory.
plt.close(fig)

print("Saved: prediction_plot.png")

print("\n✅ ALL FILES GENERATED SUCCESSFULLY!")


Original Shape: (43, 9)
Cleaned Shape: (43, 10)

Random Forest Performance
R2 Score: 0.06043356866258931
MAE: 5964.74442816748
Saved: disinvestment_model.pkl



Saved: disinvestment_model.h5
Saved: model_config.yaml
Saved: processed_disinvestment_data.csv
Saved: prediction_results.csv
Saved: prediction_plot.png

✅ ALL FILES GENERATED SUCCESSFULLY!


  saving_api.save_model(
