In [1]:
import os
import json
import random
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# =========================================
# 1Ô∏è‚É£ PATH
# =========================================

base_path = r"C:\Users\NXTWAVE\Downloads\Strategic Disinvestment Revenue Prediction"
file_path = os.path.join(base_path, "fetchOGDReceiptStrategicSale.json")

# =========================================
# 2Ô∏è‚É£ LOAD & CLEAN DATA
# =========================================

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

df['amountRealised'] = df['amountRealised'].astype(str)
df['amountRealised'] = df['amountRealised'].str.replace(",", "")
df['amountRealised'] = df['amountRealised'].str.replace("*", "")
df['amountRealised'] = pd.to_numeric(df['amountRealised'], errors='coerce')

df['equitySold'] = pd.to_numeric(df['equitySold'], errors='coerce')
df['year_clean'] = df['year'].str.extract(r'(\d{4})')
df['year_clean'] = pd.to_numeric(df['year_clean'], errors='coerce')

df = df.dropna(subset=['amountRealised','equitySold','year_clean'])

# =========================================
# 3Ô∏è‚É£ HEATMAP
# =========================================

plt.figure(figsize=(6,5))
sns.heatmap(df[['year_clean','equitySold','amountRealised']].corr(),
            annot=True, cmap='coolwarm')
plt.title("Hybrid Correlation Heatmap")
plt.savefig(os.path.join(base_path,"hybrid_heatmap.png"))
plt.close()

# =========================================
# 4Ô∏è‚É£ FEATURE SETUP
# =========================================

X = df[['year_clean','equitySold']]
y = df['amountRealised']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# =========================================
# 5Ô∏è‚É£ AIS (Feature Weight Optimization)
# =========================================

def ais_optimize(X_train, y_train):
    best_score = -999
    best_weights = None
    
    for _ in range(20):  # antibodies
        weights = np.random.uniform(0.5,1.5,size=X_train.shape[1])
        X_weighted = X_train * weights
        
        model = RandomForestRegressor(n_estimators=100)
        model.fit(X_weighted,y_train)
        pred = model.predict(X_weighted)
        score = r2_score(y_train,pred)
        
        if score > best_score:
            best_score = score
            best_weights = weights
    
    return best_weights

feature_weights = ais_optimize(X_train,y_train)

X_train_ais = X_train * feature_weights
X_test_ais = X_test * feature_weights

# =========================================
# 6Ô∏è‚É£ CSA (Hyperparameter Optimization)
# =========================================

def csa_optimize(X_train,y_train):
    best_score = -999
    best_params = None
    
    for _ in range(20):  # nests
        n_estimators = random.randint(100,300)
        max_depth = random.randint(3,15)
        
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth
        )
        model.fit(X_train,y_train)
        pred = model.predict(X_train)
        score = r2_score(y_train,pred)
        
        if score > best_score:
            best_score = score
            best_params = (n_estimators,max_depth)
    
    return best_params

best_n, best_depth = csa_optimize(X_train_ais,y_train)

# =========================================
# 7Ô∏è‚É£ FINAL HYBRID MODEL
# =========================================

hybrid_model = RandomForestRegressor(
    n_estimators=best_n,
    max_depth=best_depth
)

hybrid_model.fit(X_train_ais,y_train)

y_pred = hybrid_model.predict(X_test_ais)

hybrid_r2 = r2_score(y_test,y_pred)
hybrid_rmse = np.sqrt(mean_squared_error(y_test,y_pred))

print("Hybrid R2:",hybrid_r2)
print("Hybrid RMSE:",hybrid_rmse)

# Save model
joblib.dump(hybrid_model,
            os.path.join(base_path,"hybrid_model.pkl"))

# =========================================
# 8Ô∏è‚É£ ACCURACY GRAPH
# =========================================

plt.figure(figsize=(6,5))
plt.bar(["Hybrid AIS+CSA"],[hybrid_r2])
plt.title("Hybrid Model Accuracy (R2)")
plt.ylabel("R2 Score")
plt.savefig(os.path.join(base_path,"hybrid_accuracy_graph.png"))
plt.close()

# =========================================
# 9Ô∏è‚É£ RMSE GRAPH
# =========================================

plt.figure(figsize=(6,5))
plt.bar(["Hybrid AIS+CSA"],[hybrid_rmse])
plt.title("Hybrid Model RMSE")
plt.ylabel("RMSE")
plt.savefig(os.path.join(base_path,"hybrid_rmse_graph.png"))
plt.close()

# =========================================
# üîü SAVE RESULT CSV
# =========================================

results = pd.DataFrame({
    "Model":["Hybrid AIS+CSA"],
    "R2":[hybrid_r2],
    "RMSE":[hybrid_rmse],
    "Best_n_estimators":[best_n],
    "Best_max_depth":[best_depth]
})

results.to_csv(os.path.join(base_path,"hybrid_results.csv"),
               index=False)

# =========================================
# 11Ô∏è‚É£ SAVE PREDICTIONS CSV
# =========================================

pred_df = pd.DataFrame({
    "Actual":y_test.values,
    "Hybrid_Predicted":y_pred
})

pred_df.to_csv(os.path.join(base_path,
            "hybrid_prediction.csv"),
            index=False)

# =========================================
# 12Ô∏è‚É£ PREDICTION GRAPH
# =========================================

plt.figure(figsize=(7,6))
plt.scatter(y_test,y_pred)
plt.xlabel("Actual Revenue")
plt.ylabel("Hybrid Predicted Revenue")
plt.title("Hybrid AIS+CSA Prediction")
plt.savefig(os.path.join(base_path,
            "hybrid_prediction_graph.png"))
plt.close()

print("\nüî• HYBRID AIS + CSA MODEL COMPLETED SUCCESSFULLY!")

Hybrid R2: 0.01930861593877653
Hybrid RMSE: 11427.41547495733

üî• HYBRID AIS + CSA MODEL COMPLETED SUCCESSFULLY!
