# In [None]:

# 1. Imported Required Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
import sys

# 2. Load the raw blast-furnace dataset
# 'utf-8-sig' strips a leading UTF-8 byte-order mark from the file, if present,
# so the first column header is read cleanly.
try:
    df = pd.read_csv('blastfurnacedata.csv', encoding='utf-8-sig')
except Exception as e:
    # Report on stderr and exit with a non-zero status: a bare sys.exit()
    # exits with status 0, which tells shells and schedulers the run succeeded.
    print("Error loading CSV:", e, file=sys.stderr)
    sys.exit(1)

# 3. Normalise column headers to snake_case
# Every plain header maps to itself with spaces replaced by underscores, so
# that part of the mapping is generated from the raw names.
_column_renames = {
    raw_name: raw_name.replace(' ', '_')
    for raw_name in (
        'fuel rate',
        'moisture',
        'co efficiency',
        'hot blast temp',
        'slag rate',
        'flame temp',
        'top pressure',
    )
}
# Header variant in which a UTF-8 byte-order mark was mis-decoded into the
# first column name.
_column_renames['ï»¿fuel rate'] = 'fuel_rate'

df = df.rename(columns=_column_renames)

# 4. Define the columns the analysis needs and verify they are all present
required_cols = ['moisture', 'co_efficiency', 'hot_blast_temp',
                 'slag_rate', 'flame_temp', 'top_pressure', 'fuel_rate']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    # Abort with a non-zero status so the failure is visible to whatever
    # launched the script; a bare sys.exit() would exit with status 0.
    print(f"\nMissing Columns: {missing_cols}", file=sys.stderr)
    sys.exit(1)

# 5. Coerce the required columns to numeric and discard incomplete rows
# errors='coerce' turns any value that cannot be parsed as a number into NaN,
# so the dropna below removes both originally-missing and unparseable entries.
coerced = df[required_cols].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
df[required_cols] = coerced
df = df.dropna(subset=required_cols, how='any')

# Remove outliers with the IQR rule (Tukey's fences)
# The interquartile range is defined on the quartiles, i.e. the 25th and 75th
# percentiles. Other quantiles (such as 0.3 / 0.7) give a narrower range and
# therefore tighter fences, discarding more rows than the standard rule.
Q1 = df[required_cols].quantile(0.25)
Q3 = df[required_cols].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Drop every row in which at least one required column lies outside its fences.
is_outlier = (df[required_cols] < lower_fence) | (df[required_cols] > upper_fence)
df = df[~is_outlier.any(axis=1)]

print(f"\nData shape after outlier removal: {df.shape}")

# 6. Interactive correlation heatmap
# Pairwise correlations between all required columns (inputs and fuel_rate).
correlation_matrix = df[required_cols].corr()
heatmap_options = dict(
    text_auto=True,
    title="Correlation Heatmap (Interactive)",
    color_continuous_scale='RdBu_r',
)
fig = px.imshow(correlation_matrix, **heatmap_options)

# Keep a standalone HTML copy in addition to displaying the figure.
fig.write_html("correlation_heatmap.html")
fig.show()

# 7. Interactive scatter plot of each input variable against fuel rate
# `features` is the list of model inputs (every required column except the
# target, fuel_rate).
features = [
    'moisture',
    'co_efficiency',
    'hot_blast_temp',
    'slag_rate',
    'flame_temp',
    'top_pressure',
]

for feature in features:
    # Human-readable form of the column name, shared by the title and axis label.
    pretty_name = feature.replace('_', ' ').title()
    axis_labels = {feature: pretty_name, 'fuel_rate': 'Fuel Rate'}
    fig = px.scatter(
        df,
        x=feature,
        y='fuel_rate',
        trendline="ols",  # overlay an ordinary-least-squares fit line
        title=f'{pretty_name} vs Fuel Rate',
        labels=axis_labels,
        color_discrete_sequence=['dodgerblue'],
    )
    fig.show()

# 8. Split inputs and target into training and held-out test sets
X, y = df[features], df['fuel_rate']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.28,    # fraction of rows reserved for evaluation
    random_state=41,   # fixed seed so the split is reproducible
)

# 9. Fit several regressors and score each one on the held-out test set
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

results = []
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # One record per model; the keys become the columns of results_df.
    scores = {
        'Model': name,
        'MSE': mean_squared_error(y_test, y_pred),
        'R2 Score': r2_score(y_test, y_pred),
    }
    results.append(scores)
    print(f"{name} - MSE: {scores['MSE']:.2f}, R2: {scores['R2 Score']:.2f}")

# Tabulate the scores for printing and for the comparison chart.
results_df = pd.DataFrame(results)
print("\nModel Evaluation Results:\n", results_df)

# 10. Bar chart comparing the R² score of each model
bar_options = {
    'x': 'Model',
    'y': 'R2 Score',
    'color': 'Model',
    'text': 'R2 Score',   # print the score on each bar
    'title': 'Model Performance Comparison (R² Score)',
    'labels': {'R2 Score': 'R² Score'},
}
fig = px.bar(results_df, **bar_options)
fig.show()

# 11. Select the model used for the remaining steps
# The Random Forest entry is picked by name; the evaluation scores are not
# consulted here.
best_model = models['Random Forest']

# 12. Feature importance
# Pair each importance value with its feature name, then sort ascending
# before drawing the horizontal bar chart.
importances = pd.Series(best_model.feature_importances_, index=features)
ranked_importances = importances.sort_values()

plt.figure(figsize=(8, 5))
ranked_importances.plot(kind='barh', color='teal')
plt.title('Feature Importances for Fuel Rate Prediction')
plt.xlabel('Importance')
plt.tight_layout()
# Save to disk before showing the figure.
plt.savefig("feature_importance.png")
plt.show()

# 13. Fuel Rate Prediction for New Input
def get_user_input(feature_names=None, *, read=input):
    """Prompt for one value per feature and return them as a one-row DataFrame.

    Each prompt is repeated until the reply parses as a finite float.
    Replies such as "nan" or "inf" are accepted by ``float()`` but are not
    usable as model inputs, so they are rejected as well.

    Args:
        feature_names: Names to prompt for, in order. When omitted, the
            module-level ``features`` list is used.
        read: Callable that receives the prompt string and returns the
            reply text. Defaults to the built-in ``input``.

    Returns:
        A pandas DataFrame with a single row and one column per feature.
    """
    names = features if feature_names is None else feature_names
    inputs = {}
    for feature in names:
        prompt = f"Enter {feature.replace('_', ' ')}: "
        while True:
            reply = read(prompt)
            try:
                value = float(reply)
            except ValueError:
                print("Invalid input. Please enter a numeric value.")
                continue
            if not np.isfinite(value):
                # float() parses "nan", "inf" and "-inf" without raising.
                print("Invalid input. Please enter a finite numeric value.")
                continue
            inputs[feature] = value
            break
    return pd.DataFrame([inputs])

# Commented for notebook to avoid manual input
# new_data = get_user_input()
# predicted_fuel_rate = best_model.predict(new_data)
# print("\nPredicted Fuel Rate for New Data:", predicted_fuel_rate[0])

# 14. Persist the trained model to disk
model_path = 'fuel_rate_rf_model.pkl'
joblib.dump(best_model, filename=model_path)
print(f"\nModel saved as '{model_path}'")

# Export the cleaned, outlier-filtered DataFrame as CSV for use in Power BI.
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='cleaned_blast_furnace_data.csv', index=False)
print("Cleaned data saved as 'cleaned_blast_furnace_data.csv'")
