<a href="https://colab.research.google.com/github/rodwol/new_LR/blob/main/multivariate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# 📘 Toxic Waste Risk Prediction: Linear Regression Model
# ✅ SECTION 1: IMPORT LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# ✅ SECTION 2: LOAD DATA
# The cleaned dataset CSV is read directly from the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/rodwol/new_LR/main/linear_regression_model/summative/linear_regression/toxic_waste_dataset_cleaned.csv'
df = pd.read_csv(DATA_URL)
df.head()

# ✅ SECTION 3: DROP IRRELEVANT COLUMNS INCLUDING TEXT FIELDS
# Every column listed here is removed from the frame before any plotting or modelling.
drop_cols = [
    'ID',
    'Title',
    'Summary',
    'Author/Source',
    'URL',
    'Location',
    'Date',
    'Environmental Impact',
]
df = df.drop(columns=drop_cols)

# ✅ SECTION 4: VISUALIZATION
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df.corr(numeric_only=True)
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()

# Ten-bin histogram, with a KDE overlay, of the waste-volume column.
volume_fig, volume_ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['Estimated Waste Volume (tons)'], bins=10, kde=True, ax=volume_ax)
volume_ax.set_title("Distribution of Waste Volume")
plt.show()

# ✅ SECTION 5: FEATURE ENGINEERING
# Each categorical column is replaced in place by integer codes; the fitted
# encoder for every column is kept in `label_encoders`, keyed by column name.
categorical_cols = ['Country', 'Dumping Entity', 'Legal Actions', 'Waste Type']
label_encoders = {cat_col: LabelEncoder() for cat_col in categorical_cols}
for cat_col, encoder in label_encoders.items():
    df[cat_col] = encoder.fit_transform(df[cat_col])


# ✅ SECTION 6: DEFINE FEATURES & TARGET
# Everything except the waste-volume column is a feature; that column is the target.
X = df.drop('Estimated Waste Volume (tons)', axis=1)
y = df['Estimated Waste Volume (tons)']

# ✅ SECTION 7: TRAIN/TEST SPLIT
# The split happens BEFORE scaling so the scaler never sees the test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies: the column assignments below then operate on independent
# frames rather than on slices derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

# ✅ SECTION 8: STANDARDIZE NUMERIC FEATURES
# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Fitting on the full dataset would leak the test set's mean and
# variance into training and bias the reported test error.
numeric_cols = ['Latitude', 'Longitude', 'Year']
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# ✅ SECTION 9: LINEAR REGRESSION MODEL (SGD)
# Linear model trained by stochastic gradient descent; this is the model that
# is evaluated and compared further down.
lr_model = SGDRegressor(max_iter=1000, eta0=0.01, learning_rate='invscaling', random_state=42)
lr_model.fit(X_train, y_train)

# ✅ SECTION 10: LOSS CURVE SIMULATION
# Train ONE model incrementally and record the error after every epoch.
# `partial_fit` runs a single epoch per call and keeps the learned weights,
# so 300 calls cost 300 epochs in total. Refitting a fresh model with
# max_iter=i for every i in 1..300 would cost 1 + 2 + ... + 300 = 45,150
# epochs to draw the same kind of curve.
# NOTE: this curve uses a constant learning rate, whereas `lr_model` above
# uses 'invscaling'; it illustrates SGD convergence rather than replaying
# lr_model's exact training run.
n_epochs = 300
epochs = range(1, n_epochs + 1)
train_errors = []
test_errors = []

model = SGDRegressor(eta0=0.01, learning_rate='constant', random_state=42)
for epoch in epochs:
    model.partial_fit(X_train, y_train)
    train_errors.append(mean_squared_error(y_train, model.predict(X_train)))
    test_errors.append(mean_squared_error(y_test, model.predict(X_test)))

plt.plot(epochs, train_errors, label='Train MSE')
plt.plot(epochs, test_errors, label='Test MSE')
plt.xlabel('Iterations')
plt.ylabel('MSE')
plt.title('Loss Curve for SGD')
plt.legend()
plt.grid(True)
plt.show()

# ✅ SECTION 11: ACTUAL VS PREDICTED
# Predictions of the SGD linear model on the held-out rows.
pred_lr = lr_model.predict(X_test)

# Scatter of actual against predicted values. The dashed red diagonal spans
# the full target range and marks where predicted == actual.
target_span = [y.min(), y.max()]
scatter_fig, scatter_ax = plt.subplots(figsize=(6, 5))
scatter_ax.scatter(y_test, pred_lr, color='blue')
scatter_ax.plot(target_span, target_span, 'r--')
scatter_ax.set_xlabel("Actual")
scatter_ax.set_ylabel("Predicted")
scatter_ax.set_title("Linear Regression: Actual vs Predicted")
scatter_ax.grid(True)
plt.show()

# ✅ SECTION 12: COMPARE MODELS
# The tree-based models are seeded like the split and the SGD model
# (random_state=42). Without a seed their test MSE, and therefore which model
# is later picked as "best", can change from one run to the next.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Test-set mean squared error for each candidate (lower is better).
mse_lr = mean_squared_error(y_test, pred_lr)
mse_dt = mean_squared_error(y_test, dt_model.predict(X_test))
mse_rf = mean_squared_error(y_test, rf_model.predict(X_test))

print("\n--- Model Comparison (MSE on Test Set) ---")
print(f"Linear Regression: {mse_lr:.2f}")
print(f"Decision Tree: {mse_dt:.2f}")
print(f"Random Forest: {mse_rf:.2f}")

# ✅ SECTION 13: SAVE BEST MODEL
import os
import cloudpickle

model_dir = 'linear_regression_model/summative/API'
model_path = f'{model_dir}/best_model.pkl'
os.makedirs(model_dir, exist_ok=True)

# Pick the candidate with the lowest test MSE among ALL three models.
# The previous two-way expression could never select the linear model and
# fell back to the decision tree even when it was not the best.
# `min` returns the first minimum, so on exact ties the order below prefers
# the decision tree, then the random forest, then the linear model.
candidates = [(mse_dt, dt_model), (mse_rf, rf_model), (mse_lr, lr_model)]
best_model = min(candidates, key=lambda scored: scored[0])[1]

# cloudpickle writes a standard pickle stream.
with open(model_path, 'wb') as f:
    cloudpickle.dump(best_model, f)

# Report the path that was actually written.
print(f"\n✅ Best model saved to '{model_path}'")

# ✅ SECTION 14: DEFINE PREDICTION FUNCTION
def make_prediction(input_dict, model=None):
    """Return the model's prediction for a single observation.

    Args:
        input_dict: Mapping of feature name to value. Values must already be
            in the form the model was trained on; this function does no
            encoding or scaling.
        model: Optional fitted estimator exposing ``predict``. When omitted,
            the model stored at
            'linear_regression_model/summative/API/best_model.pkl' is loaded.
            Passing a model avoids re-reading the file on every call.

    Returns:
        The first (and only) element of ``model.predict(...)``.

    Raises:
        KeyError: If the model exposes ``feature_names_in_`` and
            ``input_dict`` is missing one of those names.
        FileNotFoundError: If ``model`` is omitted and the saved model file
            does not exist.
    """
    if model is None:
        # Local import keeps this function self-contained.
        import pickle

        # The file is written with cloudpickle, which emits a standard pickle
        # stream, so the stdlib loader is the matching reader.
        # NOTE: unpickling executes code; only load model files you produced.
        with open('linear_regression_model/summative/API/best_model.pkl', 'rb') as f:
            model = pickle.load(f)

    feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        # No recorded feature names: fall back to the dict's insertion order.
        features = np.array([list(input_dict.values())])
    else:
        # Order the values by the names the model recorded at fit time, so the
        # caller's key order cannot silently swap features.
        ordered_names = list(feature_names)
        row = [input_dict[name] for name in ordered_names]
        features = pd.DataFrame([row], columns=ordered_names)

    prediction = model.predict(features)
    return prediction[0]


# ✅ SECTION 15: SAMPLE PREDICTION
# A single example observation. Categorical fields are given as integer codes
# and the numeric fields as small floats, i.e. already in model-input form.
sample_input = {
    'Country': 1, 'Latitude': 0.34, 'Longitude': -0.23,
    'Dumping Entity': 2, 'Legal Actions': 1,
    'Year': 0.25, 'Waste Type': 3,
}

# Run the example through the saved model and report the result.
predicted_value = make_prediction(sample_input)
print(f"\nSample predicted waste volume: {predicted_value:.2f} tons")

ModuleNotFoundError: No module named 'pandas'