In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Read the energy-consumption records from the CSV file
df = pd.read_csv('energy_consumption.csv')

# Preview the leading rows to sanity-check the parse
print(df.head())

# Count null entries in every column
missing_per_column = df.isnull().sum()
print("Missing Values:\n", missing_per_column)

# Summary statistics as produced by DataFrame.describe()
summary_stats = df.describe()
print("Descriptive Statistics:\n", summary_stats)


In [None]:
# One-hot encode the categorical columns, dropping the first level of each
df = pd.get_dummies(df, drop_first=True)

# Separate the target column from the predictor columns
y = df['energy_consumed']
X = df.drop(columns=['energy_consumed'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Baseline: a random forest with library-default hyperparameters and a fixed seed
model = RandomForestRegressor(random_state=42)

# Score the baseline with 5-fold cross-validation (R²) on the training split only
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='r2', cv=5)
print("Cross-validation Scores (R²):", scores)
print("Mean R² Score:", np.mean(scores))


In [None]:
# Candidate hyperparameters: forest size and maximum tree depth
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15],
)

# Exhaustive search over the grid, ranking candidates by 5-fold cross-validated R²
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best R² Score (Grid Search):", grid_search.best_score_)


In [None]:
# Train a 100-tree random forest on the training split
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Pair each importance with its column name and rank them, largest first
feature_importance = pd.Series(
    data=model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Print the ranking, then draw it as a bar chart
print("Feature Importance:\n", feature_importance)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=feature_importance, y=feature_importance.index, ax=ax)
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
ax.set_title('Feature Importance Analysis')
fig.tight_layout()
plt.show()


In [None]:
# Ridge Regression (alpha=1.0), scored by R² on the held-out test set
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
print("Ridge R² Score (on Test Set):", ridge.score(X_test, y_test))

# Lasso Regression (alpha=0.1)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
# Report the same test-set metric as Ridge so the two linear models can be compared
print("Lasso R² Score (on Test Set):", lasso.score(X_test, y_test))
# Label every coefficient with its feature name: a bare array does not show
# which features the Lasso penalty shrank to zero. to_string() avoids
# pandas truncating the listing when there are many columns.
lasso_coefficients = pd.Series(lasso.coef_, index=X_train.columns)
print("Lasso Coefficients:\n", lasso_coefficients.to_string())


In [None]:
# Retrieve the best estimator found by the grid search
best_model = grid_search.best_estimator_

# Generate predictions for the held-out test set
y_pred = best_model.predict(X_test)

# Report each evaluation metric on the test-set predictions
for label, metric in (
    ("Final R² Score:", r2_score),
    ("Mean Squared Error (MSE):", mean_squared_error),
    ("Mean Absolute Error (MAE):", mean_absolute_error),
):
    print(label, metric(y_test, y_pred))
