# In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats

def load_data(file_path, nrows=1000):
    """Load the semicolon-separated power-consumption file into a DataFrame.

    '?' entries are read as NaN. The 'Date' column is parsed day-first into
    datetimes and the 'Time' column ('HH:MM:SS') into ``datetime.time``
    objects.

    Args:
        file_path: Path of the delimited text file to read.
        nrows: Maximum number of data rows to read (default 1000, the value
            that used to be hard-coded). Pass ``None`` to read the whole file.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Any error raised while reading or parsing is reported on
            stdout and then re-raised, so the caller does not carry on with a
            ``None`` dataset and fail later with an unrelated error.
    """
    try:
        data = pd.read_csv(file_path, sep=';', na_values=['?'], nrows=nrows)
        data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
        data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S').dt.time
    except Exception as e:
        print(f"An error occurred while loading the dataset: {e}")
        raise
    return data

def eda_before_preprocessing(data):
    """Show and print exploratory statistics for the raw 'Global_active_power'.

    Displays a histogram and a boxplot of the column, then prints its
    skewness and the number of values whose absolute z-score exceeds 3.

    Args:
        data: DataFrame containing a numeric 'Global_active_power' column,
            which may still contain NaN at this stage.
    """
    print("Before Preprocessing:")
    print("---------------------")
    # Histogram of Global Active Power
    fig = px.histogram(data, x='Global_active_power')
    fig.update_layout(title='**Before Preprocessing: Histogram of Global Active Power**', xaxis_title='Global Active Power', yaxis_title='Count')
    fig.show()
    # Boxplot of Global Active Power
    fig = px.box(data, x='Global_active_power')
    fig.update_layout(title='**Before Preprocessing: Boxplot of Global Active Power**', xaxis_title='Global Active Power')
    fig.show()

    power = data['Global_active_power']
    skewness = power.skew()
    print(f"Before Preprocessing: Skewness of Global Active Power: {skewness}")

    # stats.zscore propagates NaN by default: a single missing reading would
    # turn every score into NaN and the count below would always be 0.
    # Missing rows have not been dropped yet at this stage, so drop them here.
    z_scores = np.abs(stats.zscore(power.dropna()))
    outlier_count = np.sum(z_scores > 3)
    print(f"Before Preprocessing: Outlier count in Global Active Power: {outlier_count}")

def preprocess_data(data):
    """Drop incomplete rows, derive calendar features and Box-Cox the target.

    Adds the columns 'Hour', 'Minute', 'Day', 'Month', 'Year',
    'Daily_Average' (per-date mean of 'Global_active_power', computed before
    the transformation) and 'Peak_Hour' (1 for hours 17-20 inclusive, else 0),
    then replaces 'Global_active_power' with its Box-Cox transform of
    ``value + 1``.

    Args:
        data: DataFrame with 'Date', 'Time' and 'Global_active_power' columns.
            The input frame is not modified.

    Returns:
        A new, preprocessed DataFrame.

    Raises:
        Exception: Any error during preprocessing is reported on stdout and
            then re-raised instead of being turned into a ``None`` result.
    """
    try:
        # dropna() can hand back an object pandas treats as a slice of the
        # input; take an explicit copy so the column assignments below are
        # unambiguous and do not trigger SettingWithCopy warnings.
        data = data.dropna().copy()
        data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
        data['Time'] = pd.to_datetime(data['Time'], format='%H:%M:%S').dt.time
        data['Hour'] = [t.hour for t in data['Time']]
        data['Minute'] = [t.minute for t in data['Time']]
        data['Day'] = data['Date'].dt.day
        data['Month'] = data['Date'].dt.month
        data['Year'] = data['Date'].dt.year
        # NOTE(review): this feature is derived from the target column itself;
        # confirm that is intended before using it as a model input.
        data['Daily_Average'] = data.groupby('Date')['Global_active_power'].transform('mean')
        data['Peak_Hour'] = np.where((data['Hour'] >= 17) & (data['Hour'] <= 20), 1, 0)

        # Apply Box-Cox transformation to Global_active_power; the +1 shift
        # keeps the input strictly positive, which Box-Cox requires.
        data['Global_active_power'], _ = stats.boxcox(data['Global_active_power'] + 1)
    except Exception as e:
        print(f"An error occurred while preprocessing the data: {e}")
        raise
    return data

def eda_after_preprocessing(data):
    """Show and print exploratory statistics for the preprocessed data.

    Displays a correlation matrix of the numeric columns, histograms of the
    'Hour', 'Day', 'Month' and 'Year' columns, a boxplot of
    'Global_active_power' by hour, and a heatmap of its mean by hour and day.
    Each plot is skipped when a column it needs is absent. Finally prints the
    skewness of 'Global_active_power' and the number of values whose absolute
    z-score exceeds 3.

    Args:
        data: Preprocessed DataFrame containing 'Global_active_power'.
    """
    print("After Preprocessing:")
    print("--------------------")

    # Correlation Matrix
    # Select every numeric dtype rather than only int64/float64: the
    # datetime accessors (.dt.day/.month/.year) return int32 on pandas >= 2.0
    # and those columns would otherwise silently drop out of the matrix.
    numeric_data = data.select_dtypes(include='number')
    corr_matrix = numeric_data.corr()
    fig = px.imshow(corr_matrix, text_auto=True)
    fig.update_layout(title='After Preprocessing: Correlation Matrix of Numeric Features')
    fig.show()

    # Distribution of Categorical Features
    categorical_cols = ['Hour', 'Day', 'Month', 'Year']
    for col in categorical_cols:
        if col in data.columns:
            fig = px.histogram(data, x=col)
            fig.update_layout(title=f'After Preprocessing: Distribution of {col}', xaxis_title=col, yaxis_title='Count')
            fig.show()

    # Boxplot of Global Active Power by Hour
    if 'Hour' in data.columns and 'Global_active_power' in data.columns:
        fig = px.box(data, x='Hour', y='Global_active_power')
        fig.update_layout(title='After Preprocessing: Boxplot of Global Active Power by Hour', xaxis_title='Hour', yaxis_title='Global Active Power')
        fig.show()

    # Heatmap of average Global_active_power by Hour and Day
    # Guarded like the plots above so a missing column skips the figure
    # instead of raising from pivot_table.
    if {'Hour', 'Day', 'Global_active_power'}.issubset(data.columns):
        heatmap_data = data.pivot_table(index='Hour', columns='Day', values='Global_active_power', aggfunc='mean')
        fig = px.imshow(heatmap_data, text_auto=True, aspect='auto')
        fig.update_layout(title='After Preprocessing: Heatmap of Average Global Active Power by Hour and Day', width=1000, height=600)
        fig.show()

    # Calculate skewness after preprocessing
    skewness_after = data['Global_active_power'].skew()
    print(f"After Preprocessing: Skewness of Global Active Power: {skewness_after}")

    # Calculate outliers after preprocessing
    z_scores_after = np.abs(stats.zscore(data['Global_active_power']))
    outlier_count_after = np.sum(z_scores_after > 3)
    print(f"After Preprocessing: Outlier count in Global Active Power: {outlier_count_after}")

def scale_data(data, columns=('Global_reactive_power', 'Voltage', 'Global_intensity')):
    """Standardize selected columns in place with ``StandardScaler``.

    Args:
        data: DataFrame holding the columns to scale. It is modified in place
            and also returned.
        columns: Names of the columns to standardize. Defaults to the three
            columns that used to be hard-coded.

    Returns:
        The same DataFrame with the selected columns replaced by their
        standardized values.

    Raises:
        Exception: Any error during scaling is reported on stdout and then
            re-raised instead of being turned into a ``None`` result.
    """
    selected = list(columns)
    try:
        # NOTE(review): the scaler is fitted on whatever frame is passed in;
        # main() calls this before train_test_split, so test rows contribute
        # to the fitted mean/variance. Confirm that is acceptable.
        scaler = StandardScaler()
        data[selected] = scaler.fit_transform(data[selected])
    except Exception as e:
        print(f"An error occurred while scaling the data: {e}")
        raise
    return data

def train_models(X_train, y_train, random_state=42):
    """Fit each candidate regressor and lazily yield ``(name, fitted_model)``.

    Models with a non-empty hyper-parameter grid are tuned with a 5-fold
    ``GridSearchCV`` and the best estimator is yielded; models with an empty
    grid are fitted directly. Being a generator, each model is only trained
    when the caller advances the iteration.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        random_state: Seed passed to the stochastic estimators (random
            forest, gradient boosting, neural network) so repeated runs give
            the same metrics and model ranking. Pass ``None`` for the
            previous unseeded behaviour.

    Yields:
        Tuples of the model's display name and the fitted estimator.
    """
    # One table of (name, estimator, grid) keeps each model next to its grid
    # instead of relying on two dicts sharing the same keys.
    candidates = [
        ('Linear Regression', LinearRegression(), {}),
        (
            'Random Forest',
            RandomForestRegressor(random_state=random_state),
            {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]},
        ),
        (
            'Gradient Boosting',
            GradientBoostingRegressor(random_state=random_state),
            {'n_estimators': [100, 200, 300], 'learning_rate': [0.1, 0.05, 0.01]},
        ),
        (
            'Neural Network',
            MLPRegressor(max_iter=1000, random_state=random_state),
            {'hidden_layer_sizes': [(50, 50), (100, 100)], 'alpha': [0.0001, 0.001, 0.01]},
        ),
    ]
    for name, model, param_grid in candidates:
        if param_grid:
            grid_search = GridSearchCV(model, param_grid, cv=5)
            grid_search.fit(X_train, y_train)
            yield name, grid_search.best_estimator_
        else:
            # Nothing to tune: fit the estimator as configured.
            model.fit(X_train, y_train)
            yield name, model

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True target values for ``X_test``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and R² of the predictions against ``y_test``.
    """
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    return (
        np.sqrt(mse),
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

def main(file_path=None):
    """Run the full pipeline: load, explore, preprocess, scale, train, compare.

    For every trained model the RMSE, MAE and R² on a 20% hold-out split are
    printed and an actual-vs-predicted scatter plot is shown; finally a
    summary table and the model with the highest R² are printed.

    Args:
        file_path: Path of the dataset to load. Defaults to the original
            hard-coded location when omitted.
    """
    if file_path is None:
        file_path = r'C:\Users\Administrator\Desktop\Miniproj\venv\Energy\household_power_consumption.txt'

    data = load_data(file_path)
    if data is None:
        # Loading failed and the error has already been reported; do not
        # claim success or continue with a missing dataset.
        return
    print("Dataset loaded successfully.")

    eda_before_preprocessing(data)
    data = preprocess_data(data)
    if data is None:
        # Same guard in case preprocessing reported an error and gave up.
        return
    eda_after_preprocessing(data)
    data = scale_data(data)
    if data is None:
        # Same guard in case scaling reported an error and gave up.
        return

    X = data.drop(['Global_active_power', 'Date', 'Time'], axis=1)
    y = data['Global_active_power']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model_performance = []
    for name, model in train_models(X_train, y_train):
        rmse, mae, r2 = evaluate_model(model, X_test, y_test)
        model_performance.append({'Model': name, 'RMSE': rmse, 'MAE': mae, 'R2': r2})

        # Actual-vs-predicted scatter with the y = x reference line.
        y_pred = model.predict(X_test)
        fig = px.scatter(x=y_test, y=y_pred)
        fig.update_layout(title=f'Actual vs Predicted Values for {name}', xaxis_title='Actual Values', yaxis_title='Predicted Values')
        fig.add_scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()], mode='lines', name='Perfect Prediction')
        fig.show()

        print(f'Model: {name}')
        print(f'RMSE: {rmse}')
        print(f'MAE: {mae}')
        print(f'R²: {r2}')
        print('---')

    model_performance_df = pd.DataFrame(model_performance)
    print(model_performance_df)
    # The best model is the row with the highest R² on the hold-out split.
    best_model = model_performance_df.loc[model_performance_df['R2'].idxmax()]
    print(f'Best-performing model: {best_model["Model"]}')

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
