# In [None]:
# 1. Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# 2. Load and Inspect Data
def load_data(filepath):
    """Load the semicolon-delimited power-consumption file into a DataFrame.

    The ``Date`` and ``Time`` columns are merged into a single ``datetime``
    column, which is placed first; ``'?'`` entries are read as NaN.

    Args:
        filepath: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        pd.DataFrame: ``datetime`` followed by the remaining columns in
        file order.

    Raises:
        ValueError: If a timestamp does not match ``dd/mm/yyyy HH:MM:SS``.
    """
    # Read Date/Time as plain strings and combine them ourselves: the
    # ``parse_dates={...}`` / ``infer_datetime_format`` route is deprecated
    # in pandas 2.x.
    raw = pd.read_csv(filepath, sep=';',
                      dtype={'Date': str, 'Time': str},
                      low_memory=False,
                      na_values=['?'])

    # Explicit day-first format so parsing never depends on inference.
    # NOTE(review): assumes the UCI layout (e.g. "16/12/2006;17:24:00").
    timestamps = pd.to_datetime(raw['Date'] + ' ' + raw['Time'],
                                format='%d/%m/%Y %H:%M:%S')

    df = raw.drop(columns=['Date', 'Time'])
    # Keep 'datetime' as the first column, matching the previous layout.
    df.insert(0, 'datetime', timestamps)
    return df

df = load_data('household_power_consumption.txt')

# Quick structural overview of the raw frame: size first, then one
# headed section per summary table.
print(f"Data shape: {df.shape}")
for heading, summary in (
    ("\nMissing values per column:", df.isnull().sum()),
    ("\nData types:", df.dtypes),
):
    print(heading)
    print(summary)

# 3. Data Cleaning
def clean_data(df):
    """Drop incomplete rows, coerce measurements to numeric, index by time.

    Args:
        df (pd.DataFrame): Frame containing a ``datetime`` column plus
            measurement columns.

    Returns:
        pd.DataFrame: A new frame indexed by ``datetime`` whose remaining
        columns are numeric. The input frame is left untouched.

    Raises:
        ValueError: If a measurement column holds a value that
            ``pd.to_numeric`` cannot parse.
        KeyError: If ``df`` has no ``datetime`` column.
    """
    # Explicit copy so the assignments below never write into (or warn
    # about) a view of the caller's frame.
    df_clean = df.dropna().copy()

    # Select measurement columns by name rather than position, so the
    # result does not depend on 'datetime' being the first column.
    numeric_cols = [col for col in df_clean.columns if col != 'datetime']
    for col in numeric_cols:
        df_clean[col] = pd.to_numeric(df_clean[col])

    return df_clean.set_index('datetime')

# Cleaned, datetime-indexed frame consumed by the EDA and feature steps below.
df_clean = clean_data(df)

# 4. Exploratory Data Analysis (EDA)
def plot_consumption_trends(df):
    """Draw a two-panel figure of daily-mean consumption.

    The upper panel shows the daily mean of ``Global_active_power``; the
    lower panel overlays the daily means of ``Sub_metering_1`` to
    ``Sub_metering_3``.

    Args:
        df (pd.DataFrame): Datetime-indexed frame with the columns above.
    """
    daily_means = df.resample('D').mean()
    days = daily_means.index

    plt.figure(figsize=(15, 8))

    # Upper panel: overall active power.
    plt.subplot(2, 1, 1)
    plt.plot(days, daily_means['Global_active_power'])
    plt.title('Daily Average Active Power Consumption')
    plt.ylabel('kW')

    # Lower panel: one line per sub-meter.
    plt.subplot(2, 1, 2)
    for meter in (1, 2, 3):
        column = f'Sub_metering_{meter}'
        plt.plot(days, daily_means[column], label=f'Sub-meter {meter}')
    plt.ylabel('Watt-hours')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Plot daily-average trends for total active power and the three sub-meters.
plot_consumption_trends(df_clean)

def visual_analysis(df):
    """
    Performs visual analysis on the household electric power consumption dataset.

    Shows three figures (daily mean of ``Global_active_power``, a box plot of
    ``Global_active_power`` grouped by hour of day, and a correlation
    heatmap), then prints descriptive statistics and the percentage of
    missing values per column.

    The input frame is not modified.

    Args:
        df (pd.DataFrame): The input DataFrame containing the dataset,
            indexed by datetime.
    """
    # 1. Time Series
    plt.figure(figsize=(12, 6))
    df_daily = df.resample('D')['Global_active_power'].mean()
    plt.plot(df_daily.index, df_daily)
    plt.title('Daily Average Global Active Power')
    plt.xlabel('Time')
    plt.ylabel('Daily Average Global Active Power (kW)')
    plt.show()

    # 2. Box Plots
    # Use a plot-only frame: writing df['hour'] here would silently add a
    # column to the caller's DataFrame.
    hourly = df[['Global_active_power']].assign(hour=df.index.hour)
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='hour', y='Global_active_power', data=hourly)
    plt.title('Global Active Power by Hour of Day')
    plt.xlabel('Hour of Day')
    plt.ylabel('Global Active Power (kW)')
    plt.show()

    # 3. Correlation Matrix
    plt.figure(figsize=(10, 8))
    corr_matrix = df[['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity',
                       'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

    # 4. Summary tables
    descriptive_stats = df[['Global_active_power', 'Voltage', 'Global_intensity',
                           'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']].describe()
    print(descriptive_stats)

    # Reported over the columns the caller passed in (no helper columns).
    missing_percentage = (df.isnull().sum() / len(df)) * 100
    print("\nMissing Value Percentages:")
    print(missing_percentage)

# Show the daily series, hourly box plot and correlation heatmap, then print summary tables.
visual_analysis(df_clean)

# 5. Feature Engineering
def create_features(df):
    """Add calendar and lagged-consumption features.

    Adds ``hour``, ``day_of_week`` and ``month`` from the index, plus
    ``lag_24h`` and ``lag_168h``: the ``Global_active_power`` value recorded
    exactly 24 and 168 hours before each row's timestamp.

    Lags are looked up by timestamp rather than by row offset, so they stay
    correct at any sampling rate and when rows are missing. For gap-free
    hourly data this equals ``shift(24)`` / ``shift(168)``.

    Args:
        df (pd.DataFrame): Datetime-indexed frame with a
            ``Global_active_power`` column.

    Returns:
        pd.DataFrame: A copy of ``df`` with the new columns, restricted to
        rows with no missing values (in particular, rows for which both
        lagged readings exist).
    """
    df_feat = df.copy()

    df_feat['hour'] = df_feat.index.hour
    df_feat['day_of_week'] = df_feat.index.dayofweek
    df_feat['month'] = df_feat.index.month

    power = df_feat['Global_active_power']
    # reindex needs unique labels on the lookup side; keep the last reading
    # if a timestamp is repeated.
    lookup = power[~power.index.duplicated(keep='last')]
    for hours in (24, 168):
        earlier = df_feat.index - pd.Timedelta(hours=hours)
        # Timestamps with no reading `hours` earlier become NaN.
        df_feat[f'lag_{hours}h'] = lookup.reindex(earlier).to_numpy()

    return df_feat.dropna()

# Add calendar and lag columns; rows lacking either lag value are dropped.
df_features = create_features(df_clean)

# 6. Modeling
# Target is Global_active_power; every other column is used as a feature.
# NOTE(review): X still contains same-timestamp measurements (e.g.
# Global_intensity) alongside the lag features — confirm that is intended
# for a forecasting setup.
X = df_features.drop('Global_active_power', axis=1)
y = df_features['Global_active_power']

# Hold out the final 30 days by timestamp. A fixed row count (24 * 30) only
# equals 30 days for gap-free hourly data, and df_features is never
# resampled, so it is at the source file's native sampling rate.
test_period = pd.Timedelta(days=30)
is_test = X.index > X.index.max() - test_period
test_size = int(is_test.sum())  # number of held-out rows
X_train, X_test = X[~is_test], X[is_test]
y_train, y_test = y[~is_test], y[is_test]

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Fit each model, score it on the held-out period, and plot its predictions.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    results[name] = {
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }

    plt.figure(figsize=(10, 4))
    plt.plot(y_test.index, y_test, label='Actual')
    plt.plot(y_test.index, y_pred, label='Predicted', alpha=0.7)
    plt.title(f'{name} - Actual vs Predicted')
    plt.legend()
    plt.show()

# 7. Results Analysis
# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print("\nModel Performance Comparison:")
print(results_df)

if 'Random Forest' in models:
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': models['Random Forest'].feature_importances_
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Random Forest Feature Importance')
    plt.show()

    # Saved inside the guard: feature_importance only exists when the
    # Random Forest model is present, so an unconditional save would raise
    # NameError otherwise.
    feature_importance.to_csv('feature_importance.csv', index=False)

# 8. Save Outputs
results_df.to_csv('model_results.csv')