In [1]:
import pandas as pd
import numpy as np
import io
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder


In [10]:
import pandas as pd

# ONE-LINER: Read the ; file and immediately save as real CSV
pd.read_csv('datasets/household_power_consumption.txt', sep=';', low_memory=False) \
  .to_csv('datasets/household_power_consumption.csv', index=False)

print("Done! Your file is now converted to real CSV → household_power_consumption.csv")

Done! Your file is now converted to real CSV → household_power_consumption.csv


In [4]:

# ==========================================
# 1. LOAD DATASETS (Using pd.read_csv)
# ==========================================

# READ CSV 1 (Standard)
df_carbon = pd.read_csv(r"E:\Carbon_footprint\training_models\datasets\Carbon_Emission.csv")

# READ CSV 2 (Semicolon separated)
df_energy = pd.read_csv(r"E:\Carbon_footprint\training_models\datasets\household_power_consumption.csv")

print(">>> Datasets Loaded Successfully via read_csv")
print(f"Carbon Data Columns: {df_carbon.columns.tolist()}")
print(f"Energy Data Columns: {df_energy.columns.tolist()}")
print("-" * 50)


>>> Datasets Loaded Successfully via read_csv
Carbon Data Columns: ['Body Type', 'Sex', 'Diet', 'How Often Shower', 'Heating Energy Source', 'Transport', 'Vehicle Type', 'Social Activity', 'Monthly Grocery Bill', 'Frequency of Traveling by Air', 'Vehicle Monthly Distance Km', 'Waste Bag Size', 'Waste Bag Weekly Count', 'How Long TV PC Daily Hour', 'How Many New Clothes Monthly', 'How Long Internet Daily Hour', 'Energy efficiency', 'Recycling', 'Cooking_With', 'CarbonEmission']
Energy Data Columns: ['Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3']
--------------------------------------------------


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score


# 2. Separate Features and Target
target_col = 'CarbonEmission'
X = df_carbon.drop(columns=[target_col])
y = df_carbon[target_col]

# 3. Identify Column Types
# select_dtypes returns a subset dataframe, we get the column names
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# 4. Create Preprocessing Pipeline
# Numerical: Fill missing values with mean, then Scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical: Fill missing with 'missing', then OneHotEncode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# 5. Create Model Pipeline
carbon_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# 6. Split and Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
carbon_model.fit(X_train, y_train)

# 7. Evaluate
y_pred = carbon_model.predict(X_test)
print(f"   Carbon Model R2 Score: {r2_score(y_test, y_pred):.4f}")
print(f"   Carbon Model RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")



   Carbon Model R2 Score: 0.9216
   Carbon Model RMSE: 285.4725


In [None]:
# ==========================================
# MODEL 2: HOUSEHOLD POWER CONSUMPTION
# ==========================================
print("\n[2/2] Processing Energy Consumption Model...")



# Combine Date and Time into a datetime object to extract features
df_energy['datetime'] = pd.to_datetime(df_energy['Date'] + ' ' + df_energy['Time'], dayfirst=True)

# Feature Engineering from DateTime
df_energy['hour'] = df_energy['datetime'].dt.hour
df_energy['month'] = df_energy['datetime'].dt.month
df_energy['day_of_week'] = df_energy['datetime'].dt.dayofweek

# Drop non-numeric columns used for extraction
df_energy = df_energy.drop(columns=['Date', 'Time', 'datetime'])

# 3. Handle Missing Values
# This dataset often has missing rows. We will drop them for speed or impute.
df_energy = df_energy.dropna()

# 4. Define Target
# We will predict 'Global_active_power' based on other metrics + time features
target_energy = 'Global_active_power'
X_e = df_energy.drop(columns=[target_energy])
y_e = df_energy[target_energy]

# 5. Split and Train
# Using a smaller test size or temporal split is better for time series, 
# but random split is used here for general regression demonstration.
X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(X_e, y_e, test_size=0.2, random_state=42)

# Initialize Regressor (No OneHot needed as all remaining inputs are numeric)
energy_model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)

print("   Training Energy Model (this might take a moment)...")
energy_model.fit(X_train_e, y_train_e)

# 6. Evaluate
y_pred_e = energy_model.predict(X_test_e)
print(f"   Energy Model R2 Score: {r2_score(y_test_e, y_pred_e):.4f}")
print(f"   Energy Model RMSE: {np.sqrt(mean_squared_error(y_test_e, y_pred_e)):.4f}")


print(f"   Error training Energy model: {e}")
# Fallback: If separator was actually comma
print("   Tip: If the error mentions columns not found, check if the CSV delimiter is ',' or ';'")

print("\n--- Training Complete ---")


[2/2] Processing Energy Consumption Model...


KeyError: 'Date'

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score

# ==========================================
# 1. LOAD DATASETS
# ==========================================
print("--- Loading Datasets ---")

# PATHS (Adjusted based on your input)
carbon_path = r"E:\Carbon_footprint\training_models\datasets\Carbon_Emission.csv" # Added .csv
energy_path = r"E:\Carbon_footprint\training_models\datasets\household_power_consumption.csv"

# LOAD CARBON (Standard CSV)
try:
    df_carbon = pd.read_csv(carbon_path)
    print("1. Carbon Data Loaded Successfully")
except FileNotFoundError:
    print(f"Error: Carbon file not found at {carbon_path}")

# LOAD ENERGY (Fix: Uses semicolon separator and '?' for missing values)
try:
    df_energy = pd.read_csv(energy_path, sep=';', low_memory=False, na_values=['?', 'nan'])
    print("2. Energy Data Loaded Successfully")
    # verification
    if 'Date' not in df_energy.columns:
        # Fallback if the specific file is actually comma separated
        print("   Warning: Semicolon separator failed. Trying comma...")
        df_energy = pd.read_csv(energy_path, sep=',')
except FileNotFoundError:
    print(f"Error: Energy file not found at {energy_path}")

print("-" * 50)

# ==========================================
# MODEL 1: CARBON EMISSION
# ==========================================
if 'df_carbon' in locals():
    print("\n[1/2] Training Carbon Emission Model...")
    
    # Define features and target
    target_col = 'CarbonEmission'
    X = df_carbon.drop(columns=[target_col])
    y = df_carbon[target_col]

    # Preprocessing
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Pipeline
    carbon_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
    ])

    # Train
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    carbon_model.fit(X_train, y_train)
    
    # Evaluate
    print(f"   Carbon R2 Score: {carbon_model.score(X_test, y_test):.4f}")

# ==========================================
# MODEL 2: HOUSEHOLD POWER CONSUMPTION
# ==========================================
if 'df_energy' in locals():
    print("\n[2/2] Processing Energy Consumption Model...")

    try:
        # 1. Date Parsing
        # Combining Date and Time. 'dayfirst=True' is crucial for this dataset format (dd/mm/yyyy)
        df_energy['datetime'] = pd.to_datetime(df_energy['Date'] + ' ' + df_energy['Time'], dayfirst=True)

        # 2. Feature Engineering
        df_energy['hour'] = df_energy['datetime'].dt.hour
        df_energy['month'] = df_energy['datetime'].dt.month
        df_energy['day_of_week'] = df_energy['datetime'].dt.dayofweek

        # 3. Clean up
        df_energy = df_energy.drop(columns=['Date', 'Time', 'datetime'])
        df_energy = df_energy.dropna() # Drop rows with missing values

        # 4. Define Target (Global_active_power)
        target_energy = 'Global_active_power'
        X_e = df_energy.drop(columns=[target_energy])
        y_e = df_energy[target_energy]

        # 5. Train
        X_train_e, X_test_e, y_train_e, y_test_e = train_test_split(X_e, y_e, test_size=0.2, random_state=42)
        
        energy_model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
        print("   Training Energy Model (this may take a minute)...")
        energy_model.fit(X_train_e, y_train_e)

        # 6. Evaluate
        y_pred_e = energy_model.predict(X_test_e)
        print(f"   Energy Model R2 Score: {r2_score(y_test_e, y_pred_e):.4f}")
        print(f"   Energy Model RMSE: {np.sqrt(mean_squared_error(y_test_e, y_pred_e)):.4f}")

    except KeyError as e:
        print(f"   KeyError: {e}. Check if the columns loaded correctly.")
        print(f"   Current Columns: {df_energy.columns.tolist()}")
    except Exception as e:
        print(f"   An error occurred: {e}")

print("\n--- Training Complete ---")

--- Loading Datasets ---
1. Carbon Data Loaded Successfully
2. Energy Data Loaded Successfully
   Detected Columns: ['Date', 'Time', 'Global_active_power', 'Global_reactive_power'] ...
--------------------------------------------------

[1/2] Training Carbon Emission Model...
   Carbon Model R2 Score: 0.9199

[2/2] Processing Energy Consumption Model...
   Parsing dates...
   Training Energy Model...


KeyboardInterrupt: 