In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
file_path = "Carbon Emission.csv"  # Update path if needed
df = pd.read_csv(file_path)

# Define features (X) and target variable (y)
X = df.drop(columns=["CarbonEmission"])
y = df["CarbonEmission"]

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Decide the 80% / 20% train/test row split BEFORE any preprocessing is
# fitted. Imputation values, encoder categories and scaling statistics must
# be learned from training rows only; fitting them on the full dataset lets
# information from the test rows leak into the model and inflates the test
# metrics. Splitting row positions with the same test_size/random_state
# selects the same rows as splitting the feature matrix itself would.
row_positions = np.arange(len(X))
train_rows, test_rows = train_test_split(
    row_positions, test_size=0.2, random_state=42
)

# Fill missing categorical values with the mode (most frequent value) of the
# training rows. A column with no non-null training values has no mode and is
# left as-is instead of raising IndexError.
for col in categorical_cols:
    train_mode = X[col].iloc[train_rows].mode()
    if not train_mode.empty:
        X[col] = X[col].fillna(train_mode.iloc[0])

# One-hot encode categorical variables. The encoder learns its categories
# from the training rows; handle_unknown="ignore" encodes a category that
# only appears in the test rows as all zeros.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoder.fit(X[categorical_cols].iloc[train_rows])
X_encoded = encoder.transform(X[categorical_cols])

# Convert to DataFrame
X_encoded_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out())

# Reset index to align numerical and encoded data before concatenation
X_numerical = X[numerical_cols].reset_index(drop=True)
X_encoded_df = X_encoded_df.reset_index(drop=True)

# Combine numerical and encoded categorical data
X_final = pd.concat([X_numerical, X_encoded_df], axis=1)

# Standardize the data; mean and variance come from the training rows only
scaler = StandardScaler()
scaler.fit(X_final.iloc[train_rows])
X_scaled = scaler.transform(X_final)

# Materialize the train and test sets from the row split chosen above
X_train, X_test = X_scaled[train_rows], X_scaled[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Initialize Linear Regression Model
lr = LinearRegression()

# Train the model
lr.fit(X_train, y_train)

# Predict on test data
y_pred_lr = lr.predict(X_test)

# Calculate RMSE and R² Score
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)

# Print evaluation metrics.
# NOTE: these values can differ slightly from results recorded with a
# pipeline that fitted the preprocessing on the full dataset.
print(f"RMSE: {rmse_lr}")
print(f"R² Score: {r2_lr}")


RMSE: 261.61410875518663
R² Score: 0.9341719789400728
