In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under MyDrive can be read by
# path from this Colab runtime.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Location of the cleaned CSV on the mounted Drive.
# NOTE(review): the folder name ends in a dot ("finalPortfolioDatasets.") and
# the file is named for the classification task although this notebook fits
# regressors -- confirm both are intentional.
DATASET_PATH = "/content/drive/MyDrive/finalPortfolioDatasets./Cleaned_dataset_for_classification_task.csv"

# The file is decoded as latin1 explicitly instead of pandas' default UTF-8.
df = pd.read_csv(DATASET_PATH, encoding="latin1")

In [None]:
# Print column names, dtypes and non-null counts to verify the load.
df.info()

In [None]:
# Baseline design matrix: every candidate predictor, selected by column label.
# NOTE(review): 'CO2_per_capita' and 'Emission_Class' are kept in this baseline;
# the notebook's later feature-selection notes flag both as derived from the
# target, so baseline scores should be read with that in mind.
X = df.loc[:, [
    'Year', 'Population(2022)', 'Area', '% of World', 'Density(km2)',
    'CO2_per_capita', 'Emission_Class',
]]

# Regression target: total CO2 emission per row.
y = df.loc[:, 'CO2 emission (Tons)']


In [None]:
# ----------------------------------------------------------------------
# Section banner: primary model 1 (linear model)
# ----------------------------------------------------------------------

# One print call; sep="\n" yields the same three output lines as three prints.
print("=" * 70, "PRIMARY MODEL 1: lINEAR REGRESSION", "=" * 70, sep="\n")


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import Ridge

# Baseline Ridge regression (alpha = L2 regularization strength), fitted on the
# training split. fit() returns the estimator itself, so construction and
# training are chained.
# NOTE(review): the features are passed unscaled; Ridge's penalty is
# scale-sensitive, so confirm the columns are already on comparable scales.
ridge = Ridge(alpha=1.0, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_ridge = ridge.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
# Shared scoring helper used for every regressor in this notebook.
def regression_metrics(y_true, y_pred):
    """Return the standard regression scores for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``"MSE"``, ``"RMSE"``, ``"MAE"`` and ``"R2"`` (in that order);
        RMSE is the square root of the MSE.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }


In [None]:
# Score the baseline Ridge predictions on the held-out split and echo the dict.
metrics_lr = regression_metrics(y_true=y_test, y_pred=y_pred_ridge)
print(metrics_lr)

In [None]:
# Sanity check for the baseline Ridge model.

import pandas as pd

# One row per test sample: the country (looked up in df by the test index) and
# year next to the actual and predicted emissions.
sanity_check = pd.DataFrame(
    dict(
        Country=df.loc[y_test.index, 'Country'],
        Year=X_test['Year'],
        Actual_CO2=y_test,
        Predicted_CO2=y_pred_ridge,
    )
)

import matplotlib.pyplot as plt

# Predicted-vs-actual scatter for the baseline Ridge model. Points on the dashed
# red diagonal (y = x) are perfect predictions.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_ridge, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # perfect line
plt.xlabel('Actual CO2 (Tons)')
plt.ylabel('Predicted CO2 (Tons)')
# The title names the model that produced y_pred_ridge; it was previously
# mislabeled "Random forest".
plt.title('Ridge Regression: Predicted vs Actual CO2')
plt.show()


In [None]:
# ----------------------------------------------------------------------
# Section banner: primary model 2 (decision tree)
# ----------------------------------------------------------------------

# One print call; sep="\n" yields the same three output lines as three prints.
print("=" * 70, "PRIMARY MODEL 2: DESICION TREE", "=" * 70, sep="\n")


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Untuned decision tree with a fixed seed, trained on the raw training split.
dtr = DecisionTreeRegressor(random_state=42)
# Left as a bare last expression so the notebook still displays the fitted
# estimator as the cell output.
dtr.fit(X=X_train, y=y_train)


In [None]:
# Predict the held-out rows with the untuned decision tree and score them with
# the shared helper.
y_pred_dtr = dtr.predict(X_test)
metrics_dtr = regression_metrics(y_true=y_test, y_pred=y_pred_dtr)

# Last expression: rendered as the cell output.
metrics_dtr

In [None]:
# Sanity check for the untuned decision tree: recompute its test predictions.
y_pred = dtr.predict(X_test)
import pandas as pd

# One row per test sample: the country (looked up in df by the test index) and
# year next to the actual and predicted emissions.
sanity_check = pd.DataFrame(
    dict(
        Country=df.loc[y_test.index, 'Country'],
        Year=X_test['Year'],
        Actual_CO2=y_test,
        Predicted_CO2=y_pred,
    )
)

import matplotlib.pyplot as plt

# Predicted-vs-actual scatter for the untuned decision tree. Points on the
# dashed red diagonal (y = x) are perfect predictions.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # perfect line
plt.xlabel('Actual CO2 (Tons)')
plt.ylabel('Predicted CO2 (Tons)')
# The title names the model that produced y_pred (the decision tree); it was
# previously the truncated and misleading "Random".
plt.title('Decision Tree: Predicted vs Actual CO2')
plt.show()



In [None]:
# Side-by-side scores of the two baseline models. Both metric dicts come from
# regression_metrics, so their values are already in MSE/RMSE/MAE/R2 order.
comparison = pd.DataFrame(
    {
        "Metric": ["MSE", "RMSE", "MAE", "R2"],
        "Linear Regression": [*metrics_lr.values()],
        "Decision Tree": [*metrics_dtr.values()],
    }
)

# Last expression: rendered as the cell output.
comparison

R² ≈ 0.002 → Linear Regression explains almost none of the variance.

RMSE/MAE are very small, but this may simply reflect the scale of the target if the data has been normalized or standardized.

Clearly, Linear Regression is not capturing the relationships between features and CO2 emissions.

R² ≈ 0.91 → Decision Tree explains ~91% of variance → very strong.

MSE / RMSE extremely low → predictions are very close to true values.

Decision Tree is much better than Linear Regression on this dataset.

In [None]:
# ----------------------------------------------------------------------
# Section banner: feature selection
# ----------------------------------------------------------------------

# One print call; sep="\n" yields the same three output lines as three prints.
print("=" * 70, "FEATURE SELECTION", "=" * 70, sep="\n")

In [None]:
# ----------------------------------------------------------------------
# Sub-section banner: feature selection for the linear model
# ----------------------------------------------------------------------

# The leading "\n" in the heading is kept, so a blank line still precedes it.
print("_" * 70, "\nLINEAR REGRESSION FEATURE SELECTION", "_" * 70, sep="\n")

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

# Candidate predictors for recursive feature elimination (RFE) and the target.
# NOTE(review): the list still contains "Emission_Class" and "CO2_per_capita",
# which the notebook's own notes describe as derived from the target; they are
# ranked here but left out of the final `selected_features` list.
features = ['Year', 'Population(2022)', 'Area', '% of World', 'Density(km2)', "Emission_Class","CO2_per_capita"]
target = 'CO2 emission (Tons)'

X, y = df[features], df[target]

# Same 80/20 split and seed as the baseline cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# RFE refits the Ridge estimator repeatedly, dropping the weakest feature each
# round until five remain. fit() returns the selector, so it is chained.
# NOTE(review): with a linear estimator RFE ranks by coefficient magnitude,
# which depends on feature scale -- confirm the inputs are on comparable scales.
ridge = Ridge(alpha=1.0, random_state=42)
rfe_ridge = RFE(estimator=ridge, n_features_to_select=5).fit(X_train, y_train)

# One row per feature; rank 1 marks the features RFE kept.
feature_ranking_ridge = pd.DataFrame(
    {
        "Feature": X_train.columns,
        "Ranking": rfe_ridge.ranking_,
        "Selected": rfe_ridge.support_,
    }
).sort_values("Ranking")

print("Ridge Regression Feature Ranking (via RFE):\n", feature_ranking_ridge)


In [None]:
# Bar chart of the Ridge-based RFE ranking (one bar per feature).
plt.figure(figsize=(9, 5))
sns.barplot(
    x="Ranking",
    y="Feature",
    data=feature_ranking_ridge
)

# The labels describe what is actually plotted: the x-axis is the RFE rank
# (1 = selected), not a coefficient magnitude, and the estimator is Ridge.
plt.title("Feature Ranking using RFE (Ridge Regression)")
plt.xlabel("Feature Rank (Lower = More Important)")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

Although the ranking shows that CO2_per_capita is a better feature than Area, there is a high chance of data leakage from CO2_per_capita, since CO2 emission = population × CO2_per_capita. We will therefore omit CO2_per_capita and use the next-best feature, i.e. Density. Since Emission_Class was also indirectly created from CO2 emission, it will likewise be replaced by the next-closest feature, i.e. Population.

In [None]:
# ----------------------------------------------------------------------
# Sub-section banner: feature selection for the decision tree
# ----------------------------------------------------------------------

# The leading "\n" in the heading is kept, so a blank line still precedes it.
print("_" * 70, "\nDECISION TREE FEATURE SELECTION", "_" * 70, sep="\n")

In [None]:
# Rank the same candidate features with a decision tree as the RFE estimator,
# keeping the top five. fit() returns the selector, so it is chained.
dtr = DecisionTreeRegressor(random_state=42)
rfe_dt = RFE(estimator=dtr, n_features_to_select=5).fit(X_train, y_train)

# Column names RFE kept (support_ is a boolean mask over the training columns).
selected_features_dt = X_train.columns[rfe_dt.support_]

# One row per feature; rank 1 marks the features RFE kept.
feature_ranking_dt = pd.DataFrame(
    {
        "Feature": X.columns,
        "Ranking": rfe_dt.ranking_,
        "Selected": rfe_dt.support_,
    }
).sort_values("Ranking")


print("\nDecision Tree Selected Features:", list(selected_features_dt))
print("\nDecision Tree Feature Ranking:\n", feature_ranking_dt)

In [None]:
# Visualize the decision-tree RFE ranking (rank 1 = selected by RFE).

plt.figure(figsize=(9, 5))
sns.barplot(
    x="Ranking",
    y="Feature",
    data=feature_ranking_dt
)

# The title names the estimator used for this ranking (a decision tree); it was
# previously mislabeled "Logistic Regression".
plt.title("Feature Ranking using RFE (Decision Tree)")
plt.xlabel("Feature Rank (Lower = More Important)")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()


Although the ranking shows that CO2_per_capita is a better feature than Area, there is a high chance of data leakage from CO2_per_capita, since CO2 emission = population × CO2_per_capita. We will therefore omit CO2_per_capita and use the next-best feature, i.e. Area.

In [None]:
# Final predictor set used by the tuned models below: the candidate list with
# CO2_per_capita and Emission_Class left out (both flagged in the notes above
# as leaking the target).
selected_features = ['Year', 'Population(2022)', '% of World', 'Density(km2)', 'Area']

In [None]:
# ----------------------------------------------------------------------
# Section banner: hyperparameter tuning
# ----------------------------------------------------------------------

# One print call; sep="\n" yields the same three output lines as three prints.
print("=" * 70, "HYPERPARAMETERE TUNNING", "=" * 70, sep="\n")

In [None]:
# ----------------------------------------------------------------------
# Sub-section banner: decision tree hyperparameter tuning
# ----------------------------------------------------------------------

# The leading "\n" in the heading is kept, so a blank line still precedes it.
print("_" * 70, "\nDECISION TREE HYPERPARAMETER TUNNING", "_" * 70, sep="\n")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error


In [None]:
# Decision tree to be tuned (seeded so the search is repeatable).
dt = DecisionTreeRegressor(random_state=42)

# Search space: tree depth, split/leaf size limits, and the number of features
# considered at each split.
dt_params = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Exhaustive 5-fold grid search scored by negative MSE (higher is better).
dt_grid = GridSearchCV(
    estimator=dt,
    param_grid=dt_params,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Tune on the final feature set only. At this point X_train is still the
# seven-column frame built for RFE, which includes CO2_per_capita and
# Emission_Class; searching on those would pick hyperparameters for a leaky
# model rather than for the features the final tree is trained on.
dt_grid.fit(X_train[selected_features], y_train)

print("Decision Tree Best Hyperparameters:", dt_grid.best_params_)
# best_score_ is negative MSE, so the sign is flipped for reporting.
print("Decision Tree Best CV Score (MSE):", -dt_grid.best_score_)


In [None]:
# ----------------------------------------------------------------------
# Sub-section banner: linear model hyperparameter tuning
# ----------------------------------------------------------------------

# The leading "\n" in the heading is kept, so a blank line still precedes it.
print("_" * 70, "\nLINEAR REGRESSION HYPERPARAMETER TUNNING", "_" * 70, sep="\n")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Design matrix for Ridge tuning: the five non-leaky predictors only.
X = df.loc[:, ['Year', 'Population(2022)', 'Area', '% of World', 'Density(km2)']]

# Regression target.
y = df.loc[:, 'CO2 emission (Tons)']

# Same 80/20 seeded split as elsewhere in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Ridge estimator whose regularization strength is tuned below.
ridge = Ridge(random_state=42)

# Candidate alphas, spaced by powers of ten.
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search scored by R², run on all CPU cores.
ridge_grid = GridSearchCV(ridge, param_grid, scoring='r2', cv=5, n_jobs=-1)

# Run the search on the training split only.
ridge_grid.fit(X_train, y_train)

# Report the winning alpha and its mean cross-validated R².
print("Best alpha:", ridge_grid.best_params_)
print("Best R² Score:", ridge_grid.best_score_)

# Predict the held-out rows with the best estimator found by the search.
best_ridge = ridge_grid.best_estimator_
y_pred_ridge = best_ridge.predict(X_test)


In [None]:
# ----------------------------------------------------------------------
# Section banner: primary models after tuning and feature selection
# ----------------------------------------------------------------------

# One print call; sep="\n" yields the same three output lines as three prints.
print("=" * 70, "PRIMARY MODELS AFTERS TUNNING AND FEATURE SELECTION", "=" * 70, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
# Rebuild the train/test split used by the final (tuned) models.
target = 'CO2 emission (Tons)'

# Restrict X to the chosen predictors instead of the whole frame, so the target
# column (and any other non-feature column) can never reach a model by accident.
# Later X_train[selected_features] lookups keep working on this frame, and the
# split rows are unchanged because the index and seed are the same.
X = df[selected_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Re-run the Ridge alpha search on the selected feature columns only and keep
# the best estimator (GridSearchCV.fit returns the search object, so the
# attribute lookup is chained).
ridge_best = ridge_grid.fit(X_train[selected_features], y_train).best_estimator_

# Test-set predictions from the tuned Ridge model.
y_pred_ridge = ridge_best.predict(X_test[selected_features])


In [None]:
# Show the tuned Ridge estimator, including the alpha chosen by the grid search.
print(ridge_best)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Score the tuned Ridge model with the notebook's shared helper instead of
# re-deriving each metric inline, so every model is scored by the same code.
# The result is the same dict with keys "MSE", "RMSE", "MAE", "R2".
ridge_metrics = regression_metrics(y_test, y_pred_ridge)

# Last expression: rendered as the cell output.
ridge_metrics


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Final decision tree: use the hyperparameters found by the grid search rather
# than hand-typed values (the previous hard-coded min_samples_leaf=5 was not
# even part of the searched grid). This mirrors the tuned Ridge cell: the
# search is re-run on the selected features, and because GridSearchCV refits
# the winner on the full training split by default, best_estimator_ is ready
# to predict.
dt_grid.fit(X_train[selected_features], y_train)
dt_final = dt_grid.best_estimator_

# Test-set predictions from the tuned tree.
y_pred_dt = dt_final.predict(X_test[selected_features])


In [None]:
# Score the final decision tree with the notebook's shared helper instead of
# re-deriving each metric inline, so every model is scored by the same code.
# The result is the same dict with keys "MSE", "RMSE", "MAE", "R2".
dt_metrics = regression_metrics(y_test, y_pred_dt)

# Last expression: rendered as the cell output.
dt_metrics


In [None]:
# Side-by-side scores of the two tuned models; both metric dicts list their
# values in MSE/RMSE/MAE/R2 order, matching the "Metric" column.
comparison = pd.DataFrame(
    {
        "Metric": ["MSE", "RMSE", "MAE", "R2"],
        "Linear Regression": [*ridge_metrics.values()],
        "Decision Tree": [*dt_metrics.values()],
    }
)

# Last expression: rendered as the cell output.
comparison


In [None]:
# ----------------------------------------------------------------------
# Section banner: neural network for the regression task
# ----------------------------------------------------------------------

# One print call; sep="\n" yields the same three output lines as three prints.
print("=" * 70, "NEURAL NETWORK FOR REGRESSION TASK", "=" * 70, sep="\n")


In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Predictors for the neural network (same five columns as `selected_features`)
# and the regression target.
features = ['Year', 'Population(2022)', '% of World', 'Density(km2)', 'Area']
X, y = df[features], df['CO2 emission (Tons)']

In [None]:
# Hold out 20% of the rows for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Two-step pipeline: standardize the inputs, then fit a multi-layer perceptron
# with two hidden layers (100 and 50 units), ReLU activations and the Adam
# optimizer, capped at 1000 iterations.
# NOTE(review): only the inputs are scaled; the target is passed through as-is.
# Confirm the target is already on a scale the network can fit.
nn_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "nn",
            MLPRegressor(
                hidden_layer_sizes=(100, 50),
                activation="relu",
                solver="adam",
                max_iter=1000,
                random_state=42,
            ),
        ),
    ]
)

# Pipeline.fit returns the pipeline, so training and test prediction are chained.
y_pred_nn = nn_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Neural network on the test set: scored with the notebook's shared helper so
# it is evaluated by exactly the same code as the other models.
nn_test_metrics = regression_metrics(y_test, y_pred_nn)

# Keep the individual metric names this cell has always exposed.
mse = nn_test_metrics["MSE"]
rmse = nn_test_metrics["RMSE"]
mae = nn_test_metrics["MAE"]
r2 = nn_test_metrics["R2"]

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")


In [None]:
# Neural network on the training set: comparing these scores with the test-set
# scores shows how much the network overfits.
y_train_pred = nn_pipeline.predict(X_train)

# Scored with the notebook's shared helper so the train and test numbers are
# computed by the same code.
nn_train_metrics = regression_metrics(y_train, y_train_pred)

# Keep the individual metric names this cell has always exposed.
mse = nn_train_metrics["MSE"]
rmse = nn_train_metrics["RMSE"]
mae = nn_train_metrics["MAE"]
r2 = nn_train_metrics["R2"]

print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R²: {r2:.4f}")

