In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import Ridge

In [2]:
df = pd.read_csv("cleaneddata.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Country   995 non-null    object 
 1   ISO3      995 non-null    object 
 2   Industry  995 non-null    object 
 3   Gas_Type  995 non-null    object 
 4   F2010     995 non-null    float64
 5   F2011     995 non-null    float64
 6   F2012     995 non-null    float64
 7   F2013     995 non-null    float64
 8   F2014     995 non-null    float64
 9   F2015     995 non-null    float64
 10  F2016     995 non-null    float64
 11  F2017     995 non-null    float64
 12  F2018     995 non-null    float64
 13  F2019     995 non-null    float64
 14  F2020     995 non-null    float64
 15  F2021     995 non-null    float64
 16  F2022     995 non-null    float64
dtypes: float64(13), object(4)
memory usage: 132.3+ KB


In [4]:
#### Model 1 : Linear Regression (from scratch)

In [5]:
def mean_squared_error(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def r2_score(y_true, y_pred):
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    return 1 - (ss_residual / ss_total)

class LinearRegressionScratch:
    def __init__(self):
        self.theta = None  
    
    def fit(self, X, y):
        # Add bias term (column of ones)
        X_b = np.c_[np.ones((X.shape[0], 1)), X]
        
        # Normal Equation: theta = (X'X)^(-1) X'y
        self.theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
    
    def predict(self, X):
        X_b = np.c_[np.ones((X.shape[0], 1)), X]  
        return X_b.dot(self.theta)



X = df[['F2011', 'F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019', 'F2020', 'F2021', 'F2022']].values  
y = df['F2010'].values


split_idx = int(0.8 * len(X))  # 80% train, 20% test
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]


lr = LinearRegressionScratch()
lr.fit(X_train, y_train)


y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

print(y_pred_test)


print("\nLinear Regression from Scratch:")
print("Train MSE:", mean_squared_error(y_train, y_pred_train))
print("Test MSE:", mean_squared_error(y_test, y_pred_test))
print("Train R-squared:", r2_score(y_train, y_pred_train))
print("Test R-squared:", r2_score(y_test, y_pred_test))


[7.60165113e+00 1.49147151e+00 2.48041412e+02 1.73018987e+01
 2.74510046e+02 5.60365792e+00 3.98222846e+00 1.24835998e+01
 1.60751893e-01 2.02891578e+01 7.73375353e+00 3.30203813e-01
 5.38805299e+01 1.62831524e+01 7.16799051e+01 5.51468321e-01
 1.38390573e+00 2.42440149e+02 5.54162886e+00 2.55469438e+02
 4.87554954e+00 3.03126207e+00 4.18535262e+01 1.43865982e+02
 4.61496634e+01 1.10765396e+02 9.01727324e-01 1.12816048e+02
 3.54175408e-01 1.21390089e+00 1.00555209e+01 3.62518481e-01
 6.39457722e+01 4.99036145e+01 4.04326962e+00 4.77264017e+00
 5.83839724e+02 4.21326683e+02 1.58019835e+02 8.81412863e+00
 9.03477669e+00 2.73647647e-01 2.26434586e-01 2.75794288e+02
 2.83907788e+02 2.80852695e-01 8.11208143e+00 1.77019604e+02
 1.90449700e+01 2.09773416e+02 3.63229423e+00 1.04956989e+01
 7.35774095e+01 4.07012070e+02 3.33316097e+02 3.97997498e-01
 8.21010184e+00 1.64289867e+00 1.93513982e+01 8.81113650e+00
 1.10641244e+00 9.04946823e+01 1.60212242e+02 6.06323778e+01
 9.36461618e+00 7.452917

In [6]:
### Model 2 : Random Forest Regression

In [7]:

X = df[['F2011', 'F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019', 'F2020', 'F2021', 'F2022']].values  
y = df['F2010'].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)


y_pred_train = rf_regressor.predict(X_train)
y_pred_test = rf_regressor.predict(X_test)

print(y_pred_test)


print("\nRandom Forest Regression:")
print("Train MSE:", mean_squared_error(y_train, y_pred_train))
print("Test MSE:", mean_squared_error(y_test, y_pred_test))
print("Train R-squared:", r2_score(y_train, y_pred_train))
print("Test R-squared:", r2_score(y_test, y_pred_test))

[3.85851025e+01 8.58772071e+01 1.93145436e+02 2.70389074e+01
 7.34538613e+01 2.58152242e+02 9.81474892e+01 2.57399901e+01
 9.72374546e+00 1.13965209e+00 7.06933434e+00 2.72222120e+01
 3.48921266e+00 2.42761514e+02 5.82680422e+02 2.12109522e+01
 3.76753605e+02 1.70310669e-02 2.29224285e+01 2.84744666e+02
 1.65981406e+02 2.23387105e+00 1.43630506e+02 8.69557847e+00
 2.59978496e+00 1.82236771e+01 2.30157186e+02 1.95687718e+01
 2.50642789e-01 4.29352192e-03 1.76145923e-01 1.74740865e-01
 2.49111355e-01 5.42370828e+00 1.56062493e+02 4.17111388e-01
 2.61183385e+02 9.53903562e-01 7.52241301e+02 1.62768904e-01
 8.99840988e+00 3.33918105e+02 2.13475844e+02 1.12315486e+01
 4.29169877e+00 2.03504084e+01 2.05645858e+01 4.94560586e+00
 1.09895122e+02 1.29354573e+01 5.82228494e+00 1.79081918e-01
 7.15294412e+01 3.34987599e+00 1.30386503e+02 3.34613168e+01
 9.62440824e-01 1.77794875e-02 1.46905367e+02 1.52176491e-02
 3.72507810e+02 3.53950055e+00 2.46473441e+01 1.90511363e+01
 5.49553607e+00 2.766141

In [8]:
### Model 3 : Ridge Regression

In [9]:

X = df[['F2011', 'F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019', 'F2020', 'F2021', 'F2022']].values  
y = df['F2010'].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


ridge_regressor = Ridge(alpha=1.0)
ridge_regressor.fit(X_train, y_train)


y_pred_train = ridge_regressor.predict(X_train)
y_pred_test = ridge_regressor.predict(X_test)

print(y_pred_test)
print("\nRidge Regression:")
print("Train MSE:", mean_squared_error(y_train, y_pred_train))
print("Test MSE:", mean_squared_error(y_test, y_pred_test))
print("Train R-squared:", r2_score(y_train, y_pred_train))
print("Test R-squared:", r2_score(y_test, y_pred_test))


[3.92202059e+01 8.04352980e+01 1.96850744e+02 2.69031640e+01
 7.72286889e+01 2.44936415e+02 9.06447020e+01 2.44662510e+01
 9.99776462e+00 1.46579702e+00 7.33760855e+00 2.71252272e+01
 3.79720740e+00 2.47060416e+02 5.82420813e+02 2.16591021e+01
 3.70652910e+02 3.19944065e-01 2.23534775e+01 2.95122721e+02
 1.60348651e+02 2.51303816e+00 1.42050827e+02 9.36849380e+00
 2.93127074e+00 1.89247739e+01 2.32933331e+02 1.93511491e+01
 5.34867482e-01 3.08405692e-01 4.85320339e-01 4.86086822e-01
 5.55095461e-01 5.04905481e+00 1.60574816e+02 7.10877824e-01
 2.66787408e+02 1.25934616e+00 7.44704611e+02 4.63235118e-01
 9.32380133e+00 3.56754179e+02 2.14889404e+02 1.17460618e+01
 4.55339521e+00 2.11870594e+01 2.17275982e+01 5.24782053e+00
 1.11869047e+02 1.30997007e+01 5.63949588e+00 5.03760025e-01
 7.15543477e+01 3.64591453e+00 1.28092800e+02 3.28529368e+01
 1.26513418e+00 3.27025033e-01 1.44268087e+02 3.20117582e-01
 3.82147007e+02 3.76161597e+00 2.42661240e+01 1.98548363e+01
 5.93095496e+00 3.015103

In [12]:
### Hyper-parameter optimization 

In [13]:
# Task 4: Hyperparameter Optimization with Cross-Validation
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30]
}

grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, scoring='r2', cv=5, verbose=1)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters for Random Forest:", grid_search.best_params_)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Hyperparameters for Random Forest: {'max_depth': 20, 'n_estimators': 50}


In [14]:
### Feature Selection

In [18]:
# Prepare X and y as per your dataset
X = df[['F2011', 'F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019', 'F2020', 'F2021', 'F2022']].values
y = df['F2010'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SelectKBest for feature selection using f_regression
feature_selector = SelectKBest(score_func=f_regression, k=5)

# Fit the model to the training data and transform the training set
X_train_selected = feature_selector.fit_transform(X_train, y_train)

# Transform the test data using the fitted selector
X_test_selected = feature_selector.transform(X_test)

# Get the indices of the selected features
selected_features_indices = feature_selector.get_support(indices=True)

# Get the names of the selected features
selected_features = ['F2011', 'F2012', 'F2013', 'F2014', 'F2015', 'F2016', 'F2017', 'F2018', 'F2019', 'F2020', 'F2021', 'F2022']
selected_features = [selected_features[i] for i in selected_features_indices]

# Print the selected features
print("Selected Features:", selected_features)


Selected Features: ['F2011', 'F2012', 'F2013', 'F2014', 'F2015']


In [19]:
rf_final_model = RandomForestRegressor(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    random_state=42
)

rf_final_model.fit(X_train_selected, y_train)
final_pred = rf_final_model.predict(X_test_selected)

print("Final Model Performance:")
print("Test MSE:", mean_squared_error(y_test, final_pred))
print("Test R-squared:", r2_score(y_test, final_pred))

Final Model Performance:
Test MSE: 51.33572507516754
Test R-squared: 0.9978156517888755
