In [1]:
# Import pandas and numpy
import pandas as pd 
import numpy as np

# Import for splitting data into train & test for the ML models
from sklearn.model_selection import train_test_split

# Import Machine Learning Linear regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score

# Import neural network modules
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Import standard scaler for scaling the data
from sklearn.preprocessing import StandardScaler

# Import metrics to calculate accuracy of models
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the cleaned country data CSV file
df = pd.read_csv("merged_countries.csv", encoding ="unicode_escape")
df.head()

Unnamed: 0,country_name,Year,urbanization,gdp_per_cap,inflation,gdp_current,unemployment,literacy_adult,health_exp,literacy_youth,life_exp,re_consumption,ff_consumption,energy_use
0,Canada,1971,76.09,4520.162878,2.704918,99271960000.0,6.4,3042833000.0,3042833000.0,3042833000.0,73.029268,3042833000.0,84.15415,6436.226256
1,Algeria,1971,39.665,359.824582,2.626642,5077222000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,43.67,3042833000.0,98.99982,245.527602
2,Australia,1971,84.16,3494.97331,6.138107,45214470000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,71.068293,3042833000.0,91.17778,3989.629744
3,Austria,1971,65.301,2380.978458,4.704258,17858490000.0,1.5,3042833000.0,3042833000.0,3042833000.0,70.114634,3042833000.0,90.19969,2508.520919
4,Belgium,1971,93.976,3082.927989,4.343629,29821660000.0,2.1,3042833000.0,3042833000.0,3042833000.0,71.060488,3042833000.0,3042833000.0,4099.95532


In [3]:
# Checking datatypes of each columns
df.dtypes

country_name       object
Year                int64
urbanization      float64
gdp_per_cap       float64
inflation         float64
gdp_current       float64
unemployment      float64
literacy_adult    float64
health_exp        float64
literacy_youth    float64
life_exp          float64
re_consumption    float64
ff_consumption    float64
energy_use        float64
dtype: object

In [4]:
world_df = df.copy()
world_df.head()

Unnamed: 0,country_name,Year,urbanization,gdp_per_cap,inflation,gdp_current,unemployment,literacy_adult,health_exp,literacy_youth,life_exp,re_consumption,ff_consumption,energy_use
0,Canada,1971,76.09,4520.162878,2.704918,99271960000.0,6.4,3042833000.0,3042833000.0,3042833000.0,73.029268,3042833000.0,84.15415,6436.226256
1,Algeria,1971,39.665,359.824582,2.626642,5077222000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,43.67,3042833000.0,98.99982,245.527602
2,Australia,1971,84.16,3494.97331,6.138107,45214470000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,71.068293,3042833000.0,91.17778,3989.629744
3,Austria,1971,65.301,2380.978458,4.704258,17858490000.0,1.5,3042833000.0,3042833000.0,3042833000.0,70.114634,3042833000.0,90.19969,2508.520919
4,Belgium,1971,93.976,3082.927989,4.343629,29821660000.0,2.1,3042833000.0,3042833000.0,3042833000.0,71.060488,3042833000.0,3042833000.0,4099.95532


### Linear Regression model

In [5]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "Year","country_name", "health_exp", "literacy_youth", "literacy_adult"], axis=1)
y = world_df["gdp_current"]

In [6]:
# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
# Create the linear regression model
lr_model = LinearRegression()

# Fit the model 
lr_model.fit(X_train, y_train)

LinearRegression()

To assess the impact of each features on the target, I am using the coefficients of linear regression model.

In [8]:
# Determine feature coefficients
coefficients = lr_model.coef_

# Sort the coefficients in descending absolute value order
sorted_index = np.abs(coefficients).argsort()[::-1]

# Print the feature names and their coefficients
print("Feature Coefficients:")
for index in sorted_index:
    print("{}: {}".format(X.columns[index], coefficients[index]))

Feature Coefficients:
life_exp: 1023460992.0554459
urbanization: 219956422.90306938
inflation: -12140433.397974197
energy_use: -4739031.9280726
gdp_per_cap: 2469212.5586326765
ff_consumption: -1.288654284318909
unemployment: -0.3147877766750753
re_consumption: 0.22062200639629737


In [9]:
# Make predictions on the test data set
y_pred = lr_model.predict(X_test)

# Calculate the accuracy of the model
score = lr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 14.48%
R2 Score: 14.48%


### Linear Regression model + scaling on our data

In [5]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name", "energy_use", "unemployment", "ff_consumption", "literacy_adult", "inflation", "literacy_youth"], axis=1)
y = world_df["gdp_current"]

In [6]:
# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [8]:
# Fit and transform the X_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the X_test data using scaler object
X_test_scaled = scaler.transform(X_test)

In [9]:
# Create the linear regression model
lr_model_new = LinearRegression()

# Fit the model on scaled data
lr_model_new.fit(X_train_scaled, y_train)

LinearRegression()

In [10]:
# Determine feature coefficients
coefficients = lr_model_new.coef_

# Sort the coefficients in descending absolute value order
sorted_index = np.abs(coefficients).argsort()[::-1]

# Print the feature names and their coefficients
print("Feature Coefficients:")
for index in sorted_index:
    print("{}: {}".format(X.columns[index], coefficients[index]))

Feature Coefficients:
gdp_per_cap: 12220186866.124237
life_exp: 9380498644.570835
Year: 4163543087.5723524
re_consumption: 3492158393.25698
urbanization: 2215601202.8081536
health_exp: 545561617.2488182


In [11]:
# Make predictions on the test data set
y_pred = lr_model_new.predict(X_test_scaled)

# Calculate the accuracy of the model
score = lr_model_new.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 18.69%
R2 Score: 18.69%


### Random Forest Regressor model

In [28]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name", "Year", "gdp_per_cap"], axis=1)
y = world_df["gdp_current"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [29]:
# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [30]:
# Determine feature importances
importances = rfr_model.feature_importances_

# Sort the feature importances in descending order (highest to lowest)
sorted_index = importances.argsort()[::-1]

# Print the feature names and their importances
print("Feature Importances:")
for index in sorted_index:
    print("{}: {}".format(X.columns[index], importances[index]))

Feature Importances:
energy_use: 0.22651587302810583
life_exp: 0.18544050817426436
ff_consumption: 0.1643510410187316
urbanization: 0.1067623753604672
unemployment: 0.09342312489993419
health_exp: 0.07382784244118594
re_consumption: 0.059567585637767
inflation: 0.04066362058052334
literacy_adult: 0.026008230274815743
literacy_youth: 0.02343979858420479


In [31]:
# Make predictions on the test data set
y_pred = rfr_model.predict(X_test)

# Calculate the accuracy of the RandomForestRegressor model
score = rfr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 79.95%
R2 Score: 79.95%


### Random Forest Regressor + Scaling data

In [21]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name", "gdp_per_cap"], axis=1)
y = world_df["gdp_current"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the X_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the X_test data using scaler object
X_test_scaled = scaler.transform(X_test)

In [23]:
# Create the random forest regressor model
rfr_model_scale = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model
rfr_model_scale.fit(X_train_scaled, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [24]:
# Determine feature importances
importances = rfr_model_scale.feature_importances_

# Sort the feature importances in descending order (highest to lowest)
sorted_index = importances.argsort()[::-1]

# Print the feature names and their importances
print("Feature Importances:")
for index in sorted_index:
    print("{}: {}".format(X.columns[index], importances[index]))

Feature Importances:
energy_use: 0.31642689435764065
urbanization: 0.2149266443246443
life_exp: 0.20577854606981946
inflation: 0.08429713687972344
Year: 0.06064697803882027
unemployment: 0.03284434878258192
literacy_adult: 0.025629960882013823
health_exp: 0.02082098388266322
literacy_youth: 0.02050243098540341
re_consumption: 0.017151803082075177
ff_consumption: 0.0009742727146143504


In [25]:
# Make predictions on the test data set
y_pred = rfr_model.predict(X_test_scaled)

# Calculate the accuracy of the RandomForestRegressor model
score = rfr_model.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -19.0%
R2 Score: -19.0%


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


### Support Vector Regressor model

In [26]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name", "gdp_per_cap"], axis=1)
y = world_df["gdp_current"]

In [27]:
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [28]:
# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [29]:
# Create the SVR model
SVR_model = SVR(kernel='linear')

# Fir the model on scaled data
SVR_model.fit(X_train, y_train)

SVR(kernel='linear')

One of the ways to measure feature importances of SVR model is Recursive Feature Elimination (RFE).
RFE selects the most important features recursively and the contribution of each feature to overall accuracy of the model is measured.

In [31]:
from sklearn.feature_selection import RFE

# Use RFE to select the best features
rfe = RFE(SVR_model, n_features_to_select=1)
rfe.fit(X_train, y_train)

# Convert the ranking_ attribute to a pandas dataframe
importance_df = pd.DataFrame({"feature": X.columns, "importance": rfe.ranking_})

# Sort the dataframe in descending order of importance
sorted_importance_df = importance_df.sort_values(by="importance", ascending=False)

# Print the sorted feature importances
print("Sorted Feature Importances:")
for index, row in sorted_importance_df.iterrows():
    print("{}: {}".format(row["feature"], row["importance"]))

# Evaluate the model on the test set
test_score = rfe.score(X_test, y_test)

print("Test Score: {}".format(test_score))

Sorted Feature Importances:
ff_consumption: 11
re_consumption: 10
health_exp: 9
inflation: 8
literacy_adult: 7
literacy_youth: 6
energy_use: 5
unemployment: 4
Year: 3
urbanization: 2
life_exp: 1
Test Score: -0.226061256387301


The test score tells us the accuracy at which rfe was able to predict the most important features during feature selection.

Due to its accuracy being negative, I can conclude that the rfe model was not able to select the most important features. 

In [32]:
# Make predictions
y_pred = SVR_model.predict(X_test)

# Calculate the accuracy of this SVR model
test_accuracy = SVR_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -22.61%
R2 Score: -22.61%


kernal = poly

In [33]:
# Create the SVR model
SVR_model_poly = SVR(kernel='poly')

# Fir the model on scaled data
SVR_model_poly.fit(X_train, y_train)

SVR(kernel='poly')

In [34]:
# Make predictions
y_pred = SVR_model_poly.predict(X_test)

# Calculate the accuracy of this SVR model
test_accuracy = SVR_model_poly.score(X_test, y_test)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -22.61%
R2 Score: -22.61%


kernal = rbf

In [35]:
# Create the SVR model
SVR_model_poly = SVR(kernel='rbf')

# Fir the model on scaled data
SVR_model_poly.fit(X_train, y_train)

SVR()

In [36]:
# Make predictions
y_pred = SVR_model_poly.predict(X_test)

# Calculate the accuracy of this SVR model
test_accuracy = SVR_model_poly.score(X_train, y_train)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -21.43%
R2 Score: -22.61%


Note: Despite changing the kernal function, all three different kernal functions give us the same model accuracy.

Potential reasoning behind negative accuracy value:
- There might still be extreme outliars in target values.

Potential solve for the negative accuracy value:
- Using different evaluation metrics such as instead of using R2 square, and accuracy score of model, we can use MSE (Mean Squared Error) which measure for us the average of squared differences between predicted and actual values. Lower MSE provides more accuracte prediction.

In [37]:
# Calculating the MSE of above SVR model
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)

Mean Squared Error: 3.388634203902444e+21
