In [1]:
# Import pandas and numpy
import pandas as pd 
import numpy as np

# Import for splitting data into train & test for the ML models
from sklearn.model_selection import train_test_split

# Import Machine Learning Linear regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Import standard scaler for scaling the data
from sklearn.preprocessing import StandardScaler

# Import metrics to calculate accuracy of models
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the cleaned country data CSV file
df = pd.read_csv("merged_countries.csv", encoding ="unicode_escape")
df.head()

Unnamed: 0,country_name,Year,urbanization,gdp_per_cap,inflation,gdp_current,unemployment,literacy_adult,health_exp,literacy_youth,life_exp,re_consumption,ff_consumption,energy_use
0,Canada,1971,76.09,4520.162878,2.704918,99271960000.0,6.4,3042833000.0,3042833000.0,3042833000.0,73.029268,3042833000.0,84.15415,6436.226256
1,Algeria,1971,39.665,359.824582,2.626642,5077222000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,43.67,3042833000.0,98.99982,245.527602
2,Australia,1971,84.16,3494.97331,6.138107,45214470000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,71.068293,3042833000.0,91.17778,3989.629744
3,Austria,1971,65.301,2380.978458,4.704258,17858490000.0,1.5,3042833000.0,3042833000.0,3042833000.0,70.114634,3042833000.0,90.19969,2508.520919
4,Belgium,1971,93.976,3082.927989,4.343629,29821660000.0,2.1,3042833000.0,3042833000.0,3042833000.0,71.060488,3042833000.0,3042833000.0,4099.95532


In [3]:
# Checking datatypes of each columns
df.dtypes

country_name       object
Year                int64
urbanization      float64
gdp_per_cap       float64
inflation         float64
gdp_current       float64
unemployment      float64
literacy_adult    float64
health_exp        float64
literacy_youth    float64
life_exp          float64
re_consumption    float64
ff_consumption    float64
energy_use        float64
dtype: object

In [4]:
world_df = df.copy()
world_df.head()

Unnamed: 0,country_name,Year,urbanization,gdp_per_cap,inflation,gdp_current,unemployment,literacy_adult,health_exp,literacy_youth,life_exp,re_consumption,ff_consumption,energy_use
0,Canada,1971,76.09,4520.162878,2.704918,99271960000.0,6.4,3042833000.0,3042833000.0,3042833000.0,73.029268,3042833000.0,84.15415,6436.226256
1,Algeria,1971,39.665,359.824582,2.626642,5077222000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,43.67,3042833000.0,98.99982,245.527602
2,Australia,1971,84.16,3494.97331,6.138107,45214470000.0,3042833000.0,3042833000.0,3042833000.0,3042833000.0,71.068293,3042833000.0,91.17778,3989.629744
3,Austria,1971,65.301,2380.978458,4.704258,17858490000.0,1.5,3042833000.0,3042833000.0,3042833000.0,70.114634,3042833000.0,90.19969,2508.520919
4,Belgium,1971,93.976,3082.927989,4.343629,29821660000.0,2.1,3042833000.0,3042833000.0,3042833000.0,71.060488,3042833000.0,3042833000.0,4099.95532


### Linear Regression model

In [5]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name"], axis=1)
y = world_df["gdp_current"]

In [6]:
# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
# Create the linear regression model
lr_model = LinearRegression()

# Fit the model 
lr_model.fit(X_train, y_train)

LinearRegression()

In [8]:
# Make predictions on the test data set
y_pred = lr_model.predict(X_test)

# Calculate the accuracy of the model
score = lr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 22.72%
R2 Score: 22.72%


### Linear Regression model + scaling on our data

In [9]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name"], axis=1)
y = world_df["gdp_current"]

In [10]:
# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [12]:
# Fit and transform the X_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the X_test data using scaler object
X_test_scaled = scaler.transform(X_test)

In [13]:
# Create the linear regression model
lr_model_new = LinearRegression()

# Fit the model on scaled data
lr_model_new.fit(X_train_scaled, y_train)

LinearRegression()

In [14]:
# Make predictions on the test data set
y_pred = lr_model_new.predict(X_test_scaled)

# Calculate the accuracy of the model
score = lr_model_new.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 25.01%
R2 Score: 25.01%


### Random Forest Regressor model

In [15]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name", "gdp_per_cap"], axis=1)
y = world_df["gdp_current"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the model
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [17]:
# Make predictions on the test data set
y_pred = rfr_model.predict(X_test)

# Calculate the accuracy of the RandomForestRegressor model
score = rfr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 84.36%
R2 Score: 84.36%


### Support Vector Regressor model

In [18]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_current", "country_name", "gdp_per_cap"], axis=1)
y = world_df["gdp_current"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the X_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the X_test data using scaler object
X_test_scaled = scaler.transform(X_test)

In [20]:
# Create the SVR model
SVR_model = SVR(kernel='linear')

# Fir the model on scaled data
SVR_model.fit(X_train_scaled, y_train)

SVR(kernel='linear')

In [21]:
# Make predictions
y_pred = SVR_model.predict(X_test_scaled)

# Calculate the accuracy of this SVR model
test_accuracy = SVR_model.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -26.84%
R2 Score: -26.84%


kernal = poly

In [22]:
# Create the SVR model
SVR_model_poly = SVR(kernel='poly')

# Fir the model on scaled data
SVR_model_poly.fit(X_train_scaled, y_train)

SVR(kernel='poly')

In [23]:
# Make predictions
y_pred = SVR_model_poly.predict(X_test_scaled)

# Calculate the accuracy of this SVR model
test_accuracy = SVR_model_poly.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -26.84%
R2 Score: -26.84%


kernal = rbf

In [24]:
# Create the SVR model
SVR_model_poly = SVR(kernel='rbf')

# Fir the model on scaled data
SVR_model_poly.fit(X_train_scaled, y_train)

SVR()

In [25]:
# Make predictions
y_pred = SVR_model_poly.predict(X_test_scaled)

# Calculate the accuracy of this SVR model
test_accuracy = SVR_model_poly.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -26.84%
R2 Score: -26.84%
