In [1]:
# Import pandas and numpy
import pandas as pd 
import numpy as np

# Import for splitting data into train & test for the ML models
from sklearn.model_selection import train_test_split

# Import Machine Learning Linear regression models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Import standard scaler for scaling the data
from sklearn.preprocessing import StandardScaler

# Import metrics to calculate accuracy of models
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the cleaned country data CSV file
df = pd.read_csv("cleaned_country_data.csv", encoding ="unicode_escape")
df.head()

Unnamed: 0,country_name,Year,gdp_per_cap,gdp_current,inflation,unemployment,cpi,health_exp,gdp_growth
0,Canada,1964,2555.111146,49377520000.0,1.912145,3460337000.0,14.109903,3460337000.0,6.642894
1,Australia,1964,2131.277657,23799980000.0,2.866242,3460337000.0,8.402706,3460337000.0,6.980061
2,Austria,1964,1269.412583,9169984000.0,3.868564,3460337000.0,20.551485,3460337000.0,6.124354
3,Belgium,1964,1701.846276,15960110000.0,4.168761,3460337000.0,16.995363,3460337000.0,6.956685
4,Bolivia,1964,133.886394,539491500.0,10.181818,3460337000.0,2.8e-05,3460337000.0,3.956597


In [3]:
# Viewing the size of the data (rows, columns)
print(df.shape)

# Checking datatypes of each columns
df.dtypes

(5446, 9)


country_name     object
Year              int64
gdp_per_cap     float64
gdp_current     float64
inflation       float64
unemployment    float64
cpi             float64
health_exp      float64
gdp_growth      float64
dtype: object

## Linear Regression

### Linear Regression on Canada's dataset

In [4]:
# Displaying Canada's data
Canada_df = df.loc[df.country_name == "Canada"]
Canada_df = Canada_df.reset_index(drop=True)
Canada_df.head(10)

Unnamed: 0,country_name,Year,gdp_per_cap,gdp_current,inflation,unemployment,cpi,health_exp,gdp_growth
0,Canada,1964,2555.111146,49377520000.0,1.912145,3460337000.0,14.109903,3460337000.0,6.642894
1,Canada,1965,2770.361804,54515180000.0,2.332657,3577787000.0,14.439038,3577787000.0,6.316714
2,Canada,1966,3047.106147,61088380000.0,3.815659,3744663000.0,14.989983,3744663000.0,6.704992
3,Canada,1967,3217.159294,65668660000.0,3.579952,3878820000.0,15.526617,3878820000.0,3.091206
4,Canada,1968,3462.678872,71829810000.0,4.0553,4144370000.0,16.156268,4144370000.0,4.995667
5,Canada,1969,3763.953379,79148410000.0,4.561559,4.7,16.893246,4422345000.0,5.045456


In [5]:
# Setting features dataframe and target vector
X = Canada_df.drop(["gdp_growth", "country_name"], axis=1)
y = Canada_df["gdp_growth"]

# Split the data into a training and test set. Only testing 20% of the data and 80% for training. 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
# Create the linear regression model
lr_model = LinearRegression()

# Fit the model 
lr_model.fit(X_train, y_train)

LinearRegression()

In [7]:
# Make predictions on the test data set
y_pred = lr_model.predict(X_test)

# Calculate the accuracy of the model
score = lr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: -5783.84%
R2 Score: -5783.84%


### Linear Regression on all countries' dataset

In [10]:
world_df = df.copy()
world_df.head()

Unnamed: 0,country_name,Year,gdp_per_cap,gdp_current,inflation,unemployment,cpi,health_exp,gdp_growth
0,Canada,1964,2555.111146,49377520000.0,1.912145,3460337000.0,14.109903,3460337000.0,6.642894
1,Australia,1964,2131.277657,23799980000.0,2.866242,3460337000.0,8.402706,3460337000.0,6.980061
2,Austria,1964,1269.412583,9169984000.0,3.868564,3460337000.0,20.551485,3460337000.0,6.124354
3,Belgium,1964,1701.846276,15960110000.0,4.168761,3460337000.0,16.995363,3460337000.0,6.956685
4,Bolivia,1964,133.886394,539491500.0,10.181818,3460337000.0,2.8e-05,3460337000.0,3.956597


In [11]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_growth", "country_name"], axis=1)
y = world_df["gdp_growth"]

In [12]:
# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
# Create the linear regression model
lr_model = LinearRegression()

# Fit the model 
lr_model.fit(X_train, y_train)

LinearRegression()

In [14]:
# Make predictions on the test data set
y_pred = lr_model.predict(X_test)

# Calculate the accuracy of the model
score = lr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 2.07%
R2 Score: 2.07%


### Linear Regression on all countries' dataset + scaling data

In [15]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_growth", "country_name"], axis=1)
y = world_df["gdp_growth"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [16]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [17]:
# Fit and transform the X_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the X_test data using scaler object
X_test_scaled = scaler.transform(X_test)

In [18]:
# Create the linear regression model
lr_model = LinearRegression()

# Fit the model on scaled data
lr_model.fit(X_train_scaled, y_train)

LinearRegression()

In [19]:
# Make predictions on the test data set
y_pred = lr_model.predict(X_test_scaled)

# Calculate the accuracy of the model
score = lr_model.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 2.03%
R2 Score: 2.03%


## Random Forest Regressor

### Random Forest Regressor on Costa Rica's data

In [24]:
# Displaying Costa Rica's data
CostaRica_df = df.loc[df.country_name == "Costa Rica"]
CostaRica_df = CostaRica_df.reset_index(drop=True)
CostaRica_df

Unnamed: 0,country_name,Year,gdp_per_cap,gdp_current,inflation,unemployment,cpi,health_exp,gdp_growth
0,Costa Rica,1964,349.901344,542578400.0,3.324033,3460337000.0,0.27872,3460337000.0,3.659048
1,Costa Rica,1965,369.98056,592981200.0,-0.665,3577787000.0,0.276867,3577787000.0,8.190528
2,Costa Rica,1966,391.211288,647305600.0,0.182883,3744663000.0,0.277373,3744663000.0,6.969319
3,Costa Rica,1967,409.983118,699456600.0,1.20834,3878820000.0,0.280725,3878820000.0,5.650126
4,Costa Rica,1968,440.600669,773841500.0,4.093891,4144370000.0,0.292217,4144370000.0,8.473304
5,Costa Rica,1969,472.671832,853630200.0,2.629341,4422345000.0,0.299901,4422345000.0,5.4918
6,Costa Rica,1970,530.706338,984830200.0,4.652297,4995407000.0,0.313853,4995407000.0,7.503134
7,Costa Rica,1971,565.290379,1077153000.0,3.083048,5278676000.0,0.323529,5278676000.0,6.778505
8,Costa Rica,1972,633.19966,1238252000.0,4.601126,6018426000.0,0.338415,6018426000.0,8.178045
9,Costa Rica,1973,762.077743,1528916000.0,15.214034,7262812000.0,0.389902,7262812000.0,7.708916


In [25]:
# Setting features dataframe and target vector
X = CostaRica_df.drop(["gdp_growth", "country_name"], axis=1)
y = CostaRica_df["gdp_growth"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
rfr_model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [27]:
# Make predictions on the test data set
y_pred = rfr_model.predict(X_test)

# Calculate the accuracy of the RandomForestRegressor model
score = rfr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 19.73%
R2 Score: 19.73%


### Random Forest Regressor on all countries' dataset

In [28]:
world_df.head()

Unnamed: 0,country_name,Year,gdp_per_cap,gdp_current,inflation,unemployment,cpi,health_exp,gdp_growth
0,Canada,1964,2555.111146,49377520000.0,1.912145,3460337000.0,14.109903,3460337000.0,6.642894
1,Australia,1964,2131.277657,23799980000.0,2.866242,3460337000.0,8.402706,3460337000.0,6.980061
2,Austria,1964,1269.412583,9169984000.0,3.868564,3460337000.0,20.551485,3460337000.0,6.124354
3,Belgium,1964,1701.846276,15960110000.0,4.168761,3460337000.0,16.995363,3460337000.0,6.956685
4,Bolivia,1964,133.886394,539491500.0,10.181818,3460337000.0,2.8e-05,3460337000.0,3.956597


In [29]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_growth", "country_name"], axis=1)
y = world_df["gdp_growth"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
rfr_model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [31]:
# Make predictions on the test data set
y_pred = rfr_model.predict(X_test)

# Calculate the accuracy of the RandomForestRegressor model
score = rfr_model.score(X_test, y_test)
print(f"Test accuracy: {np.round(score * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 15.51%
R2 Score: 15.51%


### Random Forest Regressor on all countries' dataset + scaling data

In [38]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_growth", "country_name"], axis=1)
y = world_df["gdp_growth"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [39]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [40]:
# Fit and transform the X_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the X_test data using scaler object
X_test_scaled = scaler.transform(X_test)

In [41]:
# Create the random forest regressor model
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on scaled data
rfr_model.fit(X_train_scaled, y_train)

RandomForestRegressor(random_state=42)

In [43]:
# Make predictions on the test data set
y_pred = rfr_model.predict(X_train_scaled)

# Calculate the accuracy of this RandomForestRegressor model
test_accuracy = rfr_model.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

Test accuracy: 16.76%


### Support Vector Regressor model on all countries' dataset + scaling

In [44]:
# Setting features dataframe and target vector
X = world_df.drop(["gdp_growth", "country_name"], axis=1)
y = world_df["gdp_growth"]

# Split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
# Initialize the StandardScaler
scaler = StandardScaler()

In [46]:
# Fit and transform the X_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the X_test data using scaler object
X_test_scaled = scaler.transform(X_test)

In [47]:
# Create the SVR model
SVR_model = SVR(kernel='linear')

# Fir the model on scaled data
SVR_model.fit(X_train_scaled, y_train)

SVR(kernel='linear')

In [48]:
# Make predictions
y_pred = SVR_model.predict(X_test_scaled)

# Calculate the accuracy of this SVR model
test_accuracy = SVR_model.score(X_test_scaled, y_test)
print(f"Test accuracy: {np.round(test_accuracy * 100, 2)}%")

# Calculate the r2_score
r2_score_ = r2_score(y_test,y_pred)
print(f"R2 Score: {np.round(r2_score_ * 100, 2)}%")

Test accuracy: 1.75%
R2 Score: 1.75%


In [49]:
# Calculate the mean squared error (mse)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error:", mse)

Mean Squared Error: 14.058177922223226


Lower this number, the better. In our case, its very high. 