In [332]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from math import sqrt
from math import pow
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Silence every warning category so cell outputs stay clean and concise
warnings.simplefilter('ignore')
# Shared seed, meant to be passed as `random_state=` wherever randomness is involved
random_state = 0

In [None]:
# Load the GFCF table, parse the 'Year' column (format '%Y') into datetimes
# and use it as the index, all in one chain so no intermediate frame is mutated.
gfcf = (
    pd.read_csv('Transformed Data/GFCF.csv')
    .assign(Year=lambda frame: pd.to_datetime(frame['Year'], format='%Y'))
    .set_index('Year')
)

In [None]:
def arima_model(df, order):
    """Fit an ARIMA model of the given order to a series.

    Parameters
    ----------
    df : array-like
        Observations handed to ``ARIMA`` as the series to model.
    order : tuple
        ``(p, d, q)`` order forwarded to ``ARIMA``.

    Returns
    -------
    The fitted results object returned by ``ARIMA(...).fit()``.
    """
    model = ARIMA(df, order=order)
    # `statsmodels.tsa.arima.model.ARIMA.fit` does not accept `disp`; passing it
    # raised a TypeError on every call, so the model is fitted with the defaults.
    model_fit = model.fit()
    return model_fit

In [None]:
def arima_grid_search(df, p_values, d_values, q_values):
    """Try every (p, d, q) combination and keep the one with the lowest RMSE.

    The RMSE is computed in-sample, between ``df`` and the fitted values of the
    model returned by ``arima_model``.

    Parameters
    ----------
    df : array-like
        Series to fit.
    p_values, d_values, q_values : iterable
        Candidate values for each component of the ARIMA order.

    Returns
    -------
    tuple
        ``(best_score, best_order)``. When no candidate could be fitted and
        scored this is ``(float("inf"), None)``.
    """
    best_score, best_order = float("inf"), None
    attempts, last_error = 0, None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                attempts += 1
                try:
                    model = arima_model(df, order)
                    rmse = sqrt(mean_squared_error(df, model.fittedvalues))
                except Exception as exc:
                    # Best effort: skip orders that cannot be fitted or scored.
                    # `Exception` (not a bare except) lets KeyboardInterrupt
                    # through, so a long search can still be interrupted.
                    last_error = exc
                    continue
                if rmse < best_score:
                    best_score, best_order = rmse, order
    if best_order is None and last_error is not None:
        # Surface the reason instead of silently reporting "no order found".
        print(f'No ARIMA order could be fitted ({attempts} tried); last error: {last_error!r}')
    return best_score, best_order

In [None]:
def preprocess_data(df):
    """Return the element-wise natural logarithm of ``df``, computed with ``np.log``."""
    log_values = np.log(df)
    return log_values

In [None]:
# Candidate values (0 to 3) for each component of the ARIMA order
p_values = list(range(4))
d_values = list(range(4))
q_values = list(range(4))

# Perform grid search for each country
countries = gfcf['Country'].unique()
for country in countries:
    print(f'Country: {country}')
    # Select this country's 'Construction Value' series and log-transform it
    country_mask = gfcf['Country'] == country
    df_country = preprocess_data(gfcf.loc[country_mask, 'Construction Value'])
    print(f'Datapoints:{len(df_country)}')
    best_score, best_order = arima_grid_search(df_country, p_values, d_values, q_values)
    print(f'Best ARIMA{best_order} RMSE={best_score}\n')

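ARIMA does not seem to be suitable as a prediction model: the grid search does not find good values for p, d and q. This could be due to the small data size, a lack of stationarity, etc. Note that the grid search skips any order whose fit raises an error, so a best order of `None` means that no candidate order could be fitted at all.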

In [None]:
# Load the GFCF-ML table with 'Year' parsed (format '%Y') into datetimes
gfcf_gdp = pd.read_csv('Transformed Data/GFCF-ML.csv').assign(
    Year=lambda frame: pd.to_datetime(frame['Year'], format='%Y')
)

# Keep only the rows that have no missing value in any column
data = gfcf_gdp.dropna(how='any', axis=0)

data.head()


In [None]:
# Distinct country labels, and a mapping from each label to that country's rows
countries = data['Country'].unique()
country_data = {name: data.loc[data['Country'] == name] for name in countries}

In [None]:
# Fit one linear regression per country (year -> 'Percent GDP') and average
# the held-out errors over all countries.
model_lr = LinearRegression()
rmse_scores = 0  # running sum of per-country RMSE
mse_scores = 0   # running sum of per-country MSE

for country in countries:
    # Convert datetime to year (integer)
    X = pd.DataFrame(country_data[country]['Year'].dt.year)
    y = country_data[country]['Percent GDP']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model_lr.fit(X_train, y_train)
    predictions = model_lr.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mse_scores += mse
    rmse_scores += sqrt(mse)


print(f'Average RMSE : {rmse_scores/len(countries)}')
# The mean of the per-country MSEs; squaring the average RMSE (as before) is
# not the same quantity and under-reports the error.
print(f'Average MSE : {mse_scores/len(countries)}')


In [None]:
# Convert the datetime 'Year' column to an integer year. The guard keeps the
# cell re-runnable: once the column is integer, `.dt` would raise AttributeError.
if pd.api.types.is_datetime64_any_dtype(data['Year']):
    data['Year'] = data['Year'].dt.year

X = data[['Year', 'Country']]
y = data['Percent GDP']

# Convert 'Country' to numerical value
X = pd.get_dummies(X)

# Define models and hyper parameters.
# The tree-based estimators are randomised; seeding them with the notebook-wide
# `random_state` makes the reported scores reproducible between runs.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Decision Tree': (
        DecisionTreeRegressor(random_state=random_state),
        {'max_depth': [None, 5, 10]},
    ),
    'Random Forest': (
        RandomForestRegressor(random_state=random_state),
        {'n_estimators': [100, 200, 300]},
    ),
    'Gradient Boosting': (
        GradientBoostingRegressor(random_state=random_state),
        {'learning_rate': [0.1, 0.05, 0.01]},
    ),
}

# Perform GridSearchCV for each model and print the best parameters and score
for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error',error_score='raise')
    grid_search.fit(X, y)

    print(f"Best parameters for {name}: {grid_search.best_params_}")
    # Scoring is negated MSE, so flip the sign to report a plain MSE
    print(f"Best score for {name}: {-grid_search.best_score_}\n")