In [None]:
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the Stata file and keep only rows with no missing values in any column.
# Done as a single chained expression so re-running the cell always starts
# from the file on disk.
lab3_2 = pd.read_stata('lab3_2.dta').dropna()

In [None]:
# Baseline OLS: regress lngdp2 on the six predictors va, rl, rq, gove, ps, cc.
ols_model = smf.ols(
    formula="lngdp2 ~ va + rl + rq + gove + ps + cc",
    data=lab3_2,
)
m1 = ols_model.fit()

# Plain-text regression table (coefficients, standard errors, fit statistics).
print(m1.summary())

In [None]:
# Pairwise correlation matrix of the six regressors, rounded to 3 decimals,
# as a first look at how strongly they move together.
predictor_cols = ["va", "rl", "rq", "gove", "ps", "cc"]
lab3_2[predictor_cols].corr().round(3)

In [None]:
# Design matrix for the VIF computation: the six regressors plus the constant
# column that add_constant prepends by default.
X = add_constant(lab3_2[["va", "rl", "rq", "gove", "ps", "cc"]])

# One VIF per regressor; column 0 is skipped, so indices run from 1 upward.
vif_values = [
    variance_inflation_factor(X.values, col_idx)
    for col_idx in range(1, len(X.columns))
]
vif_data = pd.DataFrame({'variables': X.columns[1:], 'VIF': vif_values})
print(vif_data)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
train, test = train_test_split(
    lab3_2,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Ridge regression with penalty alpha = 1, fitted on the training split.
# Ridge.fit returns the estimator itself, so construction and fitting chain.
ridge1 = Ridge(alpha=1).fit(
    train[["va", "rl", "rq", "gove", "ps", "cc"]],
    train["lngdp2"],
)

# Fitted coefficients, in the same order as the feature columns above.
ridge1.coef_

In [None]:
# Predict lngdp2 for the held-out rows with the alpha = 1 ridge model.
X_test = test[["va", "rl", "rq", "gove", "ps", "cc"]]
y_pred = ridge1.predict(X_test)

# Mean squared error on the test split.
mean_squared_error(y_true=test["lngdp2"], y_pred=y_pred)

In [None]:
# R^2 of the ridge1 predictions on the test split.
r2_score(y_true=test["lngdp2"], y_pred=y_pred)

In [None]:
# Candidate ridge penalties spanning several orders of magnitude.
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# 5-fold cross-validated search over alpha, run on the training split only.
param_grid = {'alpha': alphas}
grid_search = GridSearchCV(estimator=ridge1, param_grid=param_grid, cv=5)
grid_search.fit(
    train[["va", "rl", "rq", "gove", "ps", "cc"]],
    train["lngdp2"],
)

print("Best Regularization Parameter:", grid_search.best_params_)

In [None]:
# Refit ridge with the penalty chosen by the cross-validated grid search.
# Reading it from `grid_search.best_params_` (rather than a hand-copied
# constant) keeps this cell in sync if the data, split, or alpha grid changes.
best_alpha = grid_search.best_params_["alpha"]
ridge2 = Ridge(alpha = best_alpha)
ridge2.fit(train[["va", "rl", "rq", "gove", "ps", "cc"]], train["lngdp2"])

# Fitted coefficients, in the same order as the feature columns above.
ridge2.coef_

In [None]:
# Predict lngdp2 for the held-out rows with the second ridge model.
X_test = test[["va", "rl", "rq", "gove", "ps", "cc"]]
y_pred2 = ridge2.predict(X_test)

# Mean squared error on the test split.
mean_squared_error(y_true=test["lngdp2"], y_pred=y_pred2)

In [None]:
# R^2 of the ridge2 predictions on the test split.
r2_score(y_true=test["lngdp2"], y_pred=y_pred2)