## Import Packages

In [93]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_squared_error

# Exercise 1 & 2

## Import Data

In [133]:
df = pd.read_csv("../raw/growthdata92_02.csv",index_col=0)
x = df.drop(['iso3','ln_y','growth'], axis=1)
y1 = df['ln_y']
y2 = df['growth']
df_0211 = pd.read_csv("../raw/growthdata02_11.csv",index_col=0)
x_0211 = df_0211.drop(['iso3','ln_y','growth'], axis=1)
y1_0211 = df_0211['ln_y']
y2_0211 = df_0211['growth']

# get train set and test set
x1_train,x1_test,y1_train,y1_test = train_test_split(x,y1,test_size = 0.2)
x2_train,x2_test,y2_train,y2_test = train_test_split(x,y2,test_size = 0.2)

## Subset Selection

In [134]:
reg = linear_model.LinearRegression()

# ln_y
sfs = SequentialFeatureSelector(reg,k_features = (2,x1_train.shape[1]),forward=True, floating=False, cv=10
                               # ,verbose=2         # show the process
                               )
sfs.fit(x1_train,y1_train)
selected_features = list(sfs.k_feature_names_)
print(sfs.k_feature_names_)
print(sfs.k_score_)

# Ex1
x1_selected = x1_train[selected_features]
reg.fit(x1_selected,y1_train)
mean = pd.DataFrame(np.mean(x1_selected)).transpose()
naive_y_pre = reg.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y1_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = reg.predict(x[selected_features])
sink_rmse = np.sqrt(mean_squared_error(y1, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(sink_rmse))
test_y_pre = reg.predict(x1_test[selected_features])
test_rmse = np.sqrt(mean_squared_error(y1_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

# Ex2
test_y_0211_pre = reg.predict(x_0211[selected_features])
test_rmse_0211 = np.sqrt(mean_squared_error(y1_0211, test_y_0211_pre))
print('Performance on 02-11 dataset: ' + str(test_rmse_0211))

('gvmnt_c', 'ext_bal', 'inf_mort', 'age_dep_young', 'urban', 'parliamentary', 'presidential', 'voice', 'effectiveness')
0.8391666435030196
MAE for naive predictor: 0.0
RMSE for kitchen sink predictor: 0.3197719013920757
Performance on test set: 0.3688318516459661
Performance on 02-11 dataset: 0.36448672851176045


In [135]:
sfs.fit(x2_train,y2_train)
print(sfs.k_feature_names_)
print(sfs.k_score_)

# Ex1
x2_selected = x2_train[selected_features]
reg.fit(x2_selected,y2_train)
mean = pd.DataFrame(np.mean(x2_selected)).transpose()
naive_y_pre = reg.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y2_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = reg.predict(x[selected_features])
rmse = np.sqrt(mean_squared_error(y2, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
test_y_pre = reg.predict(x2_test[selected_features])
test_rmse = np.sqrt(mean_squared_error(y2_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

# Ex2
test_y_0211_pre = reg.predict(x_0211[selected_features])
test_rmse_0211 = np.sqrt(mean_squared_error(y2_0211, test_y_0211_pre))
print('Performance on 02-11 dataset: ' + str(test_rmse_0211))

('inflation', 'regulation')
0.23406080537112647
MAE for naive predictor: 6.938893903907228e-18
RMSE for kitchen sink predictor: 0.023016278099988886
Performance on test set: 0.024087599661712887
Performance on 02-11 dataset: 0.041263384225615665


## Ridge Regression

In [1]:
alphas = np.logspace(0, 3, 100) 
ridge_scores = []

for alpha in alphas:
    ridge = linear_model.Ridge(alpha=alpha)
    scores = cross_val_score(ridge, x1_train, y1_train, cv=10)
    ridge_scores.append(np.mean(scores))
best_alpha_index = np.argmax(ridge_scores)  
best_alpha = alphas[best_alpha_index]  
best_score = ridge_scores[best_alpha_index] 
print('Best score: ' + str(best_score))
print('Best alpha: ' + str(best_alpha))

# Ex1
ridge = linear_model.Ridge(alpha = best_alpha)
ridge.fit(x1_train, y1_train)
mean = pd.DataFrame(np.mean(x1_train)).transpose()
naive_y_pre = ridge.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y1_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = ridge.predict(x)
rmse = np.sqrt(mean_squared_error(y1, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
test_y_pre = ridge.predict(x1_test)
test_rmse = np.sqrt(mean_squared_error(y1_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

# Ex2
test_y_0211_pre = ridge.predict(x_0211)
test_rmse_0211 = np.sqrt(mean_squared_error(y1_0211, test_y_0211_pre))
print('Performance on 02-11 dataset: ' + str(test_rmse_0211))

NameError: name 'linear_model' is not defined

In [137]:
ridge_scores = []
for alpha in alphas:
    ridge = linear_model.Ridge(alpha=alpha)
    scores = cross_val_score(ridge, x2_train, y2_train, cv=10)
    ridge_scores.append(np.mean(scores))

best_alpha_index = np.argmax(ridge_scores)  
best_alpha = alphas[best_alpha_index]  
best_score = ridge_scores[best_alpha_index] 
print('Best score: ' + str(best_score))
print('Best alpha: ' + str(best_alpha))

# Ex1
ridge = linear_model.Ridge(alpha = best_alpha)
ridge.fit(x2_train, y2_train)
mean = pd.DataFrame(np.mean(x2_train)).transpose()
naive_y_pre = ridge.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y2_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = ridge.predict(x)
rmse = np.sqrt(mean_squared_error(y2, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
test_y_pre = ridge.predict(x2_test)
test_rmse = np.sqrt(mean_squared_error(y2_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

# Ex2
test_y_0211_pre = ridge.predict(x_0211)
test_rmse_0211 = np.sqrt(mean_squared_error(y2_0211, test_y_0211_pre))
print('Performance on 02-11 dataset: ' + str(test_rmse_0211))

Best score: 0.14702797414347418
Best alpha: 200.9233002565048
MAE for naive predictor: 3.469446951953614e-18
RMSE for kitchen sink predictor: 0.02349818064266689
Performance on test set: 0.025098406461586974
Performance on 02-11 dataset: 0.031190275082630475


## Principal Components

In [181]:
# PCA
pca = PCA(copy = True)
pca_fit = pca.fit(x1_train)
x1_pca = pca.fit_transform(x1_train)
reg = linear_model.LinearRegression()
pcreg = reg.fit(x1_pca,y1_train)
print(reg.score(x1_pca,y1_train))

# Ex1
x_pca = pca_fit.transform(x)
sink_y_pre = pcreg.predict(x_pca)
rmse = np.sqrt(mean_squared_error(y1, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
x1_test_pca = pca_fit.transform(x1_test)
test_y_pre = pcreg.predict(x1_test_pca)
test_rmse = np.sqrt(mean_squared_error(y1_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

# Ex2
x_0211_pca = pca_fit.transform(x_0211)
test_y_0211_pre = pcreg.predict(x_0211_pca)
test_rmse_0211 = np.sqrt(mean_squared_error(y1_0211, test_y_0211_pre))
print('Performance on 02-11 dataset: ' + str(test_rmse_0211))

0.9128423223766344
RMSE for kitchen sink predictor: 0.31153937801432424
Performance on test set: 0.37905536112634947
Performance on 02-11 dataset: 0.35777856278611614


In [182]:
pca_fit = pca.fit(x2_train)
x2_pca = pca.fit_transform(x2_train)
reg = linear_model.LinearRegression()
pcreg = reg.fit(x2_pca,y2_train)
print(reg.score(x2_pca,y2_train))

# Ex1
x_pca = pca_fit.transform(x)
sink_y_pre = pcreg.predict(x_pca)
rmse = np.sqrt(mean_squared_error(y2, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
x2_test_pca = pca_fit.transform(x2_test)
test_y_pre = pcreg.predict(x2_test_pca)
test_rmse = np.sqrt(mean_squared_error(y2_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

# Ex2
x_0211_pca = pca_fit.transform(x_0211)
test_y_0211_pre = pcreg.predict(x_0211_pca)
test_rmse_0211 = np.sqrt(mean_squared_error(y2_0211, test_y_0211_pre))
print('Performance on 02-11 dataset: ' + str(test_rmse_0211))

0.5177739976408676
RMSE for kitchen sink predictor: 0.02110496580963439
Performance on test set: 0.024420443988734913
Performance on 02-11 dataset: 0.03728999626681858


# Exercise 3

In [184]:
df = pd.read_csv("../raw/growthdata02_11.csv",index_col=0)
x = df.drop(['iso3','ln_y','growth'], axis=1)
y1 = df['ln_y']
y2 = df['growth']

# get train set and test set
x1_train,x1_test,y1_train,y1_test = train_test_split(x,y1,test_size = 0.2)
x2_train,x2_test,y2_train,y2_test = train_test_split(x,y2,test_size = 0.2)

reg = linear_model.LinearRegression()

############################## Subset Selection ###################################
print ("====== Subset Selection ======")
print ("=== ln_y ===")
# ln_y
sfs = SequentialFeatureSelector(reg,k_features = (2,x1_train.shape[1]),forward=True, floating=False, cv=10
                               # ,verbose=2         # show the process
                               )
sfs.fit(x1_train,y1_train)
selected_features = list(sfs.k_feature_names_)
print(sfs.k_feature_names_)
print(sfs.k_score_)
x1_selected = x1_train[selected_features]
reg.fit(x1_selected,y1_train)
mean = pd.DataFrame(np.mean(x1_selected)).transpose()
naive_y_pre = reg.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y1_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = reg.predict(x[selected_features])
sink_rmse = np.sqrt(mean_squared_error(y1, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(sink_rmse))
test_y_pre = reg.predict(x1_test[selected_features])
test_rmse = np.sqrt(mean_squared_error(y1_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

print ("=== growth ===")
# growth
sfs.fit(x2_train,y2_train)
print(sfs.k_feature_names_)
print(sfs.k_score_)
x2_selected = x2_train[selected_features]
reg.fit(x2_selected,y2_train)
mean = pd.DataFrame(np.mean(x2_selected)).transpose()
naive_y_pre = reg.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y2_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = reg.predict(x[selected_features])
rmse = np.sqrt(mean_squared_error(y2, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
test_y_pre = reg.predict(x2_test[selected_features])
test_rmse = np.sqrt(mean_squared_error(y2_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

############################## Ridge Regression ###################################
print ("====== Ridge Regression ======")
print ("=== ln_y ===")
alphas = np.logspace(0, 3, 100) 
ridge_scores = []

for alpha in alphas:
    ridge = linear_model.Ridge(alpha=alpha)
    scores = cross_val_score(ridge, x1_train, y1_train, cv=10)
    ridge_scores.append(np.mean(scores))
best_alpha_index = np.argmax(ridge_scores)  
best_alpha = alphas[best_alpha_index]  
best_score = ridge_scores[best_alpha_index] 
print('Best score: ' + str(best_score))
print('Best alpha: ' + str(best_alpha))
ridge = linear_model.Ridge(alpha = best_alpha)
ridge.fit(x1_train, y1_train)
mean = pd.DataFrame(np.mean(x1_train)).transpose()
naive_y_pre = ridge.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y1_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = ridge.predict(x)
rmse = np.sqrt(mean_squared_error(y1, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
test_y_pre = ridge.predict(x1_test)
test_rmse = np.sqrt(mean_squared_error(y1_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

print ("=== growth ===")
ridge_scores = []
for alpha in alphas:
    ridge = linear_model.Ridge(alpha=alpha)
    scores = cross_val_score(ridge, x2_train, y2_train, cv=10)
    ridge_scores.append(np.mean(scores))
best_alpha_index = np.argmax(ridge_scores)  
best_alpha = alphas[best_alpha_index]  
best_score = ridge_scores[best_alpha_index] 
print('Best score: ' + str(best_score))
print('Best alpha: ' + str(best_alpha))
ridge = linear_model.Ridge(alpha = best_alpha)
ridge.fit(x2_train, y2_train)
mean = pd.DataFrame(np.mean(x2_train)).transpose()
naive_y_pre = ridge.predict(mean)
mae = abs(naive_y_pre[0] - np.mean(y2_train))
print('MAE for naive predictor: ' + str(mae))
sink_y_pre = ridge.predict(x)
rmse = np.sqrt(mean_squared_error(y2, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
test_y_pre = ridge.predict(x2_test)
test_rmse = np.sqrt(mean_squared_error(y2_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

############################## Principal Components ###################################
print ("====== Principal Components ======")
print ("=== ln_y ===")
pca = PCA(copy = True)
pca_fit = pca.fit(x1_train)
x1_pca = pca.fit_transform(x1_train)
reg = linear_model.LinearRegression()
pcreg = reg.fit(x1_pca,y1_train)
print(reg.score(x1_pca,y1_train))
x_pca = pca_fit.transform(x)
sink_y_pre = pcreg.predict(x_pca)
rmse = np.sqrt(mean_squared_error(y1, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
x1_test_pca = pca_fit.transform(x1_test)
test_y_pre = pcreg.predict(x1_test_pca)
test_rmse = np.sqrt(mean_squared_error(y1_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

print ("=== growth ===")
pca_fit = pca.fit(x2_train)
x2_pca = pca.fit_transform(x2_train)
reg = linear_model.LinearRegression()
pcreg = reg.fit(x2_pca,y2_train)
print(reg.score(x2_pca,y2_train))
x_pca = pca_fit.transform(x)
sink_y_pre = pcreg.predict(x_pca)
rmse = np.sqrt(mean_squared_error(y2, sink_y_pre))
print('RMSE for kitchen sink predictor: '+ str(rmse))
x2_test_pca = pca_fit.transform(x2_test)
test_y_pre = pcreg.predict(x2_test_pca)
test_rmse = np.sqrt(mean_squared_error(y2_test, test_y_pre))
print('Performance on test set: ' + str(test_rmse))

=== ln_y ===
('gcf', 'ext_bal', 'fem_emp', 'inf_mort', 'lexp', 'age_dep_young', 'urban', 'yrsoffc', 'military', 'competitiveness_leg', 'parliamentary', 'stability', 'effectiveness', 'regulation', 'corruption')
0.8987270727399517
MAE for naive predictor: 2.220446049250313e-16
RMSE for kitchen sink predictor: 0.2694374558652285
Performance on test set: 0.3247875479256552
=== growth ===
('gcf', 'ext_bal', 'inflation', 'tfr', 'yrsoffc', 'parliamentary', 'voice', 'effectiveness')
-0.04344698790079064
MAE for naive predictor: 1.3877787807814457e-17
RMSE for kitchen sink predictor: 0.020177753791363138
Performance on test set: 0.01864901908698803
=== ln_y ===
Best score: 0.8952336149717841
Best alpha: 20.09233002565047
MAE for naive predictor: 6.938893903907228e-17
RMSE for kitchen sink predictor: 0.27304195896211786
Performance on test set: 0.2997926643039591
=== growth ===
Best score: -0.1798079791124283
Best alpha: 61.35907273413173
MAE for naive predictor: 1.3877787807814457e-17
RMSE for 