# CH 11 REGRESSION MODELING

In [None]:
# Mount Google Drive inside the Colab runtime and point `folder` at the
# directory the data files are read from in the cells below.
from google.colab import drive

drive.mount("/gdrive")
folder = "/gdrive/My Drive/Python Practice/Datasets"

# HANDS-ON ANALYSIS


---
# Use the **adult** dataset below.

---

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as met
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Load the adult data set from the mounted Drive folder.
adult = pd.read_csv(f"{folder}/Adult")

#25
Partition the data set into a training set and a test set, each containing about half of the records.

In [None]:
# Half of the records go to each partition; the fixed random_state makes the
# split repeatable across runs.
adult_train, adult_test = train_test_split(
    adult,
    test_size=0.5,
    random_state=7,
)

#26
Run a regression model to predict Hours per Week using Age and Education Num. <br>
Obtain a summary of the model. <br>
Are there any predictor variables that should not be in the model?


In [None]:
# Predictors (with an intercept column added) and response from the training half.
X_train = sm.add_constant(adult_train[['age', 'education-num']])
Y_train = adult_train[['hours-per-week']]

# Ordinary least squares fit of hours-per-week on age and education-num.
model1 = sm.OLS(endog=Y_train, exog=X_train).fit()
print(model1.summary())
# Author's reading of the summary: both age and education-num are significant
# predictors of hours-per-week, so neither needs to be removed from the model.

#27
Validate the model from the previous exercise.


In [None]:
# Validation: fit the same specification on the test half so that its summary
# can be compared with the training-set summary.
X_test = sm.add_constant(adult_test[['age', 'education-num']])
Y_test = adult_test[['hours-per-week']]

model1_test = sm.OLS(endog=Y_test, exog=X_test).fit()
print(model1_test.summary())

In [None]:
# Score the test-set predictors with the model that was fitted on the training set.
model1_pred = model1.predict(exog=X_test)

In [None]:
# s, the standard error of the estimate: square root of the fitted model's
# scale (its mean squared error).
print(
    '\n s (root MSE): %.4f'
    % np.sqrt(model1.scale)
)

In [None]:
# MAE (mean absolute error) of the training-fitted model's predictions on the
# test set. `met` is sklearn.metrics, imported once in the notebook's import
# cell instead of being re-imported in the middle of the analysis.
print('\n MAE : %.4f' % met.mean_absolute_error(y_true=Y_test, y_pred=model1_pred))

#28
Use the regression equation to complete this sentence: “The estimated Hours per Week equals….”


In [None]:
# `params` is a label-indexed Series (const, age, education-num, in the column
# order of X_train). Integer keys such as params[0] rely on pandas' deprecated
# positional fallback, so positions are taken explicitly through .iloc.
print('The estimated Hours per Week = \n%.4f + %.4f * (Age) + %.4f * (Education Num)'
      % (model1.params.iloc[0], model1.params.iloc[1], model1.params.iloc[2]))

#29
Interpret the coefficient for Age.

In [None]:
# Report the fitted coefficient itself rather than a hand-copied number, so the
# sentence stays correct if the data or the train/test partition changes.
print("As age increases by one unit, hours per week increases by %.4f given the other variable is fixed."
      % model1.params['age'])

#30
Interpret the coefficient for Education Num.


In [None]:
# Report the fitted coefficient itself rather than a hand-copied number, so the
# sentence stays correct if the data or the train/test partition changes.
print("As education-num increases by one unit, hours per week increases by %.4f given the other variable is fixed."
      % model1.params['education-num'])

#31
Find and interpret the value of s.

In [None]:
# The same value of s fills both placeholders: once as the statistic, once
# inside the sentence that interprets it.
print(
    "s (square root of MSE): %.4f \nThe model's typical prediction error between prediction value and actual value is %.4f hours"
    % ((np.sqrt(model1.scale),) * 2)
)

#32
Find and interpret $R^2_{adj}$.


In [None]:
# Adjusted R-squared is the share of the variability in the response that the
# predictors explain, penalised for the number of predictors. It does not
# measure correlation among the predictors, so the message states the
# explained-variability reading.
print("R_adj : %.4f \nAbout %.2f%% of the variability in hours per week is explained by age and education-num (adjusted for the number of predictors)"
      % (model1.rsquared_adj, 100 * model1.rsquared_adj))

#33
Find $MAE_{Baseline}$ and $MAE_{Regression}$, and determine whether the regression model outperformed its baseline model.

In [None]:
# Baseline model: predict the mean response for every test record, then compare
# its MAE with the regression's MAE on the same records.
# The response is pulled out as a Series first: np.mean() on a one-column
# DataFrame returns a Series or a scalar depending on the pandas version.
y_actual_adult = Y_test.iloc[:, 0]
mae_base_adult = (y_actual_adult - y_actual_adult.mean()).abs().mean()
mae_reg_adult = met.mean_absolute_error(y_true=y_actual_adult, y_pred=model1_pred)
print('MAE_base : %.4f,\t MAE_reg : %.4f ' % (mae_base_adult, mae_reg_adult))

# Lower MAE is better; the verdict is derived from the numbers rather than
# written by hand, so it cannot drift out of sync with the output above.
if mae_reg_adult < mae_base_adult:
    print('Since MAE_base > MAE_reg, the regression model performs better than the baseline model')
else:
    print('Since MAE_base <= MAE_reg, the regression model does not outperform the baseline model')

In [None]:
############################################## TODO: re-organize the notebook from here onward #######################################################

---
# Use the **bank_reg_training** and **bank_reg_test** datasets below.

---

#34
Use the training set to run a regression predicting Credit Score, based on Debt‐to‐Income Ratio and Request Amount. <br>
Obtain a summary of the model. <br>
Do both predictors belong in the model?

In [None]:
# The bank data comes as two separate files: one for training, one for testing.
bank_train = pd.read_csv(f"{folder}/bank_reg_training")
bank_test = pd.read_csv(f"{folder}/bank_reg_test")

# Training design matrix (intercept + two predictors) and response.
X = sm.add_constant(bank_train[['Debt-to-Income Ratio', 'Request Amount']])
Y = bank_train[['Credit Score']]

# Same layout for the test set; the validation cells that follow read these.
X_test = sm.add_constant(bank_test[['Debt-to-Income Ratio', 'Request Amount']])
Y_test = bank_test[['Credit Score']]

# Ordinary least squares fit of Credit Score on the two predictors.
model02 = sm.OLS(endog=Y, exog=X).fit()
print(model02.summary())

In [None]:
# Both predictors belong in the model

#35
Validate the model from the previous exercise.


In [None]:
# Validation: fit the same specification on the test set and show its summary
# for comparison with the training-set summary.
model02_test = sm.OLS(endog=Y_test, exog=X_test).fit()
print(model02_test.summary())

In [None]:
# Score the test-set predictors with the model that was fitted on the training set.
ypred02 = model02.predict(exog=X_test)

In [None]:
# s, the standard error of the estimate: square root of the fitted model's
# scale (its mean squared error).
print(
    '\n s (root MSE): %.4f'
    % np.sqrt(model02.scale)
)

In [None]:
# MAE (mean absolute error) of the training-fitted model's predictions on the
# test set. `met` is sklearn.metrics, imported once in the notebook's import
# cell; the duplicate mid-notebook import is dropped.
print('\n MAE : %.4f' % met.mean_absolute_error(y_true=Y_test, y_pred=ypred02))

#36
Use the regression equation to complete this sentence: “The estimated Credit Score equals….”


In [None]:
# The original format string had no operator before the Debt-to-Income term and
# only read correctly when that coefficient printed with its own minus sign.
# %+.4f always prints the sign, so the equation is well-formed either way.
# Coefficients are taken by position through .iloc (const, Debt-to-Income Ratio,
# Request Amount, in the column order of X) because integer keys on the
# label-indexed `params` Series are deprecated.
print('The estimated Credit Score = \n%.4f %+.4f * (Debt-to-Income Ratio) %+.4f * (Request Amount)'
      % (model02.params.iloc[0], model02.params.iloc[1], model02.params.iloc[2]))

#37
Interpret the coefficient for Debt‐to‐Income Ratio.


In [None]:
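# Debt-to-Income Ratio is a significant predictor with a negative coefficient: a higher ratio goes with a lower estimated Credit Score when Request Amount is held fixed.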

#38
Interpret the coefficient for Request Amount.



In [None]:
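# Request Amount is a significant predictor with a positive but weak effect: a larger request goes with a slightly higher estimated Credit Score when Debt-to-Income Ratio is held fixed.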

#39
Find and interpret the value of s.

In [None]:
# s is expressed in the units of the response. model02 predicts Credit Score,
# so the unit is credit score points (the original text said "hours", copied
# from the adult exercise).
print("s (square root of MSE): %.4f \nThe model's typical prediction error between prediction value and actual value is %.4f credit score points"
      % (np.sqrt(model02.scale), np.sqrt(model02.scale)))

#40
Find and interpret $R^2_{adj}$.

In [None]:
# Adjusted R-squared is read from model02, the training-set fit that the other
# answers in this section (equation, s) describe; the original read it from the
# test-set refit model02_test. It is the share of the variability in the
# response explained by the predictors, penalised for their number. It does not
# measure correlation among the predictors.
print("R_adj : %.4f \nAbout %.2f%% of the variability in Credit Score is explained by Debt-to-Income Ratio and Request Amount (adjusted for the number of predictors)"
      % (model02.rsquared_adj, 100 * model02.rsquared_adj))

#41
Find $MAE_{Baseline}$ and $MAE_{Regression}$, and determine whether the regression model outperformed its baseline model.

In [None]:
# Baseline model: predict the mean response for every test record, then compare
# its MAE with the regression's MAE on the same records.
# The response is pulled out as a Series first: np.mean() on a one-column
# DataFrame returns a Series or a scalar depending on the pandas version.
y_actual_bank = Y_test.iloc[:, 0]
mae_base_bank = (y_actual_bank - y_actual_bank.mean()).abs().mean()
mae_reg_bank = met.mean_absolute_error(y_true=y_actual_bank, y_pred=ypred02)
print('MAE_base : %.4f,\t MAE_reg : %.4f ' % (mae_base_bank, mae_reg_bank))

# Lower MAE is better; the verdict is derived from the numbers rather than
# written by hand, so it cannot drift out of sync with the output above.
if mae_reg_bank < mae_base_bank:
    print('Since MAE_base > MAE_reg, the regression model performs better than the baseline model')
else:
    print('Since MAE_base <= MAE_reg, the regression model does not outperform the baseline model')

#42
Construct a regression model for predicting Interest, using Request Amount. <br>
Obtain a summary of the model.

In [None]:
# Simple regression of Interest on Request Amount.
# `bank_train` is already in memory from the cell that loaded the bank data for
# exercise #34, so the CSV files are not read a second time here. The
# commented-out test-set lines were dead code and have been removed.
X = sm.add_constant(bank_train[['Request Amount']])
Y = bank_train[['Interest']]

model03 = sm.OLS(Y, X).fit()
print(model03.summary())

#43
Explain what is unusual with your results from the previous exercise.


In [None]:
# It is unusual that R-squared equals 1: the fit is perfect, which means Interest and Request Amount are 100% correlated with each other.

#44
Construct a scatterplot of Interest against Request Amount. <br>
Describe the relationship between the variables. <br>
Explain how this relationship explains the unusual results from your regression model.


In [None]:
# Scatterplot of the response (Interest) against the single predictor used by
# model03. Columns are passed as Series, and the axes are labelled so the figure
# can be read on its own. `plt` comes from the notebook's import cell.
fig, ax = plt.subplots()
ax.scatter(X['Request Amount'], Y['Interest'])
ax.set_xlabel('Request Amount')
ax.set_ylabel('Interest')
ax.set_title('Interest vs. Request Amount')
plt.show()

In [None]:
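# TODO: describe the relationship shown in the scatterplot and how it explains the unusual regression results above.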