### **CH 13 GENERALIZED LINEAR MODELS**
### 13.4.1 LOGISTIC REGRESSION

In [None]:
# Mount Google Drive at /gdrive so the dataset files stored there can be read.
from google.colab import drive
drive.mount('/gdrive')
# Directory prefix that the read_csv calls in later cells prepend to each file name.
folder = "/gdrive/My Drive/Python Practice/Datasets"

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats

# Read the training and test partitions of the clothing-sales data.
sales_train = pd.read_csv(folder + "/clothing_sales_training.csv")
sales_test = pd.read_csv(folder + "/clothing_sales_test.csv")

# Design matrix: the two predictors with an intercept column added.
X = sm.add_constant(pd.DataFrame(sales_train[['Days', 'Web']]))

# Response, kept as a single-column DataFrame.
y = pd.DataFrame(sales_train[['CC']])

In [None]:
# Fit the logistic regression of the response y on the design matrix X
# (training data) and display the extended summary table.
logreg01 = sm.Logit(endog=y, exog=X).fit()
logreg01.summary2()

In [None]:
# Refit the same specification on the test partition so its summary can be
# compared with the training-set fit.
X_test = sm.add_constant(pd.DataFrame(sales_test[['Days', 'Web']]))
y_test = pd.DataFrame(sales_test[['CC']])

logreg01_test = sm.Logit(endog=y_test, exog=X_test).fit()
logreg01_test.summary2()

### 13.6.1 POISSON REGRESSION

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.tools.tools as stattools

# Poisson regression of the number of customer-service calls on churn status.
churn = pd.read_csv(folder + "/churn")

# Indicator column(s) for `Churn`, with the first level dropped as the reference.
# dtype=float is requested explicitly: pandas >= 2.0 returns boolean dummies by
# default, and a design matrix that mixes a bool column with the float constant
# is cast to an object array, which statsmodels refuses to fit.
churn_ind = pd.get_dummies(churn['Churn'], drop_first=True, dtype=float)

X = pd.DataFrame(churn_ind)
X = sm.add_constant(X)
# Relabel for a readable summary; assumes exactly one indicator column remains
# (i.e. `Churn` has two levels) -- TODO confirm against the data.
X.columns = ['const', 'Churn = True']
y = pd.DataFrame(churn[['CustServ Calls']])

poisreg01 = sm.GLM(y, X, family=sm.families.Poisson()).fit()
poisreg01.summary()

### HANDS-ON ANALYSIS
### 20
Build a logistic regression model to predict the income of a person based on their age, education (as a number, with variable education.num), and the hours worked per week. 
<br>Obtain the summary of the model.

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.tools.tools as stattools

# Logistic regression of income on age, years of education and weekly hours.
adult = pd.read_csv(folder + "/Adult")

# Predictors plus an intercept column.
X_names = ['age', 'education-num', 'hours-per-week']
X = pd.DataFrame(adult[X_names])
X = sm.add_constant(X)

# Response: indicator for the income level that remains after the first level
# (in sorted order) is dropped. dtype=int keeps a numeric 0/1 coding on every
# pandas version; pandas >= 2.0 would otherwise return True/False.
income_ind = pd.get_dummies(adult['income'], drop_first=True, dtype=int)
Y = pd.DataFrame(income_ind)

logreg20 = sm.Logit(Y,X).fit()
print(logreg20.summary2())

### 21
Are there any variables that should be removed from the model from the previous exercise? 
<br>If so, remove the variables and rerun the model.

In [None]:
# No variables need to be removed.

### 22
Write the descriptive form of the final logistic regression model from the previous exercise.

In [None]:
#p_hat(income) = exp(-8.4611 + (0.0459 * age) + (0.3449 * education-num) + (0.0423 * hours-per-week)) / ( 1 + exp(-8.4611 + (0.0459 * age) + (0.3449 * education-num) + (0.0423 * hours-per-week)) )

### 23
Interpret the coefficient of the age variable.

In [None]:
# For each additional year of age, the odds of income >50K are multiplied by
# exp(0.0459) ≈ 1.047, i.e. they increase by about 4.7% (other predictors held fixed).

### 24
Find the impact on the probability of having high income for every 10 years a person is older.

In [None]:
# Ten more years of age multiply the odds of income >50K by
# exp(10 * 0.0459) = exp(0.459) ≈ 1.58.

In [None]:
# Factor by which the fitted odds change for a 10-year increase in age.
# The coefficient is looked up by its label: integer keys on a label-indexed
# Series are treated as positions only through a deprecated fallback.
np.exp(10 * logreg20.params['age'])

### 25
Interpret the coefficient of the education.num variable.

In [None]:
# For each one-unit increase in education-num, the odds of income >50K are multiplied by
# exp(0.3449) ≈ 1.412, i.e. they increase by about 41% (other predictors held fixed).

### 26
Find the impact on the probability of having high income for every four more years of education a person has.


In [None]:
# Four more years of education multiply the odds of income >50K by
# exp(4 * 0.3449) = exp(1.3796) ≈ 3.97.

### 27
Interpret the coefficient of the hours.per.week variable.

In [None]:
# For each additional hour worked per week, the odds of income >50K are multiplied by
# exp(0.0423) ≈ 1.043, i.e. they increase by about 4.3% (other predictors held fixed).

### 28
Find the impact on the probability of having high income for every five more hours per week a person works.


In [None]:
# Five more hours worked per week multiply the odds of income >50K by
# exp(5 * 0.0423) = exp(0.2115) ≈ 1.236.

### 29 
Obtain the predicted values using the model from the previous exercise. 
<br>Compare the predicted values to the actual values.

In [None]:
# Fitted values from the logistic model, placed next to the observed response.
ypred = pd.DataFrame({'predicted': logreg20.predict(X)})
compare = pd.concat([Y, ypred], axis=1)
compare.head()

In [None]:
# Class labels: position 0 is used when the fitted value is at least 0.5,
# position 1 when it is below 0.5.
Income = [">50K.", "<=50K."]
predicted_prob = logreg20.predict(X)
labels = [Income[1] if p < 0.5 else Income[0] for p in predicted_prob]
y_pred = pd.DataFrame(labels)

# Cross-tabulate actual against predicted labels, with row/column totals.
pd.crosstab(
    index=adult["income"],
    columns=y_pred[0],
    rownames=["Actual Income"],
    colnames=["Predicted Income"],
    margins=True,
)

### 30
Build a Poisson regression model to predict the years of education a person has (using the variable education.num) based on a person’s age and the hours they work per week. 
<br>Obtain the summary of the model.

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.tools.tools as stattools


# Poisson regression of education-num on age and weekly hours.
adult = pd.read_csv(folder + "/Adult")

# Predictors with an intercept column added; column labels set explicitly.
X_names = ['age', 'hours-per-week']
X = sm.add_constant(pd.DataFrame(adult[X_names]))
X.columns = ['const', 'age', 'hours-per-week']

# Response, kept as a single-column DataFrame.
Y = pd.DataFrame(adult[['education-num']])

poisreg30 = sm.GLM(endog=Y, exog=X, family=sm.families.Poisson()).fit()
print(poisreg30.summary())

### 31
Are there any variables that should be removed from the model from the previous exercise? 
<br>If so, remove the variables and rerun the model.


In [None]:
# No

### 32
Write the descriptive form of the final Poisson regression model from the previous exercise.

In [None]:
# y_hat = exp(2.1739 + (0.0004 * age) + (0.0030 * hours-per-week))

### 33
Obtain the predicted values using the model from the previous exercise. <br>Compare the predicted values to the actual values.

In [None]:
# Fitted values from the Poisson model, placed next to the observed response.
ypred = pd.DataFrame({'predicted': poisreg30.predict(X)})
compare = pd.concat([Y, ypred], axis=1)
compare.head()

In [None]:
# Round the fitted values to whole numbers so they can be tabulated against
# the observed education-num values.
rounded_fit = poisreg30.predict(X).round()
y_pred = np.array(rounded_fit)

pd.crosstab(
    index=adult['education-num'],
    columns=y_pred,
    rownames=['Actual Edu Num'],
    colnames=['Predicted Edu Num'],
    margins=True,
)