## <font color='green'> Application of the Logit model (logistic regression) to Default Data

In [None]:
import os
os.chdir('/Users/hj020/Desktop/2022/EconomicAnalytics-master/Python_/Data')

import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

# Load the Default data set (path is relative to the current working directory)
raw0 = pd.read_csv('Default.csv')

# Drop the observations that contain missing values.
# DataFrame.dropna() returns a new frame instead of modifying raw0 in place,
# so the result has to be assigned back for the rows to actually be removed.
raw0 = raw0.dropna()

# Preview the first rows
raw0.head()

### <font color='green'> 1) Produce descriptive statistics and visualize data

#### <font color='green'> i) .describe()
* pandas .describe(): https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html

In [None]:
# Summary statistics for every column; include='all' also reports the non-numeric columns
raw0.describe(include = 'all')

#### <font color='green'> ii) crosstab
* Crosstab: https://pbpython.com/pandas-crosstab.html
    - pd.crosstab(A,B): produce a frequency table with the groups in A on the rows and groups in B on the columns

In [None]:
# Frequency table: student status on the rows, default status on the columns
pd.crosstab(index=raw0['student'], columns=raw0['default'])

In [None]:
# Cross-tabulate first, then draw the counts as grouped bars
default_by_student = pd.crosstab(index=raw0['student'], columns=raw0['default'])
default_by_student.plot(kind='bar')

# Labels are applied to the current axes, i.e. the bar chart created above
plt.ylabel('Frequency')
plt.xlabel('Student status')
plt.title('Default Frequency')
plt.show()

#### <font color='green'> iii) Histogram
* Histogram: https://jakevdp.github.io/PythonDataScienceHandbook/04.05-histograms-and-binnings.html

In [None]:
# One panel per variable; each panel overlays the histograms of
# non-defaulters and defaulters so the two distributions can be compared.
plt.figure(figsize=(10, 3), dpi=100)

panels = [('balance', 'Credit Card Balance'), ('income', 'Annual Income')]
groups = [('No', 'Non-defaulter'), ('Yes', 'Defaulter')]

for position, (column, axis_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    for status, legend_label in groups:
        # select the values of this variable for one default status
        plt.hist(raw0.loc[raw0.default == status, column], label=legend_label, alpha=0.7)
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.legend()

plt.show()

#### <font color='green'> iv) Boxplot
* boxplot: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.boxplot.html

In [None]:
# Box plots of balance and income, each grouped by default status
# (one separate figure per variable)
for column in ('balance', 'income'):
    boxplot = raw0.boxplot(column=column, by=['default'], figsize=(6, 6))

### <font color='green'> 2) Run a logistic regression on the default data

* Two packages to run a logistic regression
    - statsmodels: `sm.Logit` (array interface) or `smf.logit` (formula interface)
    - sklearn.linear_model: LogisticRegression

In [None]:
# Create 0/1 dummies.
# drop_first=True drops the first category, so a two-category column becomes a
# single indicator column; .iloc[:, 0] takes that column as a Series.
# dtype=int forces integer 0/1 values: pandas >= 2.0 returns bool dummies by
# default, and a bool column mixed with float regressors turns the design matrix
# into an object array that statsmodels rejects (and the formula interface
# treats a bool response as categorical).
raw0['default'] = pd.get_dummies(raw0['default'], drop_first=True, dtype=int).iloc[:, 0]  # default = 1
raw0['student'] = pd.get_dummies(raw0['student'], drop_first=True, dtype=int).iloc[:, 0]  # student = 1

In [None]:
# Run a logistic regression
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Response: the default indicator; regressors: every column from the third onward
Y = raw0.default
X = sm.add_constant(raw0.iloc[:, 2:])  # adds the intercept column ('const')

# Note the capital L: sm.Logit (not sm.logit), and the response goes first
logitres = sm.Logit(Y, X).fit()

print(logitres.summary2())

In [None]:
# Logistic regression with interaction and higher-order terms.
# The formula interface uses the lowercase smf.logit (not Logit).
formula2 = 'default ~ student + income + balance + student*balance + student*income + np.power(income,2) + np.power(balance,2)'
logitres2 = smf.logit(formula2, data=raw0).fit()

print(logitres2.summary2())

### <font color='green'> 3) Calculate/show default probability for students and non-students, separately, as a function of balance, holding income at its mean

* see the lecture note 6, page 6-7 (or textbook chapter 4.3)

In [None]:
# Grid of balance values spanning the observed range, kept as a column vector
n_grid = 100
grid0 = np.linspace(raw0.balance.min(), raw0.balance.max(), n_grid).reshape((n_grid, 1))

# Columns of the prediction matrices, laid out as: const, student, balance, income.
# NOTE(review): this assumes the columns of X used to fit logitres are in that order.
intercept_col = np.ones((n_grid, 1))
income_col = np.full((n_grid, 1), raw0.income.mean())  # income held at its sample mean

xx1 = np.hstack([intercept_col, np.ones((n_grid, 1)), grid0, income_col])   # student = 1
xx2 = np.hstack([intercept_col, np.zeros((n_grid, 1)), grid0, income_col])  # student = 0
prd1 = logitres.predict(xx1)
prd2 = logitres.predict(xx2)

# Predicted default probability against balance, one curve per student status
plt.figure(figsize=(8, 7), dpi=80)
for probabilities, curve_label in ((prd1, 'Student'), (prd2, 'Non-student')):
    plt.plot(grid0, probabilities, label=curve_label)
plt.xlabel('Balance')
plt.ylabel('Prob of Default')
plt.legend()
plt.show()

### <font color='green'> 4) Calculate marginal effect of a variable on default probability

* For the calculation of marginal effect of a continuous variable (e.g. balance or income) on default probability, 
    
    1. you can use the formula (1) on page 6 of the lecture note 6. 
    2. Or you can use ".get_margeff( )" function in statsmodels 
     * see: https://towardsdatascience.com/binary-logistic-regression-using-python-research-oriented-modelling-and-interpretation-49b025f1b510 and https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.DiscreteResults.get_margeff.html
     * However, remember that the five different types of marginal effects calculated by ".get_margeff( )" do not include the marginal effect computed by formula (1).

In [None]:
# Marginal effects (dy/dx) evaluated at the means of the regressors;
# dummy=True asks statsmodels to treat binary regressors as discrete changes.
margeff_options = {'at': 'mean', 'method': 'dydx', 'dummy': True}
meff = logitres.get_margeff(**margeff_options)
print(meff.summary())

### <font color='darkred'> HW6: Calculate the marginal effect of the student status on default probability, holding income and balance at their means, using the formula (2) on page 7 of the lecture note 6

* Note that the student variable is binary

### <font color='green'> 5) Evaluate the logit regression

#### <font color='green'> i) Out of sample prediction accuracy: test error

In [None]:
# Note: logit in statsmodels provides the summary table but logit in sklearn.linear_model doesn't.
# However, logit in sklearn.linear_model has some useful attributes like prediction score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the observations as a test set.
# A fixed random_state makes the split (and every score and confusion matrix
# computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# X already contains the 'const' column added by sm.add_constant, so sklearn must
# not add a second intercept. fit_intercept has to be a real boolean: recent
# scikit-learn versions validate parameter types and reject the integer 0.
logreg = LogisticRegression(fit_intercept=False, solver='lbfgs').fit(X_train, y_train)

# Mean accuracy on the held-out test set
logreg.score(X_test, y_test)

#### <font color='green'> ii) Confusion Matrix
    
* https://en.wikipedia.org/wiki/Confusion_matrix
* https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted class labels for the test set
y_pred = logreg.predict(X_test)

# Confusion matrix of actual vs. predicted labels
cm_logit = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm_logit)

## <font color='green'> Linear Discriminant Analysis (LDA), Quadratic Discriminant Analysis (QDA) and K-Nearest Neighbors (KNN)

### <font color='green'> Application of LDA and QDA to the default data

* LDA: https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
* QDA: https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis
* Visualization of LDA \& QDA: https://scikit-learn.org/stable/auto_examples/classification/plot_lda_qda.html#sphx-glr-auto-examples-classification-plot-lda-qda-py

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
# An intercept will be automatically included in the model, so we need to remove it from X.
# errors='ignore' keeps this cell idempotent: X_train / X_test are overwritten here,
# so on a second run 'const' is already gone and a plain drop would raise KeyError.
X_train = X_train.drop(columns='const', errors='ignore')
X_test = X_test.drop(columns='const', errors='ignore')

# test error: fit on the training split, report mean accuracy on the test split
LDAres = LDA().fit(X_train, y_train)
QDAres = QDA().fit(X_train, y_train)

print(LDAres.score(X_test, y_test))
print(QDAres.score(X_test, y_test))

In [None]:
# Confusion matrices for LDA and QDA on the test set
y_pred_LDA, y_pred_QDA = (fitted.predict(X_test) for fitted in (LDAres, QDAres))

cm_LDA = confusion_matrix(y_true=y_test, y_pred=y_pred_LDA)
cm_QDA = confusion_matrix(y_true=y_test, y_pred=y_pred_QDA)

# LDA matrix first, then QDA, one after the other
print(cm_LDA, cm_QDA, sep='\n')

### <font color='green'> Application of KNN to the default data
* KNN: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
* Visualization of KNN: https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html#sphx-glr-auto-examples-neighbors-plot-classification-py

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNNC

# Test accuracy of a 5-nearest-neighbours classifier
KNNCres = KNNC(n_neighbors=5)
KNNCres.fit(X_train, y_train)
print(KNNCres.score(X_test, y_test))

In [None]:
# Confusion matrix of the KNN predictions on the test set
y_pred_KNNC = KNNCres.predict(X_test)
cm_KNNC = confusion_matrix(y_true=y_test, y_pred=y_pred_KNNC)
print(cm_KNNC)

### <font color='darkred'> HW6-2: Optimize/tune the number of neighbors of KNN using cross-validation
    
* Use the function "GridSearchCV" in scikit-learn:
    - Manual: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    - An example: https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
* Use "precision" to evaluate the prediction performance at each number of neighbors
* Use the default data
* Visualize your results with a graph that plots the prediction performance against the number of neighbors