# GLM - Logistic Regression

## Required modules

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import pymc3 as pm

%matplotlib inline

## Data

In [None]:
# Location of the raw College CSV; its first column is used as the row index.
COLLEGE_CSV_URL = "http://www-bcf.usc.edu/~gareth/ISL/College.csv"
college_raw_df = pd.read_csv(COLLEGE_CSV_URL, index_col=[0])

### Cleaning

In [None]:
# Keep only the columns needed for the model. The explicit copy makes
# college_df independent of college_raw_df, so the column assignments
# below operate on their own data rather than on a selection of the raw frame.
college_df = college_raw_df.loc[:, ["Private", "Apps", "Accept", "Expend"]].copy()

# Accept / Apps expressed as a percentage.
college_df["Accept.Rate"] = (college_df["Accept"] / college_df["Apps"]) * 100

# Encode Private as a boolean (True where the raw value is "Yes") using a
# vectorised comparison rather than a per-row lambda.
college_df["Private"] = college_df["Private"] == "Yes"

# Apps and Accept were only needed to derive Accept.Rate.
college_df = college_df.drop(["Apps", "Accept"], axis=1)

### Add constant

In [None]:
# Constant column of ones, used as the intercept term of the design matrix.
college_df = college_df.assign(intercept=1)

## Split into response (y) and predictors (X)

In [None]:
# Predictors: every column except the response.
X = college_df.loc[:, college_df.columns != "Private"]
# Response: the boolean Private indicator.
y = college_df.loc[:, "Private"]

## Model specification

In [None]:
with pm.Model() as model:

    # Wide zero-mean normal priors; tau is the precision, here 1 / 1000^2.
    prior_tau = 1 / (1000 ** 2)
    alpha = pm.Normal("alpha", mu=0, tau=prior_tau)
    beta = pm.Normal("beta", mu=0, tau=prior_tau, shape=2)

    # Linear predictor on the logit scale: intercept plus one slope per predictor.
    accept_rate = X["Accept.Rate"].values
    expend = X["Expend"].values
    lgt = alpha + beta[0] * accept_rate + beta[1] * expend

    # Bernoulli likelihood for the observed Private indicator.
    y_obs = pm.Bernoulli("y_obs", logit_p=lgt, observed=y.values)

### Samples

In [None]:
# Number of posterior draws requested from the sampler.
n_draws = 1000

with model:
    trace = pm.sample(draws=n_draws)

### Plot

In [None]:
# Trace plot of the sampled parameters in `trace`.
pm.traceplot(trace)

In [None]:
# Posterior plot for each sampled parameter in `trace`.
pm.plot_posterior(trace)

## Compare with frequentist approach

In [None]:
# Fit the same logistic regression with statsmodels: intercept column first,
# then the two predictors used in the PyMC3 model.
design_cols = ["intercept", "Accept.Rate", "Expend"]
freq_model = sm.Logit(y, X[design_cols])
freq_fit = freq_model.fit()
freq_fit.summary()