# DS-SF-33 | Unit Project 3: Machine Learning Modeling

In this project, you will perform a logistic regression on the admissions data we've been working with in Unit Projects 1 and 2.

In [55]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

import statsmodels.formula.api as smf

from sklearn import linear_model

In [56]:
df = pd.read_csv(os.path.join('..', '..', 'dataset', 'dataset-ucla-admissions.csv'))
df.dropna(inplace = True)

df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3.0
1,1,660.0,3.67,3.0
2,1,800.0,4.00,1.0
3,1,640.0,3.19,4.0
4,0,520.0,2.93,4.0
...,...,...,...,...
395,0,620.0,4.00,2.0
396,0,560.0,3.04,3.0
397,0,460.0,2.63,2.0
398,0,700.0,3.65,2.0


## Part A.  Frequency Table

> ### Question 1.  Create a frequency table for `prestige` and whether an applicant was admitted.

In [3]:
# TODO
pd.crosstab(df.prestige,df.admit)

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,28,33
2.0,95,53
3.0,93,28
4.0,55,12


In [4]:
pd.crosstab(df.prestige,df.admit,normalize=True)

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.070529,0.083123
2.0,0.239295,0.133501
3.0,0.234257,0.070529
4.0,0.138539,0.030227


In [5]:
pd.crosstab(df.prestige,df.admit,normalize=True).sum()

admit
0    0.68262
1    0.31738
dtype: float64

In [6]:
pd.crosstab(df.prestige,df.admit,normalize='index')

admit,0,1
prestige,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,0.459016,0.540984
2.0,0.641892,0.358108
3.0,0.768595,0.231405
4.0,0.820896,0.179104


## Part B.  Variable Transformations

> ### Question 2.  Create a one-hot encoding for `prestige`.

In [57]:
# Let's rescast prestige as int....
df.prestige = df.prestige.astype(int)
df

Unnamed: 0,admit,gre,gpa,prestige
0,0,380.0,3.61,3
1,1,660.0,3.67,3
2,1,800.0,4.00,1
3,1,640.0,3.19,4
4,0,520.0,2.93,4
...,...,...,...,...
395,0,620.0,4.00,2
396,0,560.0,3.04,3
397,0,460.0,2.63,2
398,0,700.0,3.65,2


In [58]:
one_hot = pd.get_dummies(df.prestige)
one_hot

Unnamed: 0,1,2,3,4
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
395,0,1,0,0
396,0,0,1,0
397,0,1,0,0
398,0,1,0,0


In [59]:
df = df.join(one_hot)
df

Unnamed: 0,admit,gre,gpa,prestige,1,2,3,4
0,0,380.0,3.61,3,0,0,1,0
1,1,660.0,3.67,3,0,0,1,0
2,1,800.0,4.00,1,1,0,0,0
3,1,640.0,3.19,4,0,0,0,1
4,0,520.0,2.93,4,0,0,0,1
...,...,...,...,...,...,...,...,...
395,0,620.0,4.00,2,0,1,0,0
396,0,560.0,3.04,3,0,0,1,0
397,0,460.0,2.63,2,0,1,0,0
398,0,700.0,3.65,2,0,1,0,0


> ### Question 3.  How many of these binary variables do we need for modeling?

Answer:

> ### Question 4.  Why are we doing this?

Answer:

> ### Question 5.  Add all these binary variables in the dataset and remove the now redundant `prestige` feature.

In [60]:
# TODO
df2 = df.drop('prestige', 1)

## Part C.  Hand calculating odds ratios

In [13]:
df2

Unnamed: 0,admit,gre,gpa,1,2,3,4
0,0,380.0,3.61,0,0,1,0
1,1,660.0,3.67,0,0,1,0
2,1,800.0,4.00,1,0,0,0
3,1,640.0,3.19,0,0,0,1
4,0,520.0,2.93,0,0,0,1
...,...,...,...,...,...,...,...
395,0,620.0,4.00,0,1,0,0
396,0,560.0,3.04,0,0,1,0
397,0,460.0,2.63,0,1,0,0
398,0,700.0,3.65,0,1,0,0


In [63]:
df2 = df2.rename(columns= {1: 'prestige_1', 2: 'prestige_2', 3: 'prestige_3', 4: 'prestige_4'})

In [64]:
df2

Unnamed: 0,admit,gre,gpa,prestige_1,prestige_2,prestige_3,prestige_4
0,0,380.0,3.61,0,0,1,0
1,1,660.0,3.67,0,0,1,0
2,1,800.0,4.00,1,0,0,0
3,1,640.0,3.19,0,0,0,1
4,0,520.0,2.93,0,0,0,1
...,...,...,...,...,...,...,...
395,0,620.0,4.00,0,1,0,0
396,0,560.0,3.04,0,0,1,0
397,0,460.0,2.63,0,1,0,0
398,0,700.0,3.65,0,1,0,0


Let's develop our intuition about expected outcomes by hand calculating odds ratios.

> ### Question 6.  Create a frequency table for `prestige = 1` and whether an applicant was admitted.

In [65]:
# TODO
pd.crosstab(df2['prestige_1'],df2.admit)

admit,0,1
prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,243,93
1,28,33


> ### Question 7.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the most prestigious undergraduate schools.

In [22]:
# TODO
pd.crosstab(df2['prestige_1'],df2.admit,normalize='index')

admit,0,1
prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.723214,0.276786
1,0.459016,0.540984


In [25]:
pd.crosstab(df2['prestige_1'],df2.admit,normalize='index').apply(lambda p: p/(1-p), axis=1)

admit,0,1
prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.612903,0.382716
1,0.848485,1.178571


In [31]:
.541/(1-.541)

1.178649237472767

> ### Question 8.  Now calculate the odds of admission for undergraduates who did not attend a #1 ranked college.

In [32]:
# TODO
pd.crosstab(df2['prestige_1'],df2.admit,normalize='index').apply(lambda p: p/(1-p), axis=1)

admit,0,1
prestige_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.612903,0.382716
1,0.848485,1.178571


> ### Question 9.  Finally, what's the odds ratio?

In [34]:
# TODO
1.17864/.38

0.1464715682135923

> ### Question 10.  Write this finding in a sentence.

Answer:

> ### Question 11.  Use the frequency table above to calculate the odds of being admitted to graduate school for applicants that attended the least prestigious undergraduate schools.  Then calculate their odds ratio of being admitted to UCLA.  Finally, write this finding in a sentence.

In [35]:
# TODO
pd.crosstab(df2['prestige_4'],df2.admit,normalize='index').apply(lambda p: p/(1-p), axis=1)

admit,0,1
prestige_4,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.894737,0.527778
1,4.583333,0.218182


In [36]:
.218182/ overall mean of admit

0.0476033489166072

Answer:

## Part C. Analysis using `statsmodels`

> ### Question 12.  Fit a logistic regression model predicting admission into UCLA using `gre`, `gpa`, and the `prestige` of the undergraduate schools.  Use the highest prestige undergraduate schools as your reference point.

In [67]:
# TODO
df2['const'] = 1
X = df2[['const', 'gre', 'gpa', 'prestige_2', 'prestige_3', 'prestige_4']]
y = df2['admit']
logit = smf.Logit(y, X)
sm = logit.fit()
sm.params

Optimization terminated successfully.
         Current function value: 0.573854
         Iterations 6


const        -3.876854
gre           0.002218
gpa           0.779337
prestige_2   -0.680137
prestige_3   -1.338677
prestige_4   -1.553411
dtype: float64

> ### Question 13.  Print the model's summary results.

In [68]:
# TODO
print(sm.summary())

                           Logit Regression Results                           
Dep. Variable:                  admit   No. Observations:                  397
Model:                          Logit   Df Residuals:                      391
Method:                           MLE   Df Model:                            5
Date:                Wed, 19 Apr 2017   Pseudo R-squ.:                 0.08166
Time:                        20:09:47   Log-Likelihood:                -227.82
converged:                       True   LL-Null:                       -248.08
                                        LLR p-value:                 1.176e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.8769      1.142     -3.393      0.001      -6.116      -1.638
gre            0.0022      0.001      2.028      0.043    7.44e-05       0.004
gpa            0.7793      0.333      2.344      0.0

> ### Question 14.  What are the odds ratios of the different features and their 95% confidence intervals?

In [69]:
# TODO
print(np.exp(sm.params))
print(sm.conf_int())

const         0.020716
gre           1.002221
gpa           2.180027
prestige_2    0.506548
prestige_3    0.262192
prestige_4    0.211525
dtype: float64
                   0         1
const      -6.116077 -1.637631
gre         0.000074  0.004362
gpa         0.127619  1.431056
prestige_2 -1.301337 -0.058936
prestige_3 -2.014579 -0.662776
prestige_4 -2.371624 -0.735197


> ### Question 15.  Interpret the odds ratio for `prestige = 2`.

Answer:

> ### Question 16.  Interpret the odds ratio of `gpa`.

Answer:

> ### Question 17.  Assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [71]:
# TODO
X1 = [[1, 800, 4, 0, 0, 0]]
X2 = [[1, 800, 4, 1, 0, 0]]
X3 = [[1, 800, 4, 0, 1, 0]]
X4 = [[1, 800, 4, 0, 0, 1]]
print(sm.predict(X1))
print(sm.predict(X2))
print(sm.predict(X3))
print(sm.predict(X4))

[ 0.73403998]
[ 0.58299512]
[ 0.41983282]
[ 0.36860803]


In [None]:
sample = pd.DataFrame something fromdict and then force to be the right order

Answer:

## Part D. Moving the model from `statsmodels` to `sklearn`

> ### Question 18.  Let's assume we are satisfied with our model.  Remodel it (same features) using `sklearn`.  When creating the logistic regression model with `LogisticRegression(C = 10 ** 2)`.

In [50]:
# TODO
from sklearn.linear_model import LogisticRegression
sk = LogisticRegression(C = 10 ** 2)
sk.fit(X,y)


LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

> ### Question 19.  What are the odds ratios for the different variables and how do they compare with the odds ratios calculated with `statsmodels`?

In [51]:
# TODO
print(np.exp(sk.coef_))

[[ 1.00216055  1.96041259  0.53321936  0.28586733  0.20829663]]


Answer:

> ### Question 20.  Again, assuming a student with a GRE of 800 and a GPA of 4.  What is his/her probability of admission  if he/she come from a tier-1, tier-2, tier-3, or tier-4 undergraduate school?

In [54]:
# TODO
print(sk.predict_proba(X1))
print(sk.predict_proba(X2))
print(sk.predict_proba(X3))
print(sk.predict_proba(X4))sr

[[ 0.28814605  0.71185395]]
[[ 0.43153702  0.56846298]]
[[ 0.58608936  0.41391064]]
[[ 0.66024514  0.33975486]]


Answer: