# Import relevant libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()                             # Apply a skin to matplotlib plots


# Load the data

In [21]:
# Read the admissions data from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv('Binary predictors.csv')

# A single call prints, one after the other:
#   - the DataFrame itself,
#   - its column labels  -> ['SAT', 'Admitted', 'Gender'],
#   - its shape          -> 168 x 3.
print(df, df.columns, df.shape, sep='\n')

      SAT Admitted  Gender
0    1363       No    Male
1    1792      Yes  Female
2    1954      Yes  Female
3    1653       No    Male
4    1593       No    Male
5    1755      Yes  Female
6    1775      Yes  Female
7    1887      Yes  Female
8    1893      Yes  Female
9    1580       No    Male
10   1857      Yes  Female
11   1880      Yes  Female
12   1664      Yes  Female
13   1364       No    Male
14   1693       No    Male
15   1850      Yes  Female
16   1633       No    Male
17   1634       No    Male
18   1636      Yes  Female
19   1855      Yes  Female
20   1987      Yes  Female
21   1997      Yes    Male
22   1422       No  Female
23   1508       No  Female
24   1720      Yes  Female
25   1879      Yes    Male
26   1634      Yes  Female
27   1802      Yes    Male
28   1849      Yes  Female
29   1764      Yes  Female
..    ...      ...     ...
138  1412       No  Female
139  1557       No    Male
140  1821      Yes    Male
141  1760      Yes  Female
142  1685      Yes    Male
1

# Perform the mapping of categorical variables. 
DV (dependent variable): Admitted [categorical / non-metric]

IVs (independent variables): SAT [numeric / metric] + Gender [categorical / non-metric]



In [35]:
# Work on a copy so that the raw frame `df` keeps its original string labels
# and this cell can be re-run safely.
df_new = df.copy()
print(id(df))                # identity of the raw frame (the number changes on every run)
print(id(df_new))            # a different number confirms the copy is a separate object


# The DV "Admitted" and the IV "Gender" are categorical.
# First look at the distinct labels present in each of these columns.
print(df_new['Admitted'].unique())     # ['No' 'Yes'] - only 2 values
print(df_new['Gender'].unique())       # ['Male' 'Female'] - only 2 values

# Encode both columns as 0/1 dummy variables.
# The label mapped to 0 is the baseline (reference) group, so in statistical
# terminology 'Male' is the baseline for Gender.
df_new['Admitted'] = df_new['Admitted'].map({'Yes': 1, 'No': 0})
df_new['Gender'] = df_new['Gender'].map({'Female': 1, 'Male': 0})

# Series.map silently turns every label that is missing from the dict into NaN.
# Fail here, with the per-column NaN counts, rather than passing NaN on to the
# regression further down.
nan_counts = df_new[['Admitted', 'Gender']].isna().sum()
assert not nan_counts.any(), (
    'Missing or unmapped labels (NaN count per column):\n{}'.format(nan_counts)
)

df_new

2133977826248
2133979524568
['No' 'Yes']
['Male' 'Female']


Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0
5,1755,1,1
6,1775,1,1
7,1887,1,1
8,1893,1,1
9,1580,0,0


# Declare the IV and DV


In [32]:
# Dependent variable (DV): the admission outcome.
y = df_new['Admitted']

# Independent variables (IVs): SAT score and Gender.
predictor_cols = ['SAT', 'Gender']
x1 = df_new[predictor_cols]

# statsmodels does not include the intercept term b0 on its own, so we provide
# it explicitly: add_constant puts a 'const' column (x0) holding only 1.0s in
# front of the predictors.
x = sm.add_constant(x1)
print(x)

     const   SAT  Gender
0      1.0  1363       0
1      1.0  1792       1
2      1.0  1954       1
3      1.0  1653       0
4      1.0  1593       0
5      1.0  1755       1
6      1.0  1775       1
7      1.0  1887       1
8      1.0  1893       1
9      1.0  1580       0
10     1.0  1857       1
11     1.0  1880       1
12     1.0  1664       1
13     1.0  1364       0
14     1.0  1693       0
15     1.0  1850       1
16     1.0  1633       0
17     1.0  1634       0
18     1.0  1636       1
19     1.0  1855       1
20     1.0  1987       1
21     1.0  1997       0
22     1.0  1422       1
23     1.0  1508       1
24     1.0  1720       1
25     1.0  1879       0
26     1.0  1634       1
27     1.0  1802       0
28     1.0  1849       1
29     1.0  1764       1
..     ...   ...     ...
138    1.0  1412       1
139    1.0  1557       0
140    1.0  1821       0
141    1.0  1760       1
142    1.0  1685       0
143    1.0  1773       1
144    1.0  1826       1
145    1.0  1565       1


# Perform the logistic regression

In [34]:
# Specify the logit model: y is the response (endog) and x, which already
# carries the 'const' column, is the design matrix (exog).
reg_log = sm.Logit(endog=y, exog=x)

# Fit the model; the fitted results object is kept for the summary below.
results_log = reg_log.fit()

# Last expression in the cell, so the summary table is shown as the cell output.
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 07 Jan 2019",Pseudo R-squ.:,0.8249
Time:,13:24:01,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
,,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


# Inference: 
1. The model is significant, as the LLR p-value is extremely low (5.118e-42, i.e. effectively 0.000).
2. The "Gender" variable is also significant, as its p-value = 0.022, which is less than 0.05. 
3. The exact model is:
            => ln(odds) = -68.3489 + 0.0406*SAT + 1.9449*Gender
  
  Assume that there are 2 students who got the same SAT score
  
            i.e. SAT2 - SAT1 = 0
  Assume that these two students differ by one unit in Gender, i.e. one is male and the other is female:
            Gender2 = 1 (i.e. Female) and Gender1 = 0 (i.e. Male)
  
  Logistic regression coefficients cannot be interpreted directly; we have to compare the two cases by taking the difference of their equations. 
           
           => ln(odds2) = -68.3489 + 0.0406*SAT2 + 1.9449*Gender2
              ln(odds1) = -68.3489 + 0.0406*SAT1 + 1.9449*Gender1
              (subtract the second equation from the first; the constant -68.3489 cancels out)
             ------------------------------------------------------------------------
             ln(odds2) - ln(odds1) = 0.0406(SAT2 - SAT1) + 1.9449(Gender2 - Gender1)
             ------------------------------------------------------------------------
             
           => ln(odds2/odds1) = 0.0406*0 + 1.9449*1 = 1.9449
           => odds2/odds1 = e^1.9449
           => odds2/odds1 = 6.9929 ~= 7.0
           => odds_female/odds_male = 7.0
           => odds_female = 7.0 * odds_male
           
  Conclusion: 
      The odds of a female getting admitted are 7.0 times the odds of a male getting admitted, given the same SAT score. In other words, according to this model, a female applicant has a better chance of getting admitted than a male applicant with the same SAT score. (Note that 7.0 is a ratio of odds, not a ratio of probabilities.) This, however, is not necessarily true in real life, as universities do have fixed quotas for each gender. 
             
             
             