## Logistic Regression

Logistic regression is used when the possible outcomes are categorical (e.g. Yes/No) rather than numerical.

### Step 1: Import relevant Libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### Step 2: Load the data

In [3]:
# Read the admissions data; the preview shows an SAT score and a Yes/No admission decision per row
raw_data = pd.read_csv('Admittance.csv')
raw_data

Unnamed: 0,SAT,Admitted
0,1363,No
1,1792,Yes
2,1954,Yes
3,1653,No
4,1593,No
...,...,...
163,1722,Yes
164,1750,Yes
165,1555,No
166,1524,No


### Step 3: Map the Categorical Data

In [4]:
# Encode the target numerically ('Yes' -> 1, 'No' -> 0) in a new frame;
# raw_data itself is left untouched
data = raw_data.assign(
    Admitted=raw_data['Admitted'].map({'Yes': 1, 'No': 0})
)
data

Unnamed: 0,SAT,Admitted
0,1363,0
1,1792,1
2,1954,1
3,1653,0
4,1593,0
...,...,...
163,1722,1
164,1750,1
165,1555,0
166,1524,0


### Step 4: Build the Regression

In [6]:
# Dependent variable (target): the 0/1 admission outcome
y = data['Admitted']
# Independent variable (single predictor): the SAT score
x1 = data['SAT']

In [10]:
# Add a constant column so the model can estimate an intercept
x = sm.add_constant(x1)

# Build the fit
# Logit.fit() estimates the coefficients by iterative maximum-likelihood optimization.
# The optimizer stops when it converges or reaches its iteration cap (statsmodels' default maxiter is 35);
# the printed "Iterations" line reports how many iterations were actually needed.
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.137766
         Iterations 10


In [11]:
# Display the fitted model's summary table (fit statistics and per-coefficient estimates)
results_log.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 28 May 2021",Pseudo R-squ.:,0.7992
Time:,02:59:11,Log-Likelihood:,-23.145
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.805000000000001e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-69.9128,15.737,-4.443,0.000,-100.756,-39.070
SAT,0.0420,0.009,4.454,0.000,0.024,0.060


### Interpreting the Summary

* Method: MLE - Maximum Likelihood Estimation. The coefficients are chosen to maximize the likelihood of the observed data; the larger the likelihood, the better the model fits the data.
* Log-Likelihood - the logarithm of the likelihood; almost always (but not always) negative. The bigger it is, the better.
* LL-Null - the log-likelihood of a model that has no independent variables (intercept only). Compare it with the model's Log-Likelihood.
* LLR p-value - tests whether our model is statistically different from the LL-Null model, i.e. a useless model. The closer to 0, the better.
* Pseudo R-squ. - McFadden's pseudo R-squared; a good value is typically between 0.2 and 0.4.

## Binary Predictor

### Step 1: Load the data and Map the Categorical Data

In [16]:
# Load the data set that also carries Gender, then encode both categorical
# columns as 0/1 in a new frame (Yes/Female -> 1, No/Male -> 0)
raw_data = pd.read_csv('Binary Predictor.csv')
data = raw_data.assign(
    Admitted=raw_data['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=raw_data['Gender'].map({'Female': 1, 'Male': 0}),
)
data

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0
...,...,...,...
163,1722,1,1
164,1750,1,0
165,1555,0,0
166,1524,0,0


### Step 2: Prepare the Regression (one Variable)

In [17]:
# Dependent variable (target): the 0/1 admission outcome
y = data['Admitted']
# Single binary predictor: Gender (1 = Female, 0 = Male)
x1 = data['Gender']

In [22]:
# Fit a logistic regression with Gender (1 = Female, 0 = Male) as the only predictor
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
# NOTE(review): a cell only displays its last expression, so this summary is not shown;
# np.exp(...) below is the final expression of the cell.
results_log.summary()

# Fitted model: log(odds) = -0.64 + 2.08*Gender
# Subtracting the male case (Gender = 0) from the female case (Gender = 1):
#   log(odds_female / odds_male) = 2.08 * (1 - 0) = 2.08
#   odds_female = exp(2.08) * odds_male, i.e. about 8 * odds_male

# The odds ratio is exp(coefficient); 2.08 is the Gender coefficient entered by hand
np.exp(2.08)

# The odds of admission for a female candidate are about 8 times the odds for a male candidate
# (this is a ratio of odds, not a ratio of probabilities)

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


8.004468914296353

### Step 3: Prepare the Regression (two Variables)

In [24]:
# Same target as before, now with two predictors: SAT and Gender
y = data['Admitted']
x1 = data[['SAT', 'Gender']]

# Add the intercept column, fit the model and display its summary
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()

# Odds ratio for Gender once SAT is held fixed: np.exp(1.9449) is about 6.99

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Fri, 28 May 2021",Pseudo R-squ.:,0.8249
Time:,03:28:55,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


In [25]:
# Given the same SAT score, a female candidate has about 7 times higher odds of being admitted than a male candidate (exp(1.9449) is about 6.99)

### Step 5: Check Accuracy

In [29]:
# Check Accuracy
# Print floats with two decimals so the predicted probabilities are easy to scan
np.set_printoptions(formatter={'float':lambda x: "{0:0.2f}".format(x)})
# Called without arguments, predict() returns the fitted probabilities for the data the model was trained on
results_log.predict()

# To turn probabilities into classes, round them: values below 0.5 become 0, values above 0.5 become 1

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

In [31]:
# Actual 0/1 outcomes, to compare against the predicted probabilities above
np.array(data['Admitted'])
# Accuracy is the share of predictions that match the actual values:
# if 80% of the predicted values coincide with the actual ones, the model has 80% accuracy

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [35]:
# Confusion matrix on the training data: pred_table()[i, j] counts the rows whose
# actual class is i and whose predicted class is j.
# Row/column labels are set at construction time; pred_table() is called only once.
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

# Model is correct in 159 out of 168 cases (69 + 90), i.e. 94.6% accuracy

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


### Step 6: Test the Regression

In [37]:
# Testing the model and assessing its accuracy
test = pd.read_csv('Test dataset.csv')
test

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male
5,1556,Yes,Female
6,1731,Yes,Female
7,1809,Yes,Female
8,1930,Yes,Female
9,1708,Yes,Male


In [38]:
# Encode the test set with the same mapping as the training data (Yes/Female -> 1, No/Male -> 0).
# NOTE: this overwrites `test` in place, so the cell is not idempotent: running it a second time
# maps the already-encoded 0/1 values to NaN. Re-run the cell that loads `test` first.
test['Admitted'] = test['Admitted'].map({'Yes':1, 'No': 0})
test['Gender'] = test['Gender'].map({'Female':1, 'Male': 0})
test

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0
5,1556,1,1
6,1731,1,1
7,1809,1,1
8,1930,1,1
9,1708,1,0


In [50]:
# Actual outcomes of the test set, kept aside for the comparison with the predictions
test_actual = test['Admitted']
# Build the test design matrix exactly like the training one:
#  - has_constant='add' always adds the intercept column; the default ('skip') silently
#    leaves it out when some test column happens to be constant, which would misalign
#    the coefficients;
#  - selecting x.columns puts the columns in the same order the model was fitted with.
test_data = sm.add_constant(test.drop(columns=['Admitted']), has_constant='add')
test_data = test_data[x.columns]
test_data

Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0
5,1.0,1556,1
6,1.0,1731,1
7,1.0,1809,1
8,1.0,1930,1
9,1.0,1708,0


In [57]:
# Create confusion matrix
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Inputs passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed outcomes, encoded as 0 and 1.
    model : object
        Any object whose ``predict(data)`` returns one probability in [0, 1]
        per observation.
    threshold : float, default 0.5
        Probabilities below this value count as predicted 0; probabilities at
        or above it count as predicted 1.

    Returns
    -------
    cm : numpy.ndarray
        2x2 array of counts where ``cm[i, j]`` is the number of observations
        with actual class ``i`` and predicted class ``j``.
    accuracy : float
        Share of correct predictions, ``(cm[0, 0] + cm[1, 1]) / cm.sum()``.
    """
    pred_values = model.predict(data)
    # Actual labels are 0/1, so splitting that axis at 0.5 separates the two classes.
    actual_edges = np.array([0, 0.5, 1])
    # Predicted probabilities are split at the decision threshold.
    pred_edges = np.array([0, threshold, 1])
    # The 2D histogram counts (actual, predicted) pairs per cell; bins are half-open
    # except the last one, so a value of exactly 1 still lands in the upper bin.
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_edges, pred_edges])[0]
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [58]:
# Evaluate the fitted model on the test set.
# Note that `cm` is the (matrix, accuracy) tuple returned by confusion_matrix, not just the matrix.
cm = confusion_matrix(test_data, test_actual, results_log)
cm

(array([[5.00, 1.00],
        [1.00, 12.00]]),
 0.8947368421052632)

In [59]:
# Wrap the test confusion matrix (first element of the cm tuple) in a labelled frame for display
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0


In [62]:
# Misclassification rate = wrong predictions (off-diagonal cells) / all predictions.
# cm is the (matrix, accuracy) tuple from confusion_matrix, so cm[0] is the 2x2 matrix;
# computing from it keeps the figure correct if the model or the test set changes.
print('Misclassification rate: ' + str((cm[0][0, 1] + cm[0][1, 0]) / cm[0].sum()))

Misclassification rate: 0.10526315789473684
