Consider the dataset which contains the following:
* Survived - 1 if the person survived, 0 if the person died
* Name of the person
* Sex/Gender
* Age
* Marital Status

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from statsmodels.api import Logit, add_constant

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw CSV into a DataFrame and preview the first rows
# NOTE(review): this is an absolute, machine-specific path - adjust it to your local copy of LR4.csv
csv_path = 'C:/Users/Karthik.Iyer/Downloads/AccelerateAI/Classification-Models-main/data/LR4.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Survived,Name,Sex,Age,Marital Status
0,1,Patrick Breen,M,51.0,M
1,1,Margaret Breen,F,40.0,M
2,1,John Breen,M,14.0,S
3,1,Patrick Breen Jr.,M,9.0,S
4,1,Simon Preston Breen,M,8.0,S


In [3]:
# Count the missing entries in each column
df.isna().sum()

Survived          0
Name              0
Sex               0
Age               3
Marital Status    0
dtype: int64

The `Age` column has 3 missing values; every other column is complete.

In [4]:
# Show the rows whose Age is missing
df.loc[df['Age'].isna()]

Unnamed: 0,Survived,Name,Sex,Age,Marital Status
61,0,Jacob Wolfinger,M,,M
80,0,Luis,M,,S
81,0,Salvador,M,,S


In [5]:
# Discard every row that contains a missing value, then re-check the per-column null counts
df = df.dropna(axis='index')
df.isna().sum()

Survived          0
Name              0
Sex               0
Age               0
Marital Status    0
dtype: int64

In [6]:
# Age is the only numerical variable, so check its correlation with the target y
X = df[['Age']]
y = df['Survived']

X.corrwith(y)

Age   -0.265835
dtype: float64

**Age shows only a weak negative correlation (about -0.27) with `Survived`.**

In [7]:
# Count how many rows fall into each Marital Status category
df['Marital Status'].value_counts()

S    60
M    24
W     2
Name: Marital Status, dtype: int64

S indicates Single; M indicates Married; W indicates Widowed persons

In [8]:
# One-hot encode the two categorical columns.
# drop_first=True omits the first level of each column, leaving Sex_M, Mar_St_S and Mar_St_W.
# dtype=int keeps the indicators as numeric 0/1: pandas >= 2.0 returns boolean
# columns by default, which would display as True/False.
sex_dummy = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True, dtype=int)
marst_dummy = pd.get_dummies(df['Marital Status'], prefix='Mar_St', drop_first=True, dtype=int)

# Put the indicator columns in front of the original columns
df = pd.concat([sex_dummy, marst_dummy, df], axis=1)
df.head()

Unnamed: 0,Sex_M,Mar_St_S,Mar_St_W,Survived,Name,Sex,Age,Marital Status
0,1,0,0,1,Patrick Breen,M,51.0,M
1,0,0,0,1,Margaret Breen,F,40.0,M
2,1,1,0,1,John Breen,M,14.0,S
3,1,1,0,1,Patrick Breen Jr.,M,9.0,S
4,1,1,0,1,Simon Preston Breen,M,8.0,S


In [9]:
# Remove the free-text Name column and the categorical columns that now have dummy encodings
df = df.drop(columns=['Name', 'Sex', 'Marital Status'])
df.head()

Unnamed: 0,Sex_M,Mar_St_S,Mar_St_W,Survived,Age
0,1,0,0,1,51.0
1,0,0,0,1,40.0
2,1,1,0,1,14.0
3,1,1,0,1,9.0
4,1,1,0,1,8.0


In [10]:
# Fit a logistic regression of Survived on Age alone
y = df['Survived']
# add_constant adds the intercept column ("const") alongside Age
X = sm.add_constant(df['Age'])

model = sm.Logit(endog=y, exog=X).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.650221
         Iterations 5


0,1,2,3
Dep. Variable:,Survived,No. Observations:,86.0
Model:,Logit,Df Residuals:,84.0
Method:,MLE,Df Model:,1.0
Date:,"Wed, 08 Jun 2022",Pseudo R-squ.:,0.05267
Time:,19:02:32,Log-Likelihood:,-55.919
converged:,True,LL-Null:,-59.028
Covariance Type:,nonrobust,LLR p-value:,0.01265

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.9674,0.381,2.536,0.011,0.220,1.715
Age,-0.0354,0.015,-2.367,0.018,-0.065,-0.006


In [11]:
# Show the fitted coefficients (intercept "const" and the Age slope) at full precision
model.params

const    0.967397
Age     -0.035385
dtype: float64

1. Consider the regression model equation log(p/(1-p)) = beta0 + beta1 * Age, where p is the probability of survival, and estimate the coefficients beta0 and beta1 using MLE

**MLE Equation:**<br>
log(p/(1-p)) = 0.9674 - 0.0354 * Age, where p = P(Survived = 1)<br>
beta0 = 0.9674<br>
beta1 = -0.0354

2. Calculate the probability of survival for a newborn (i.e. Age = 0)

In [12]:
# Probability of survival for a newborn, computed from the fitted model.
# The coefficients are read from model.params rather than hand-copied from the
# printed summary, so the result stays in sync if the model is refitted.
newborn_age = 0
log_odds = model.params['const'] + model.params['Age'] * newborn_age

# Convert log-odds -> odds -> probability: p = odds / (1 + odds)
odds = np.exp(log_odds)
prob = round(odds/(1+odds),2)
prob

0.72