In [74]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# machine learning stack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#statsmodels package gives a more detailed output:
import statsmodels.discrete.discrete_model as sm

# evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report  

# miscellaneous
import warnings
warnings.filterwarnings("ignore")

In [32]:
full_data = pd.read_csv('../data/train.csv')

In [33]:
full_data.shape

(891, 12)

In [34]:
full_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
full_data['Sex_category'] = full_data['Sex'].map({'female':0, 'male':1})
full_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


Building a model

In [57]:
# features and target
X = full_data[['Pclass', 'Sex_category']]
y = full_data['Survived']

In [58]:
X.shape, y.shape

((891, 2), (891,))

In [59]:
#stratify is used to keep datasets in same(nearer) proportion
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
    stratify=y
)

In [60]:
y_train.value_counts(normalize=True)

0    0.616573
1    0.383427
Name: Survived, dtype: float64

In [61]:
y_test.value_counts(normalize=True)

0    0.614525
1    0.385475
Name: Survived, dtype: float64

In [62]:
X_train

Unnamed: 0,Pclass,Sex_category
702,3,0
776,3,1
381,3,0
275,1,0
16,3,1
...,...,...
353,3,1
137,1,1
494,3,1
430,1,1


In [63]:
y_train

702    0
776    0
381    1
275    1
16     0
      ..
353    0
137    0
494    0
430    1
589    0
Name: Survived, Length: 712, dtype: int64

In [64]:
# instatiate the model = classifier
model = LogisticRegression()

In [65]:
# train the model
model.fit(X_train, y_train)

LogisticRegression()

In [66]:
# our coefficients = w0,w1
model.coef_, model.intercept_

(array([[-0.93971374, -2.67784038]]), array([3.19391894]))

3. Evaluate the model

In [67]:
# score on the training data
model.score(X_train, y_train)

0.797752808988764

In [68]:
# score on the testing data
model.score(X_test, y_test)

0.7430167597765364

In [75]:
# test accuracy
#ypred = model.predict(X_test) 
#accuracy_score(y_test,ypred)

0.7430167597765364

Logistic Regression with StatsModels

In [73]:
logit = sm.Logit(y, X)
f = logit.fit()
f.params
f.summary()

Optimization terminated successfully.
         Current function value: 0.552770
         Iterations 5


0,1,2,3
Dep. Variable:,Survived,No. Observations:,891.0
Model:,Logit,Df Residuals:,889.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 04 Apr 2023",Pseudo R-squ.:,0.1699
Time:,12:57:08,Log-Likelihood:,-492.52
converged:,True,LL-Null:,-593.33
Covariance Type:,nonrobust,LLR p-value:,9.258e-46

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Pclass,0.1422,0.047,3.057,0.002,0.051,0.233
Sex_category,-1.8011,0.156,-11.554,0.000,-2.107,-1.496
