# Logistic Regression on Titanic Dataset

__Predict the survival of passengers travelling on the RMS Titanic using Logistic Regression.__

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Load the Titanic training data; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

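**Data Dictionary**

| Variable | Definition | Key |
|---|---|---|
| survival | Survival | 0 = No, 1 = Yes |
| pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex | |
| Age | Age in years | |
| sibsp | # of siblings / spouses aboard the Titanic | |
| parch | # of parents / children aboard the Titanic | |
| ticket | Ticket number | |
| fare | Passenger fare | |
| cabin | Cabin number | |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |

Note: the column names in `train.csv` are capitalised (e.g. `Survived`, `Pclass`, `SibSp`), as shown in the preview below.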

In [17]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
df.shape

(891, 12)

In [19]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [20]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [21]:
df['Embarked'].mode()

0    S
dtype: object

In [22]:
df['Embarked'].mode()[0]

'S'

In [23]:
# Fill the missing ports of embarkation with the most frequent port.
most_common_port = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_port)

In [24]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [25]:
# Impute missing ages with the median age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the in-place form operates on an
# intermediate object and does not reliably update df under pandas
# Copy-on-Write. This also matches how 'Embarked' is imputed above.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [26]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [27]:
# Cabin is missing for 687 of the 891 rows (see the null counts above), so the
# column is removed rather than imputed.
df = df.drop(columns='Cabin')

In [28]:
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [29]:
df['SibSp'].value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [30]:
df['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [31]:
# Derived feature (feature engineering): total party size on board, i.e. the
# passenger themself (1) plus parents/children plus siblings/spouses.
df['FamilySize'] = 1 + df['Parch'] + df['SibSp']

In [32]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,1


In [33]:
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [34]:
# Three-way passenger category: anyone younger than 15 is labelled 'child',
# everyone else keeps their 'Sex' value ('male' / 'female').
is_child = df['Age'] < 15
df['GenderClass'] = df['Sex'].mask(is_child, 'child')

In [40]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,GenderClass
147,148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9.0,2,2,W./C. 6608,34.375,S,5,child
666,667,0,2,"Butler, Mr. Reginald Fenton",male,25.0,0,0,234686,13.0,S,1,male
646,647,0,3,"Cor, Mr. Liudevit",male,19.0,0,0,349231,7.8958,S,1,male
434,435,0,1,"Silvey, Mr. William Baird",male,50.0,1,0,13507,55.9,S,2,male
723,724,0,2,"Hodges, Mr. Henry Price",male,50.0,0,0,250643,13.0,S,1,male


In [41]:
df['GenderClass'].value_counts()

male      538
female    275
child      78
Name: GenderClass, dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
 11  FamilySize   891 non-null    int64  
 12  GenderClass  891 non-null    object 
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


In [None]:
# One-hot encode the two categorical features. drop_first=True removes one
# level per feature (the first in sorted order) so the remaining dummies are
# not perfectly collinear.
# dtype=int pins the indicator columns to 0/1 integers, as shown in the preview
# below; without it the dtype depends on the installed pandas version (newer
# releases emit booleans).
df = pd.get_dummies(df, columns=['GenderClass','Embarked'], drop_first=True, dtype=int)

In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,FamilySize,GenderClass_female,GenderClass_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,2,0,1,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,2,1,0,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,1,1,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,2,1,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,1,0,1,0,1


In [None]:
# Remove columns that are not fed to the model: the free-text Name and Ticket,
# plus Sex, SibSp and Parch, which were used above to derive GenderClass and
# FamilySize.
unused_columns = ['Name', 'Ticket', 'Sex', 'SibSp', 'Parch']
df = df.drop(columns=unused_columns)

In [None]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,FamilySize,GenderClass_female,GenderClass_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,7.25,2,0,1,0,1
1,2,1,1,38.0,71.2833,2,1,0,0,0
2,3,1,3,26.0,7.925,1,1,0,0,1
3,4,1,1,35.0,53.1,2,1,0,0,1
4,5,0,3,35.0,8.05,1,0,1,0,1


In [None]:
# Pairplot

In [None]:
# Correlation Matrix and Plot a Heatmap and write your insights

In [None]:
# Feature matrix: every column of df except the target 'Survived'.
# NOTE(review): this keeps PassengerId as a model input. It looks like a plain
# row identifier; confirm whether it should be excluded.
feature_cols = [col for col in df.columns if col != 'Survived']
X = df[feature_cols]
X.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,FamilySize,GenderClass_female,GenderClass_male,Embarked_Q,Embarked_S
0,1,3,22.0,7.25,2,0,1,0,1
1,2,1,38.0,71.2833,2,1,0,0,0
2,3,3,26.0,7.925,1,1,0,0,1
3,4,1,35.0,53.1,2,1,0,0,1
4,5,3,35.0,8.05,1,0,1,0,1


In [None]:
y = df['Survived']

In [None]:
X.shape

(891, 9)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set (test_size=0.2); random_state=1 is
# passed so the shuffle, and therefore the split, is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
X_train.shape

(712, 9)

In [None]:
X_test.shape

(179, 9)

In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
# The estimator is built with library defaults only. The repr recorded below
# this cell lists the values that were in effect for this run (for example
# solver='liblinear', penalty='l2', C=1.0).
# NOTE(review): defaults can differ between scikit-learn releases; pin them
# explicitly if these results need to be reproduced.
lr = LogisticRegression()
lr.fit(X_train, y_train) # study

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
# Model Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
accuracy_score(y_test, y_pred)

0.7988826815642458

In [None]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.79      0.90      0.84       106
          1       0.81      0.66      0.73        73

avg / total       0.80      0.80      0.79       179



In [None]:
confusion_matrix(y_test, y_pred)

array([[95, 11],
       [25, 48]], dtype=int64)

In [None]:
cm = pd.DataFrame(confusion_matrix(y_test, y_pred))

In [None]:
cm

Unnamed: 0,0,1
0,95,11
1,25,48


In [None]:
# Give the confusion matrix readable labels: rows are the true class,
# columns the predicted class (0 = died, 1 = survived).
cm = cm.rename(
    index={0: 'Actual Died', 1: 'Actual Survived'},
    columns={0: 'Predicted Died', 1: 'Predicted Survived'},
)

In [None]:
cm

Unnamed: 0,Predicted Died,Predicted Survived
Actual Died,95,11
Actual Survived,25,48


In [None]:
lr.predict(X_test)

array([1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0], dtype=int64)

In [None]:
lr.predict_proba(X_test)

array([[0.073178  , 0.926822  ],
       [0.92857639, 0.07142361],
       [0.18493712, 0.81506288],
       [0.26225669, 0.73774331],
       [0.28946919, 0.71053081],
       [0.89869459, 0.10130541],
       [0.91095312, 0.08904688],
       [0.12371292, 0.87628708],
       [0.65370722, 0.34629278],
       [0.38171457, 0.61828543],
       [0.89271503, 0.10728497],
       [0.40056951, 0.59943049],
       [0.75754233, 0.24245767],
       [0.85888766, 0.14111234],
       [0.30742602, 0.69257398],
       [0.63981521, 0.36018479],
       [0.65424037, 0.34575963],
       [0.91519934, 0.08480066],
       [0.93720498, 0.06279502],
       [0.42427739, 0.57572261],
       [0.92480729, 0.07519271],
       [0.93262391, 0.06737609],
       [0.04182886, 0.95817114],
       [0.75982981, 0.24017019],
       [0.47714872, 0.52285128],
       [0.89330957, 0.10669043],
       [0.04773635, 0.95226365],
       [0.15585682, 0.84414318],
       [0.7961791 , 0.2038209 ],
       [0.03477973, 0.96522027],
       [0.

In [None]:
lr.predict_proba(X_test)[:,1]

array([0.926822  , 0.07142361, 0.81506288, 0.73774331, 0.71053081,
       0.10130541, 0.08904688, 0.87628708, 0.34629278, 0.61828543,
       0.10728497, 0.59943049, 0.24245767, 0.14111234, 0.69257398,
       0.36018479, 0.34575963, 0.08480066, 0.06279502, 0.57572261,
       0.07519271, 0.06737609, 0.95817114, 0.24017019, 0.52285128,
       0.10669043, 0.95226365, 0.84414318, 0.2038209 , 0.96522027,
       0.74889562, 0.34267391, 0.2911581 , 0.65896132, 0.08701538,
       0.22469732, 0.34712876, 0.06810233, 0.38545741, 0.10372137,
       0.63182906, 0.46545771, 0.89335357, 0.29199234, 0.81675074,
       0.07113787, 0.16046281, 0.19575896, 0.86110566, 0.25669196,
       0.19439179, 0.80320936, 0.17768777, 0.14739118, 0.06458911,
       0.57226307, 0.15819297, 0.29257853, 0.01204107, 0.68475748,
       0.12268309, 0.13488657, 0.19654726, 0.27121249, 0.37414759,
       0.57070153, 0.08156073, 0.09599242, 0.51485391, 0.09115716,
       0.06045291, 0.32308222, 0.47545958, 0.09971582, 0.80211

In [None]:
lr.predict_proba(X_test)[:,1] > 0.75

array([ True, False,  True, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False,  True,
        True, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False,  True,
       False, False, False,  True, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False,  True, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False, False, False,  True,  True,
        True, False,

In [None]:
# Stricter decision rule: predict "survived" (1) only when the predicted
# probability of class 1 exceeds 0.75, otherwise predict 0.
survival_proba = lr.predict_proba(X_test)[:, 1]
preds1 = (survival_proba > 0.75).astype(int)

In [None]:
accuracy_score(y_test,preds1)

0.7374301675977654

In [None]:
# Looser decision rule: predict "survived" (1) as soon as the predicted
# probability of class 1 exceeds 0.25, otherwise predict 0.
survival_proba = lr.predict_proba(X_test)[:, 1]
preds2 = (survival_proba > 0.25).astype(int)

In [None]:
accuracy_score(y_test,preds2)

0.7486033519553073

# Multiclass Logistic Regression

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
iris_sns = sns.load_dataset('iris')
iris_sns.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
iris_sns.shape

(150, 5)

In [None]:
iris_sns['species'].value_counts()

versicolor    50
setosa        50
virginica     50
Name: species, dtype: int64

In [None]:
from sklearn import datasets

In [None]:
iris = datasets.load_iris()

In [None]:
X = iris.data

In [None]:
y = iris.target

In [None]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [None]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
print(iris.DESCR)

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [None]:
# Standardise the iris features: learn the scaling parameters from X, then
# apply them to the same data.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

In [None]:
# Create One vs All Logistic Regression

# multi_class='ovr' requests the one-vs-rest scheme named in the heading above;
# random_state=0 fixes the estimator's seed.
# NOTE(review): confirm the installed scikit-learn version still accepts the
# multi_class argument.
clf = LogisticRegression(random_state=0, multi_class='ovr')

In [None]:
model = clf.fit(X_std, y)

In [None]:
# Unseen data
# One sample with four feature values. The model was fitted on X_std (the
# standardised features), so these numbers are presumably already in
# standardised units rather than raw centimetres.
# NOTE(review): confirm, or pass raw measurements through scaler.transform first.
unseen = [[0.5,0.5,0.5,0.5]]

In [None]:
model.predict(unseen) # Virginica

array([2])

In [None]:
model.predict_proba(unseen)

array([[0.0387617 , 0.40669108, 0.55454723]])

In [None]:
from sklearn.model_selection import GridSearchCV,cross_val_score

# Hyper-parameter grid: regularisation type and inverse regularisation strength C.
parameters = {'penalty': ['l1','l2'], 'C': [0.001,0.01,0.1,1,10,100,1000]}

# The grid contains penalty='l1', so the solver is pinned to 'liblinear', which
# handles both 'l1' and 'l2'. Leaving the solver at the library default makes
# the l1 candidates fail on scikit-learn releases whose default solver does not
# support l1. 'liblinear' is also the solver shown in the LogisticRegression
# repr recorded earlier in this notebook.
# NOTE(review): recent scikit-learn releases may warn about liblinear on
# multiclass targets; check against the version in use.
clf = GridSearchCV(LogisticRegression(solver='liblinear'), parameters,cv=5, verbose = 2)
clf.fit(X_std, y)

# Best (penalty, C) combination found by 5-fold cross-validation.
clf.best_params_

Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] .............................. C=0.001, penalty=l1, total=   0.0s
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l2, total=   0.0s
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l2, total=   0.0s
[CV] C=0.001, penalty=l2 .............................................
[CV] .............................. C=0.001, penalty=l2, total=   0.0s
[CV] C=0.01, penalty=l1 ..............................................
[CV] ...........

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:    0.0s finished


{'C': 10, 'penalty': 'l1'}

In [None]:
!pip install eli5

Collecting eli5
  Using cached eli5-0.11.0-py2.py3-none-any.whl (106 kB)
Installing collected packages: eli5
Successfully installed eli5-0.11.0


ERROR: Error checking for conflicts.
Traceback (most recent call last):
  File "c:\users\akash\appdata\local\programs\python\python36\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 3021, in _dep_map
    return self.__dep_map
  File "c:\users\akash\appdata\local\programs\python\python36\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\users\akash\appdata\local\programs\python\python36\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 3012, in _parsed_pkg_info
    return self._pkg_info
  File "c:\users\akash\appdata\local\programs\python\python36\lib\site-packages\pip\_vendor\pkg_resources\__init__.py", line 2815, in __getattr__
    raise AttributeError(attr)
AttributeError: _pkg_info

During handling of the above exception, another e