# DAT 203 - Lab Assignment #4

- Author: Colin Bowers
- Date: Jun 3, 2023

## Instructions
1. Upload Titanic dataset 
1. Define Survived column as TARGET variable 
1. Select features that can be predictive of the survival status
1. Drop features that you think are not predictive and explain why they are being dropped 
1. Transform selected categorical features with Dummy values 
1. Import logistic regression function, train and test function from sklearn library 
1. Apply logistic regression on the split train/test dataset
1. Compute your model’s accuracy using accuracy_score

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk 

**1 - Upload Titanic dataset**

In [2]:
df = pd.read_csv("data/Titanic_original.csv")
df.describe(include="all").round(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.38,2.31,,,29.7,0.52,0.38,,32.2,,
std,257.35,0.49,0.84,,,14.53,1.1,0.81,,49.69,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.12,0.0,0.0,,7.91,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.45,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


**2 - Define Survived column as TARGET variable**

In [3]:
# X contains independent values, y contains dependent value
target = 'Survived'

**3 - Select features that can be predictive of the survival status**

In [4]:
predictors = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked']
#predictors = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

**4 - Drop features that you think are not predictive and explain why they are being dropped**

In [5]:
# To reduce column count, let's use PassengerId as the index
df.set_index("PassengerId", inplace=True)

# Logistic Regression requires numeric variables so we should drop these
df.drop('Name', axis=1, inplace=True)
df.drop('Ticket', axis=1, inplace=True)
df.drop('Cabin', axis=1, inplace=True)

# OTHER POSSIBLE FEATURES 
# Deck level could be extracted from Cabin which could be a significant predictor
# Marital status could be extracted from Name (i.e. Ms. vs Mrs.)
# Other titles and social status could be extracted from Name (e.g. Dr.)


In [6]:
# Fill missing values for Age using the mode
mode = df["Age"].mode()[0]
df.fillna({'Age': mode}, inplace=True)

In [7]:
df.describe().round(1)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.4,2.3,28.6,0.5,0.4,32.2
std,0.5,0.8,13.2,1.1,0.8,49.7
min,0.0,1.0,0.4,0.0,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0,7.9
50%,0.0,3.0,24.0,0.0,0.0,14.5
75%,1.0,3.0,35.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3


**5 - Transform selected categorical features with Dummy values**

In [8]:
dummies = ['Embarked', 'Sex']
for d in dummies:
    df[d] = df[d].astype('category')

dummy_data = pd.get_dummies(df[dummies])
df2 = pd.concat([df, dummy_data], axis=1)
df2.drop(dummies, axis=1, inplace=True)

df2.head()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,22.0,1,0,7.25,0,0,1,0,1
2,1,1,38.0,1,0,71.2833,1,0,0,1,0
3,1,3,26.0,0,0,7.925,0,0,1,1,0
4,1,1,35.0,1,0,53.1,0,0,1,1,0
5,0,3,35.0,0,0,8.05,0,0,1,0,1


In [9]:
predictors.remove("Embarked")
predictors.remove("Sex")
predictors += ["Embarked_C", "Embarked_Q", "Embarked_S", "Sex_female", "Sex_male" ]
predictors

['Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S',
 'Sex_female',
 'Sex_male']

**6 - Import logistic regression function, train and test function from sklearn library**

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [11]:
train, test = train_test_split(df2, test_size=0.2)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 348 to 445
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    712 non-null    int64  
 1   Pclass      712 non-null    int64  
 2   Age         712 non-null    float64
 3   SibSp       712 non-null    int64  
 4   Parch       712 non-null    int64  
 5   Fare        712 non-null    float64
 6   Embarked_C  712 non-null    uint8  
 7   Embarked_Q  712 non-null    uint8  
 8   Embarked_S  712 non-null    uint8  
 9   Sex_female  712 non-null    uint8  
 10  Sex_male    712 non-null    uint8  
dtypes: float64(2), int64(4), uint8(5)
memory usage: 42.4 KB


**7 - Apply logistic regression on the split train/test dataset**

In [12]:
X_train = train[predictors]
y_train = train[target]

# Initially, this was generating the error: "ConvergenceWarning: lbfgs failed to converge"
# Therefore, increased max_iter
model = LogisticRegression(solver='lbfgs', max_iter=500)
model.fit(X_train, y_train)

model.coef_

array([[-1.04811343, -0.0459236 , -0.38480766, -0.10019839,  0.00409089,
         0.16084438, -0.00799877, -0.14987766,  1.38567564, -1.38270769]])

In [13]:
X_test = test[predictors]
y_test = test[target]

y_pred = model.predict(X_test)
y_pred


array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1])

**8 - Compute your model’s accuracy using accuracy_score**

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
confusion_matrix(y_pred, y_test)

array([[88, 25],
       [17, 49]])

Total wrong predictions: 28 + 17 = 45

In [16]:
acc = accuracy_score(y_pred, y_test)
print(f"Accuracy: {acc*100:.2f}%")

Accuracy: 76.54%
