## Part 2: Question 1

In [1]:
import csv
import pandas as pd
import numpy as np

### Load test and train data

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:
train_data.shape

(891, 12)

In [4]:
test_data.shape

(418, 11)

In [5]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

### Preprocessing Data
Drop irrelevant columns
- name
- passenger ID
- ticket
- cabin
Data imputation for missing values
One hot encodding
Mapping sex to number 1 for Male 0 for Female

In [6]:
# Drop irrelevant features in train and test data
train_data.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis = 1, inplace = True)
test_data.drop(['Name', 'PassengerId', 'Ticket', 'Cabin'], axis = 1, inplace = True)

In [7]:
train_data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [8]:
# Fill missing values in train set with mean and mode

train_data['Age'].fillna(np.mean(train_data['Age']), inplace = True)
train_data['Embarked'].fillna(value = train_data['Embarked'].mode()[0], inplace = True)

In [9]:
test_data.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [10]:
test_data['Age'].fillna(np.mean(test_data['Age']), inplace = True)
test_data['Fare'].fillna(np.mean(test_data['Fare']), inplace = True)

In [11]:
# Mapping sex values [0,1]
train_data['Sex'].replace({'female': 1, 'male': 0}, inplace=True)
test_data['Sex'].replace({'female': 1, 'male': 0}, inplace=True)

In [12]:
#One hot encode embarked feature
train_embarked_encoded = pd.get_dummies(train_data['Embarked'], prefix='Embarked', dtype=int)
test_embarked_encoded = pd.get_dummies(test_data['Embarked'], prefix='Embarked', dtype=int)

In [13]:
train_data = pd.concat([train_data, train_embarked_encoded], axis=1)

In [14]:
train_data.drop('Embarked', axis=1, inplace=True)
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,22.000000,1,0,7.2500,0,0,1
1,1,1,1,38.000000,1,0,71.2833,1,0,0
2,1,3,1,26.000000,0,0,7.9250,0,0,1
3,1,1,1,35.000000,1,0,53.1000,0,0,1
4,0,3,0,35.000000,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,0,0,1
887,1,1,1,19.000000,0,0,30.0000,0,0,1
888,0,3,1,29.699118,1,2,23.4500,0,0,1
889,1,1,0,26.000000,0,0,30.0000,1,0,0


In [15]:
test_data = pd.concat([test_data, test_embarked_encoded], axis=1)
test_data.drop('Embarked', axis=1, inplace=True)

In [16]:
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,34.50000,0,0,7.8292,0,1,0
1,3,1,47.00000,1,0,7.0000,0,0,1
2,2,0,62.00000,0,0,9.6875,0,1,0
3,3,0,27.00000,0,0,8.6625,0,0,1
4,3,1,22.00000,1,1,12.2875,0,0,1
...,...,...,...,...,...,...,...,...,...
413,3,0,30.27259,0,0,8.0500,0,0,1
414,1,1,39.00000,0,0,108.9000,1,0,0
415,3,0,38.50000,0,0,7.2500,0,0,1
416,3,0,30.27259,0,0,8.0500,0,0,1


## Part 2: Question 2 and 3
### Import LogisticRegression model from Sklearn

In [17]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state = 16)


### Seperate training set into features and target values

In [18]:
y_train = train_data.Survived
X_train = train_data.drop('Survived', axis = 1)

#### https://www.datacamp.com/tutorial/understanding-logistic-regression-python

In [25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
logreg.fit(X_train_scaled, y_train)

LogisticRegression(random_state=16)

#### Note: ChatGPT recommended to change dataframe into numpy array for computation

In [20]:
y_pred = logreg.predict(np.array(test_data))

In [21]:
y_pred_df = pd.DataFrame(y_pred, columns=['Predicted Survivers'])

In [22]:
y_pred_df.

Unnamed: 0,Predicted Survivers
count,418.0
mean,0.038278
std,0.192095
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


### Prediction - approximately 4% of the 418 passengers would survive
#### The model works