In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the training set, discard rows that have no embarkation port, and keep
# only the target plus the four features used by the model.
df = (
    pd.read_csv("data/train.csv")
    .dropna(subset=['Embarked'])
    .loc[:, ['Survived', 'Pclass', 'Sex', 'Age', 'Embarked']]
)

In [3]:
# Preview the first rows of the reduced training frame.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
0,0,3,male,22.0,S
1,1,1,female,38.0,C
2,1,3,female,26.0,S
3,1,1,female,35.0,S
4,0,3,male,35.0,S


In [4]:
# List the rows whose Age is missing (these still need to be imputed).
df.loc[df.Age.isna(),:]

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked
5,0,3,male,,Q
17,1,2,male,,S
19,1,3,female,,C
26,0,3,male,,C
28,1,3,female,,Q
...,...,...,...,...,...
859,0,3,male,,C
863,0,3,female,,S
868,0,3,male,,S
878,0,3,male,,S


In [5]:
# Impute missing ages with the mean age of the loaded training frame.
# The mean is kept in `train_age_mean` so the same value can be reused later.
train_age_mean = df['Age'].mean()
df = df.assign(Age=df['Age'].fillna(train_age_mean))

In [6]:
# Spot-check the row at position 5 to see its Age value after imputation.
df.iloc[5,:]

Survived            0
Pclass              3
Sex              male
Age         29.642093
Embarked            Q
Name: 5, dtype: object

In [7]:
# Sanity check: this selection should be empty once every Age has been filled.
df.loc[df.Age.isna(),:]

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked


In [8]:
# Separate the feature columns from the target column.
# `ydf` stays a single-column DataFrame (not a Series).
Xdf = df.drop(columns='Survived')
ydf = df[['Survived']]

In [9]:
# Preview the feature frame.
Xdf.head()

Unnamed: 0,Pclass,Sex,Age,Embarked
0,3,male,22.0,S
1,1,female,38.0,C
2,3,female,26.0,S
3,1,female,35.0,S
4,3,male,35.0,S


In [10]:
# Preview the target frame.
ydf.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


## Dummy variable generation

In [11]:
# One-hot encode the categorical columns (Sex and Embarked).
# Mapping categories to plain integers would make the model misinterpret them
# as ordered numeric values, so each category gets its own indicator column.
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

column_trans = make_column_transformer(
    (OneHotEncoder(), ['Sex', 'Embarked']),
    remainder='passthrough',  # keep the remaining columns unchanged
)
column_trans.fit(Xdf)
# The encoded result is an array, not a DataFrame.
X = column_trans.transform(Xdf)

In [12]:
# Target labels as a flat NumPy array.
y = ydf['Survived'].to_numpy()

In [13]:
# Inspect the encoded feature matrix.
print(X)

[[ 0.         1.         0.        ...  1.         3.        22.       ]
 [ 1.         0.         1.        ...  0.         1.        38.       ]
 [ 1.         0.         0.        ...  1.         3.        26.       ]
 ...
 [ 1.         0.         0.        ...  1.         3.        29.6420927]
 [ 0.         1.         1.        ...  0.         1.        26.       ]
 [ 0.         1.         0.        ...  0.         3.        32.       ]]


In [14]:
# Inspect the target labels.
print(y)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0 0
 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0
 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0
 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1 1
 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1
 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0
 0 1 1 0 1 0 0 1 0 0 0 0 

## Split dataset

In [15]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

## Scale data

In [17]:
# Standardise the Age feature (column index 6 of the encoded matrix).
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train[:, 6:7])  # statistics are learned from the training split only
X_train[:, 6:7] = sc.transform(X_train[:, 6:7])
X_test[:, 6:7] = sc.transform(X_test[:, 6:7])  # reuse the scaler fitted on X_train

## Logistic regression

In [18]:
# Fit a logistic-regression classifier on the training split.
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier

LogisticRegression(random_state=0)

In [19]:
# Predict labels for the held-out split.
y_pred = classifier.predict(X_test)

# Show predictions next to the true labels: column 0 = predicted, column 1 = actual.
print(np.column_stack((y_pred, y_test)))

[[1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 1]
 [0 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]

In [20]:
# Evaluate on the held-out split: confusion matrix, then overall accuracy.
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[190  28]
 [ 35 103]]


0.8230337078651685

## Apply to test data

In [31]:
# Load the test file and keep the same feature columns the model was trained on.
dfTest = pd.read_csv("data/test.csv")[['Pclass', 'Sex', 'Age', 'Embarked']]
dfTest.head()

Unnamed: 0,Pclass,Sex,Age,Embarked
0,3,male,34.5,Q
1,3,female,47.0,S
2,2,male,62.0,Q
3,3,male,27.0,S
4,3,female,22.0,S


In [32]:
# Fill missing test ages with the mean stored in `train_age_mean`, then
# confirm that no missing ages remain (the selection should be empty).
dfTest = dfTest.assign(Age=dfTest['Age'].fillna(train_age_mean))
dfTest[dfTest['Age'].isna()]

Unnamed: 0,Pclass,Sex,Age,Embarked


In [33]:
# Encode and scale the test features with the transformers that were already
# fitted earlier in the notebook (`column_trans` on the training features and
# `sc` on the training split). Re-fitting either of them here would learn
# categories / mean / standard deviation from the test set, so the resulting
# columns would no longer match what the classifier was trained on.
X_final = column_trans.transform(dfTest)
X_final[:, 6:7] = sc.transform(X_final[:, 6:7])  # Age is column index 6

In [34]:
# Check the dimensions of the transformed test matrix.
print(X_final.shape)

(418, 7)


In [35]:
# Predict labels for the transformed test features with the trained classifier.
predictions = classifier.predict(X_final)

In [37]:
# Number of test rows predicted as class 1.
print(int((predictions == 1).sum()))

155


## Export

In [38]:
# Build the submission frame: PassengerId from the test file plus the predicted label.
output = pd.read_csv("data/test.csv", usecols=["PassengerId"])
output['Survived'] = predictions
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [41]:
# Write the submission file without the DataFrame index column.
output.to_csv('submission.csv', index=False)