In [1]:
import pandas as pd
import numpy as np

In [2]:
# file path
path_train = './Data/train.csv'
path_test = './Data/test.csv'

# laoding CSV
data = pd.read_csv(path_train)
data_test = pd.read_csv(path_test)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [4]:
# changing to numerical format

# backup to keep data files unchanged
X = data.copy() # creating a copy
X_test = data_test.copy()

# data types correction
X.Sex = X.Sex.map({'male': 1, 'female': 0}).astype('float64') # gender formatting to 0/1
X_test.Sex = X_test.Sex.map({'male': 1, 'female': 0}).astype('float64')

# splitting to dependent and independent features
y = X.Survived.values # target vector / dependent feature
X = X.loc[:, ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']].values # independent features
X_test = X_test.loc[:, ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']].values

# taking care of missing values in Embarked and Fare column
mostfrq_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X[:, [-1]] = mostfrq_imputer.fit_transform(X[:, [-1]]) # fitting most frequent label for nulls in Embarked column
X_test[:, [-1]] = mostfrq_imputer.transform(X_test[:, [-1]])
X[:, [-2]] = median_imputer.fit_transform(X[:, [-2]]) # fitting median value for nulls in Fare column
X_test[:, [-2]] = median_imputer.transform(X_test[:, [-2]])

# OneHot encoding for categorical features
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, -1])], remainder='passthrough') #encoding Pclass and Embarked to OneHot format
X = np.array(ct.fit_transform(X)) # applying to train data
X_test = np.array(ct.transform(X_test)) # applying the same to the test data

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

In [29]:
# spliting data for train and validation
test_val_ratio = 0.2
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = test_val_ratio)

# feature scaling and normalizing
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:]) # OneHot columns are unchanged
X_val[:, 3:] = sc.transform(X_val[:, 3:])

In [30]:
# classifier definition
modedl_lr = LogisticRegression() # defining a Logistic Regressor

#fitting on train data
modedl_lr.fit(X_train, y_train)

# predicting the validation data to estimate the out of sample error
pred = modedl_lr.predict(X_val)
print(round(accuracy_score(y_val, pred) * 100, 2), '%')

# confusion matrix 
cm = pd.DataFrame(confusion_matrix(y_val, pred), index = ['Really Died', 'Really Survived'], columns = ['Died', 'Survived'])
print(cm)
print('*****')
print('Survival Rate', y.mean())
print('Prediction Rate', pred.mean())

74.86 %
                 Died  Survived
Really Died        90        18
Really Survived    27        44
*****
Survival Rate 0.3838383838383838
Prediction Rate 0.3463687150837989


In [31]:
#feature scaling and normalizing
sc = StandardScaler()
X[:, 3:] = sc.fit_transform(X[:, 3:]) # OneHot columns are unchanged
X_test[:, 3:] = sc.transform(X_test[:, 3:])

# classifier definition
best_model = LogisticRegression()

#fitting on full data
best_model.fit(X, y)

# predicting the full data to check its result
pred = best_model.predict(X)
print(round(accuracy_score(y, pred) * 100, 2), '%')

# confusion matrix 
cm = pd.DataFrame(confusion_matrix(y, pred), index = ['Really Died', 'Really Survived'], columns = ['Died', 'Survived'])
print(cm)
print('*****')
print('Survival Rate', y.mean())
print('Prediction Rate', pred.mean())

80.02 %
                 Died  Survived
Really Died       480        69
Really Survived   109       233
*****
Survival Rate 0.3838383838383838
Prediction Rate 0.3389450056116723
