In [1]:
import math
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn import preprocessing

In [2]:
train_df = pd.read_csv('xy_train.csv')
X_test_df = pd.read_csv('x_test.csv')
Y_test_df = pd.read_csv('y_test.csv')

In [3]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
X_test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

So, training data has 177 missing values for Age, 687 missing values for Cabin, 2 missing values from Embarked and 
test data has 86 missing values for Age, 327 missing values for Cabin and 1 missing value for Fare.
Since a large portion of Cabin values are missing, so it can be ignored.
To handle missing values for Age, replace missing values with average Age computed from TRAINING SET.
To handle missing values for Fare, replace missing values with most frequent occuring Fare in TRAINING SET.
To handle missing values for Embarked,replace missing values with most frequent occuring Embarked in TRAINING SET.
PassengerId, Name, Ticket Number should have no role in deciding whether person survived or not, so it is ignored

In [5]:
most_frequent_fare = train_df['Fare'].mode()[0]
most_frequent_embarked = train_df['Embarked'].mode()[0]
most_frequent_cabin = train_df['Cabin'].mode()[0]
avg_age = train_df['Age'].mean()

In [6]:
X_train = train_df.drop(['PassengerId','Name','Ticket','Survived','Cabin'], axis=1)
Y_train = train_df['Survived']
X_test = X_test_df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
Y_test = Y_test_df['Survived']
X_train.fillna({'Age':avg_age, 'Fare':most_frequent_fare, 'Embarked':most_frequent_embarked}, inplace=True)
X_test.fillna({'Age':avg_age, 'Fare':most_frequent_fare, 'Embarked':most_frequent_embarked}, inplace=True)

Use LabelEncoder to convert string categorical values into corresponding numerical values 

In [7]:
le = preprocessing.LabelEncoder() 
le.fit(X_train['Sex'])
X_train.loc[:,'Sex'] = le.transform(X_train['Sex'])
X_test.loc[:,'Sex'] = le.transform(X_test['Sex'])

In [8]:
le.fit(X_train['Embarked'])
X_train.loc[:,'Embarked'] = le.transform(X_train['Embarked'])
X_test.loc[:,'Embarked'] = le.transform(X_test['Embarked'])

Split test set into test set and validation set for hyperparameter tuning

In [9]:
X_test, X_val, Y_test, Y_val = train_test_split(X_test, Y_test, test_size=0.1, random_state=1)

Use MinMaxScalar for feature scaling

In [10]:
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_val = scaler.fit_transform(X_val)

Use validation set to find best value for hyperparametr C, max_iter

In [11]:
c_params = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
iters = [100, 1000, 10000]
best_c, best_iter = 0, 0
max_acc = 0
for itr in iters:
    for c in c_params:
        clf = LogisticRegression(C=c, random_state=0, max_iter=itr, solver='liblinear').fit(X_train, Y_train)
        Y_pred = clf.predict(X_val)
        acc = np.mean(Y_pred==Y_val)
        print(itr,c,acc)
        if acc > max_acc:
            max_acc = acc
            best_c = c
            best_iter = itr

100 0.0001 0.7380952380952381
100 0.001 0.7380952380952381
100 0.01 0.8095238095238095
100 0.1 0.9761904761904762
100 1 0.9523809523809523
100 10 0.9047619047619048
100 100 0.9047619047619048
1000 0.0001 0.7380952380952381
1000 0.001 0.7380952380952381
1000 0.01 0.8095238095238095
1000 0.1 0.9761904761904762
1000 1 0.9523809523809523
1000 10 0.9047619047619048
1000 100 0.9047619047619048
10000 0.0001 0.7380952380952381
10000 0.001 0.7380952380952381
10000 0.01 0.8095238095238095
10000 0.1 0.9761904761904762
10000 1 0.9523809523809523
10000 10 0.9047619047619048
10000 100 0.9047619047619048


Set hyperparameter values to the values that we got from hyperparameter tuning on validation set

In [12]:
clf = LogisticRegression(C=best_c, random_state=0, max_iter=best_iter, solver='liblinear').fit(X_train, Y_train)

In [13]:
Y_pred = clf.predict(X_test)
acc = np.mean(Y_pred==Y_test)
print(acc)

0.9973404255319149
