Build your Logistic Regression model

Import the libraries

In [21]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

Importing the dataset

In [22]:
# Read the three CSV files in one pass: training data, test features,
# and the file that is used as the test labels further down.
train_dataset, test_dataset, test_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        "titanic/train.csv",
        "titanic/test.csv",
        "titanic/gender_submission.csv",
    )
)

In [23]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() added a stray "None" line.
train_dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [24]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() added a stray "None" line.
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None


In [25]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric training columns
train_summary = train_dataset.describe()
print(train_summary)


       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [26]:
# Count the missing values per column for each dataset, then report them
train_null_counts = train_dataset.isnull().sum()
test_null_counts = test_dataset.isnull().sum()

print(f"the null values in the train dataset are \n{train_null_counts}\n")
print(f"the null values in the test dataset are \n{test_null_counts}")

the null values in the train dataset are 
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

the null values in the test dataset are 
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


Feature Engineering

Drop the PassengerId, Name, Ticket, Fare and Cabin columns from both datasets

In [27]:
# Columns excluded from the model; `columns=` is the keyword form of axis=1
dropped_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
train_dataset = train_dataset.drop(columns=dropped_columns)
test_dataset = test_dataset.drop(columns=dropped_columns)

Drop the training records whose Embarked value is null

In [28]:
# Remove the training rows (axis="index") whose Embarked value is missing
train_dataset = train_dataset.dropna(axis="index", subset=["Embarked"])

Fill the missing Age values with the rounded mean age of each dataset

In [29]:
# Fill the missing Age values with the rounded mean age of the same dataset.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df['Age']`: that chained in-place call operates on
# an intermediate Series, raises a warning on recent pandas versions, and
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
train_dataset['Age'] = train_dataset['Age'].fillna(round(np.mean(train_dataset['Age'])))
test_dataset['Age'] = test_dataset['Age'].fillna(round(np.mean(test_dataset['Age'])))

Encode the Sex column as a numerical column (male: 0, female: 1)

In [30]:
# One shared lookup table keeps the encoding identical for both datasets
sex_codes = {'male': 0, 'female': 1}

train_dataset['Sex'] = train_dataset['Sex'].map(sex_codes)
test_dataset['Sex'] = test_dataset['Sex'].map(sex_codes)

Encode the Embarked column as a numerical column (C: 0, Q: 1, S: 2)

In [31]:
# One shared lookup table keeps the encoding identical for both datasets
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

train_dataset['Embarked'] = train_dataset['Embarked'].map(embarked_codes)
test_dataset['Embarked'] = test_dataset['Embarked'].map(embarked_codes)

Use the Survived column as the target value

In [32]:
# Split the Survived target off the training features
train_labels = train_dataset.loc[:, "Survived"]
train_dataset = train_dataset.drop(columns="Survived")
# Keep only the Survived column of the frame read from gender_submission.csv.
# NOTE(review): presumably that file is a sample submission rather than the
# true test outcomes - confirm before reading the test score as real accuracy.
test_labels = test_labels.loc[:, "Survived"]

In [33]:
# .info() prints its own report and returns None, so each call is made
# directly; wrapping it in print() added a stray "None" after every report.
test_dataset.info()
test_labels.info()
train_dataset.info()
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    int64  
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Embarked  418 non-null    int64  
dtypes: float64(1), int64(5)
memory usage: 19.7 KB
None
<class 'pandas.core.series.Series'>
RangeIndex: 418 entries, 0 to 417
Series name: Survived
Non-Null Count  Dtype
--------------  -----
418 non-null    int64
dtypes: int64(1)
memory usage: 3.4 KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    889 non-null    int64  
 1   Sex       889 non-null    int64  
 2   Age       889 non-null    float64
 3   SibSp     889 non-null 

In [34]:
# Swap the pandas objects for their underlying NumPy arrays
train_dataset = train_dataset.to_numpy()
train_labels = train_labels.to_numpy()
test_dataset = test_dataset.to_numpy()
test_labels = test_labels.to_numpy()

Standardize the data (fit the scaler on the training set)

In [35]:
# Fit the scaler on the training features only; the fitted object is then
# reused to transform both the training and the test features.
# (`pickle` is imported in the imports cell at the top of the notebook.)
scaler = StandardScaler()
scaler.fit(train_dataset)
# save the scaler object to a file
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [36]:
# Transform both feature arrays with the already-fitted scaler.
# Re-running this cell would scale the already-scaled arrays a second time.
train_dataset, test_dataset = (
    scaler.transform(features) for features in (train_dataset, test_dataset)
)

Train the model

In [37]:
# Logistic regression with scikit-learn's default settings,
# fitted on the scaled training features and their Survived labels
model = LogisticRegression()
model.fit(X=train_dataset, y=train_labels)

Check the score on the test data

In [38]:
# Mean accuracy of the fitted model against test_labels
test_score = model.score(X=test_dataset, y=test_labels)
print(test_score)

0.9473684210526315


In [39]:
# Analyze the coefficients
# Coefficients are the learned weights of the features (which were scaled
# before fitting, so their magnitudes can be compared with each other).
# The sign gives the direction of a feature's effect on the predicted
# survival log-odds; the absolute value - not the raw value - shows how
# strongly the feature weighs in (e.g. -0.9 matters more than +0.3).
# Names follow the feature column order: Pclass, Sex, Age, SibSp, Parch, Embarked
feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']
print(list(zip(feature_names, model.coef_[0])))

[('Pclass', -0.935686790694793), ('Sex', 1.290373858337269), ('Age', -0.5064494897932916), ('Sibsp', -0.33928281388782966), ('Parch', -0.061948703694327284), ('Embarked', -0.19287003173801343)]


In [40]:
# save the model using pickle
# (`pickle` is imported in the imports cell at the top of the notebook,
# so it is not re-imported here.)
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

Test out the pickled_model

In [41]:
# Reload the model inside a `with` block so the file handle is closed
# deterministically; the bare open() call left it open.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook (or another trusted source) produced.
with open('model.pkl', 'rb') as file:
    pickled_model = pickle.load(file)

# The reloaded model should reproduce the score of the in-memory model
test_score = pickled_model.score(test_dataset, test_labels)
print(test_score)


0.9473684210526315
