Coding a Predict model to predict whether the passanger of the titanic will survive or not

In [None]:
# Installing necessary libraries
!pip install -U scikit-learn

In [63]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [64]:
# Reading train dataset and taking a first look at it
dataset = pd.read_csv(r'train.csv')

dataset.info() #Columns with null values: age, cabin, embarked
# dataset.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Taking a look at the first lines of the dataset
# According to Kaggle, the column Survived is the one that we gonna use to train our predict model
dataset.head()

In [66]:
# Filling the NaN values with median of the own column
# Filling Cabin and Embarked NaN values with a new category: Unknown
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
dataset['Cabin'] = dataset['Cabin'].fillna('Unknown')
dataset['Embarked'] = dataset['Embarked'].fillna('Unknown')

In [67]:
# Since modeling works only with numbers, we will transform non numeric columns into numeric ones, using LabelEncoder.
encoder = LabelEncoder()

dataset['Name'] = encoder.fit_transform(dataset['Name'])
dataset['Sex'] = encoder.fit_transform(dataset['Sex'])
dataset['Ticket'] = encoder.fit_transform(dataset['Ticket'])
dataset['Cabin'] = encoder.fit_transform(dataset['Cabin'])
dataset['Embarked'] = encoder.fit_transform(dataset['Embarked'])

In [68]:
# Removing columns that will not be used for model training, those will be stored as X and y, y beeing the target values
X = dataset.drop(['Survived', 'PassengerId'], axis = 1).values #Colum ID will be removed as well, because ir can interfere with model training
y = dataset['Survived'].values

# Separating dataset into train and test variables. Since it is a small amount of data, we will consider 30% as test size.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

Creating an instance of the predicting model. We will do a comparission of three models: LogisticRegression, RandomForest and KNeighborsClassifier. It will be compared using the Accuracy Score

LogisticRegression:

In [None]:
# Creating instance of LogisticRegression
logr = LogisticRegression()

# Fitting our model using the train variables
logr.fit(X_train, y_train)

# Predicting our data from the test variables
logr_prediction = logr.predict(X_test)

RandomForest:

In [70]:
# Creating instance of RandomForestClassifier
forest = RandomForestClassifier()

# Fitting our model using the train variables
forest.fit(X_train, y_train)

# Predicting our data from the test variables
forest_prediction = forest.predict(X_test)

KNeighborsClassifier:

In [77]:
# Getting the best number of neighbors:
from sklearn.model_selection import cross_val_score

k_values = range(1, 21)  # Testar n_neighbors de 1 a 20

# List to store results
scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Cross validation with 5 divisions
    score = cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean()
    scores.append(score)

# Find the k value with best performance
best_k = k_values[np.argmax(scores)]
print(f"Best value for n_neighbors: {best_k}") # 6

Best value for n_neighbors: 6


In [78]:
# Creating instance of KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=6)

# Fitting our model using the train variables
knn.fit(X_train, y_train)

# Predicting our data from the test variables
knn_prediction = knn.predict(X_test)

What was the best model?

In [79]:
# Printing the Accuracy of each model.
logr_accuracy = accuracy_score(y_test, logr_prediction)
forest_accuracy = accuracy_score(y_test, forest_prediction)
knn_accuracy = accuracy_score(y_test, knn_prediction)

print(f'Accuracy for LogisticRegression: {format(np.round(logr_accuracy * 100, 2))}%')
print(f'Accuracy for RandomForest: {format(np.round(forest_accuracy * 100, 2))}%')
print(f'Accuracy for KNeighbors: {format(np.round(knn_accuracy * 100, 2))}%')

Accuracy for LogisticRegression: 81.34%
Accuracy for RandomForest: 85.07%
Accuracy for KNeighbors: 63.06%


We can see that RandomForest was the best Supervised Model for this dataset.
Now lets use the trained model in the test dataset to see whether each passanger will survive.

In [105]:
# Importing test dataset
testdf = pd.read_csv(r'test.csv')

testdf.info() #Columns with null values: age, fare, cabin, embarked

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [106]:
# Removing the ID Column
testdf.drop(columns=['PassengerId'], inplace=True)

# Filling the NaN values with median of the own column
# Filling Cabin and Embarked NaN values with a new category: Unknown
testdf['Age'] = testdf['Age'].fillna(testdf['Age'].median())
testdf['Fare'] = testdf['Fare'].fillna(testdf['Fare'].median())
testdf['Cabin'] = testdf['Cabin'].fillna('Unknown')
testdf['Embarked'] = testdf['Age'].fillna('Unknown')

In [107]:
# Encoding
testdf['Name'] = encoder.fit_transform(testdf['Name'])
testdf['Sex'] = encoder.fit_transform(testdf['Sex'])
testdf['Ticket'] = encoder.fit_transform(testdf['Ticket'])
testdf['Cabin'] = encoder.fit_transform(testdf['Cabin'])
testdf['Embarked'] = encoder.fit_transform(testdf['Embarked'])

In [None]:
# Predicting with RandomForest previously trained
predict = forest.predict(testdf)

In [111]:
# Add the prediction column into the test dataset
testdf['Prediction'] = predict
testdf

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Prediction
0,3,206,1,34.5,0,0,152,7.8292,76,44,0
1,3,403,0,47.0,1,0,221,7.0000,76,60,1
2,2,269,1,62.0,0,0,73,9.6875,76,74,0
3,3,408,1,27.0,0,0,147,8.6625,76,34,0
4,3,178,0,22.0,1,1,138,12.2875,76,27,1
...,...,...,...,...,...,...,...,...,...,...,...
413,3,353,1,27.0,0,0,267,8.0500,76,34,0
414,1,283,0,39.0,0,0,324,108.9000,22,51,1
415,3,332,1,38.5,0,0,346,7.2500,76,50,0
416,3,384,1,27.0,0,0,220,8.0500,76,34,0
