In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree,svm
from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [3]:
train_data = pd.read_csv("/content/train.csv")
test_data = pd.read_csv("/content/test.csv")

In [4]:
train_data.head()
train_data.info()
train_data.describe()
train_data['Survived'].value_counts()
train_data['Pclass'].value_counts()
train_data['Sex'].value_counts()
train_data['Embarked'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

# **DATA PREPROCESSING**

In [5]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

We can see there are 177 missing entries in Age column. 687 missing entries are in Cabin column and 2 missing are in Embarked.

**HANDLE MISSING VALUES OF AGE COLUMN**

In [6]:
mean = train_data["Age"].mean()
std = train_data["Age"].std()

rand_age = np.random.randint(mean-std, mean+std, size = 177)
age_slice = train_data["Age"].copy()
age_slice[np.isnan(age_slice)] = rand_age
train_data["Age"] = age_slice

train_data["Embarked"].fillna(value="C", inplace=True)
train_data.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

We can see, we don't have missing values

**DROP COLUMN**

In [7]:
col_to_drop = ["PassengerId", "Ticket", "Cabin", "Name"]
train_data.drop(col_to_drop, axis=1, inplace=True)
train_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S
5,0,3,male,21.0,0,0,8.4583,Q
6,0,1,male,54.0,0,0,51.8625,S
7,0,3,male,2.0,3,1,21.075,S
8,1,3,female,27.0,0,2,11.1333,S
9,1,2,female,14.0,1,0,30.0708,C


**CONVERTING CATEGORICAL VARIABLES TO NUMERIC**

In [8]:
genders = {"male":0, "female":1}
train_data["Sex"] = train_data["Sex"].map(genders)

ports = {"S":0, "C":1, "Q":2}
train_data["Embarked"] = train_data["Embarked"].map(ports)

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.925,0
3,1,1,1,35.0,1,0,53.1,0
4,0,3,0,35.0,0,0,8.05,0


# **BUILDING MACHINE LEARNING MODEL**

So, this was all about data preprocessing. Now we are good to go with our titanic dataset. Let’s quickly train our machine learning model.

In [13]:
df_train_x = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

df_train_y = train_data['Survived']

x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.20, random_state=42)

**Lastly, We are going to fit our model on 5 different classification algorithms namely RANDOM FOREST CLASSIFIER, LOGISTIC REGRESSION, K-NEIGHBOR CLASSIFIER, DECISSION TREE CLASSIFIER, and SUPPORT VECTOR MACHINE. And eventually will compare them.**

**RANDOM FOREST**

In [23]:
model1 = RandomForestClassifier()

model1 = model1.fit(x_train, y_train)
rfc_y_pred = model1.predict(x_test)
rfc_accuracy = accuracy_score(y_test,rfc_y_pred) * 100
print("accuracy=",rfc_accuracy)
print(classification_report(y_test, rfc_y_pred))

accuracy= 82.12290502793296
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       105
           1       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



**LOGISTIC REGRESSION**

In [24]:
model2 = LogisticRegression( max_iter=2000 )
model2 = model2.fit(x_train, y_train)
lr_y_pred = model2.predict(x_test)
lr_accuracy = accuracy_score(y_test,lr_y_pred)*100
print("accuracy=",lr_accuracy)
print(classification_report(y_test, lr_y_pred))

accuracy= 80.44692737430168
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



**DECISSION TREE CLASSIFIER**

In [25]:
model4 = tree.DecisionTreeClassifier()
model4 = model4.fit(x_train, y_train)
dtc_y_pred = model4.predict(x_test)
dtc_accuracy = accuracy_score(y_test,dtc_y_pred)*100
print("accuracy=",dtc_accuracy)
print(classification_report(y_test, dtc_y_pred))

accuracy= 78.2122905027933
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       105
           1       0.73      0.76      0.74        74

    accuracy                           0.78       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.78      0.78      0.78       179



**SUPPORT VECTOR MACHINE**

In [26]:
model5 = svm.SVC()
model5 = model5.fit(x_train, y_train)
svm_y_pred = model5.predict(x_test)
svm_accuracy = accuracy_score(y_test,svm_y_pred)*100
print("accuracy=",svm_accuracy)
print(classification_report(y_test, svm_y_pred))

accuracy= 65.36312849162012
              precision    recall  f1-score   support

           0       0.64      0.94      0.76       105
           1       0.75      0.24      0.37        74

    accuracy                           0.65       179
   macro avg       0.69      0.59      0.56       179
weighted avg       0.68      0.65      0.60       179



**ACCURACY SCORES OF All CLASSIFIERS**

In [27]:
print("Accuracy of RANDOM FOREST CLASSIFIER =",rfc_accuracy)
print("Accuracy of LOGISTIC REGRESSION =",lr_accuracy)
print("Accuracy of DECISION TREE CLASSIFIER = ",dtc_accuracy)
print("Accuracy of SUPPORT VECTOR MACHINE = ",svm_accuracy)

Accuracy of RANDOM FOREST CLASSIFIER = 82.12290502793296
Accuracy of LOGISTIC REGRESSION = 80.44692737430168
Accuracy of DECISION TREE CLASSIFIER =  78.2122905027933
Accuracy of SUPPORT VECTOR MACHINE =  65.36312849162012


The most suitable
algorithm for the given task is given by RANDOM FOREST CLASSIFIER with the accuracy 82.12

Subsequently, we can now rank our evaluation of all the models to choose the best one for our problem. While Random Forest and Logistic Regression score almost the same.