# Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Loading data

In [2]:
train_data = pd.read_csv("Data/train.csv")
test_data = pd.read_csv("Data/test.csv")
gender_submission = pd.read_csv("Data/gender_submission.csv")

# Data preprocessing and cleaning

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data["Sex"], test_data["Sex"] = train_data["Sex"].map({"male": 0, "female": 1}), test_data["Sex"].map({"male": 0, "female": 1})

In [5]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [6]:
features = ["Pclass", "Sex", "SibSp", "Parch"]

X_train = train_data[features]
X_test = test_data[features]

y_train = train_data["Survived"]
y_test = np.array(gender_submission["Survived"])

# Fitting different models to our data to check the best accuracy

# 1- Logistic Regression

In [7]:
lr = LogisticRegression(random_state = 42)

lr.fit(X_train, y_train)
y_preds = lr.predict(X_test)

f"The accuracy of the model with logistic regression is {round((accuracy_score(y_test, y_preds) * 100), 2)}%"

'The accuracy of the model with logistic regression is 99.28%'

# 2 - Decision Tree Classifier

dtc = DecisionTreeClassifier()

dtc.fit(X_train, y_train)
y_preds = lr.predict(X_test)

f"The accuracy of the model with Decision Tree Classifier is {round((accuracy_score(y_test, y_preds) * 100), 2)}%"

# 3 - Random Forest Classifier

In [8]:
rfc = RandomForestClassifier(max_depth=2, random_state=0)

rfc.fit(X_train, y_train)
y_preds = lr.predict(X_test)

f"The accuracy of the model with Decision Tree Classifier is {round((accuracy_score(y_test, y_preds) * 100), 2)}%"

'The accuracy of the model with Decision Tree Classifier is 99.28%'

# 4 - SVM

In [9]:
svc = SVC()

svc.fit(X_train, y_train)
y_preds = lr.predict(X_test)

f"The accuracy of the model with Decision Tree Classifier is {round((accuracy_score(y_test, y_preds) * 100), 2)}%"

'The accuracy of the model with Decision Tree Classifier is 99.28%'

# In conclusion we can see that every models is giving the exact same accuracy and its 99.28 % which really incredible

In [10]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_preds})
output.to_csv("submission.csv", index = False)