In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [43]:
# Load the Titanic CSV into `df`.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location) -
# adjust it when running in another environment.
df = pd.read_csv("/content/Titanic-Dataset.csv")

In [44]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [45]:
import warnings
warnings.filterwarnings('ignore')

In [46]:
# Remove the columns that are not used as model features.
# errors='ignore' keeps this cell idempotent: the result is assigned back to `df`,
# so without it a second run raises KeyError for the already-dropped columns.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [47]:
# Number of missing values in each column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [48]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [49]:
# Frequency of each Embarked value. value_counts() skips NaN by default, so the
# rows with a missing Embarked are not included in these counts.
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [50]:
# Frequency of each Sex value.
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [51]:
# Feature matrix: every column except the target. Selecting by name rather than
# by position (iloc[:, 1:]) stays correct even if the column order changes.
X = df.drop(columns='Survived')
X.shape

(891, 7)

In [52]:
# Target: the 'Survived' column, selected by name rather than by position
# (iloc[:, :1]). The double brackets keep it a one-column DataFrame, i.e. the
# same (n_rows, 1) shape the positional slice produced.
y = df[['Survived']]
y.shape

(891, 1)

In [53]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [54]:
# Preprocessing applied per column group:
#   tf1 - fill missing Age values (SimpleImputer's default strategy is the mean)
#   tf2 - one-hot encode Sex and Embarked, dropping the first category of each
# Columns not listed are passed through unchanged.
#
# OneHotEncoder's dense-output flag was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4, so try the new keyword
# first and fall back to the old one on releases that do not know it.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first', dtype=np.int32)
except TypeError:  # scikit-learn < 1.2
    encoder = OneHotEncoder(sparse=False, drop='first', dtype=np.int32)

# NOTE(review): Embarked has missing values (see the isnull() counts above) and
# is encoded without imputation; on scikit-learn versions whose OneHotEncoder
# accepts NaN it presumably becomes its own category - confirm this is intended.
transformer = ColumnTransformer(
    transformers=[
        ('tf1', SimpleImputer(), ['Age']),
        ('tf2', encoder, ['Sex', 'Embarked']),
    ],
    remainder='passthrough',
)

In [55]:
# Fit the transformer on the training split only, then apply the already-fitted
# transformer to the test split (no re-fitting on test data).
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [56]:
# (rows, columns) of the transformed train and test sets.
X_train_transformed.shape , X_test_transformed.shape

((623, 9), (268, 9))

In [57]:
# Wrap both transformed outputs in DataFrames (default integer column labels).
X_train_transformed, X_test_transformed = (
    pd.DataFrame(arr) for arr in (X_train_transformed, X_test_transformed)
)

In [58]:
# Preview the first rows of the transformed training data.
X_train_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,51.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,26.55
1,49.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,76.7292
2,1.0,1.0,0.0,1.0,0.0,3.0,5.0,2.0,46.9
3,54.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,77.2875
4,29.915339,0.0,0.0,0.0,0.0,3.0,1.0,0.0,14.4583


In [59]:
# Standardizer with default settings (centers each column and scales it to unit variance).
scaler = StandardScaler()

In [60]:
# Fit the scaler on the training features only, then reuse the fitted scaler to
# transform the test features.
X_train_scaled = scaler.fit_transform(X_train_transformed)
X_test_scaled = scaler.transform(X_test_transformed)

In [61]:
# Wrap both scaled outputs in DataFrames (default integer column labels).
X_train_scaled, X_test_scaled = (
    pd.DataFrame(arr) for arr in (X_train_scaled, X_test_scaled)
)

In [62]:
# Summary statistics of the untransformed training features, to one decimal place.
X_train.describe().round(1)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,623.0,502.0,623.0,623.0,623.0
mean,2.3,29.9,0.5,0.4,32.5
std,0.8,14.5,1.2,0.8,48.3
min,1.0,0.7,0.0,0.0,0.0
25%,1.5,21.0,0.0,0.0,7.9
50%,3.0,29.0,0.0,0.0,15.0
75%,3.0,38.0,1.0,0.0,31.4
max,3.0,80.0,8.0,6.0,512.3


In [63]:
# Summary statistics of the scaled training features, to one decimal place.
X_train_scaled.describe().round(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0,623.0
mean,-0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-2.2,-1.4,-0.3,-1.7,-0.1,-1.5,-0.5,-0.5,-0.7
25%,-0.5,-1.4,-0.3,-1.7,-0.1,-0.9,-0.5,-0.5,-0.5
50%,-0.0,0.7,-0.3,0.6,-0.1,0.8,-0.5,-0.5,-0.4
75%,0.5,0.7,-0.3,0.6,-0.1,0.8,0.4,-0.5,-0.0
max,3.8,0.7,3.2,0.6,17.6,0.8,6.4,6.7,10.0


In [64]:
# Logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [65]:
# y_train is a one-column DataFrame; flatten it to the 1-D label array that
# scikit-learn classifiers expect. Passing the column vector directly makes
# fit() emit a DataConversionWarning (hidden here only by the global filter).
model.fit(X_train_scaled, np.ravel(y_train))

In [66]:
# Predict a class label for every row of the scaled test set and display the result.
y_pred = model.predict(X_test_scaled)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1])

In [67]:
from sklearn.metrics import accuracy_score

In [68]:
# Share of test rows classified correctly, expressed as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy Score:', accuracy_pct)

Accuracy Score: 79.47761194029852
