In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [18]:
df = pd.read_csv('..\\..\\Datasets\\titanic_train.csv')
df.sample()
# df.columns

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
234,235,0,2,"Leyson, Mr. Robert William Norman",male,24.0,0,0,C.A. 29566,10.5,,S


In [19]:
# df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],axis=1, inplace=True)
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [20]:
# df.info()
df.isnull().sum()
df.sample()
df['Pclass'].unique()

array([3, 1, 2], dtype=int64)

# Planning for Transformation: <br>
Using **SimpleImputer** to fill missing values in Age, Embarked <br>
Using **OneHotEncoder** for Sex,Embarked <br>
Not using LabelEncoder for Survived label because it is already numeric<br>
Not using OrdinalEncoder for Pclass because it is already numeric<br>

## Train Test Split

In [21]:
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,1:8],df.iloc[:,0] ,test_size=0.2,random_state=42)
X_train.sample()
# y_train.sample()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
456,1,male,65.0,0,0,26.55,S


## SimpleImputer

In [22]:
si_age= SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')

X_train_age = si_age.fit_transform(X_train[['Age']])
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])

X_test_age = si_age.transform(X_test[['Age']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

## OneHotEncoder

In [23]:
ohe_sex = OneHotEncoder(sparse_output=False, handle_unknown='ignore' )  # In transform, if a new category is encountered, the resulting hot encoded column for this feature is all zeros 
ohe_embarked = OneHotEncoder(sparse_output=False,  handle_unknown='ignore' )

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked = ohe_embarked.fit_transform(X_train[['Embarked']])

X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_embarked = ohe_embarked.transform(X_test[['Embarked']])

X_train_sex
# X_train_embarked
# X_test_embarked
# X_test_sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

# Recombining

In [24]:
X_train_rem = X_train.drop(columns=['Sex','Embarked','Age'])
X_train_transformed = np.concatenate([X_train_rem,X_train_age,X_train_embarked,X_train_sex],axis=1)
# print(X_train_rem.shape)
# print(X_train_age.shape)
# print(X_train_sex.shape)
print(X_train_embarked.shape)
# print(X_train_transformed.shape)

X_test_rem = X_test.drop(columns=['Sex','Embarked','Age'])
X_test_transformed = np.concatenate([X_test_rem,X_test_age,X_test_embarked,X_test_sex],axis=1)
# print(X_test_rem.shape)
# print(X_test_age.shape)
# print(X_test_sex.shape)
print(X_test_embarked.shape)
# print(X_test_transformed.shape)

(712, 4)
(179, 4)


# Decision Tree

In [25]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

y_pred = clf.predict(X_test_transformed)
y_pred

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1], dtype=int64)

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred,y_test)

0.7821229050279329

In [27]:
import pickle

pickle.dump(ohe_sex, open('models/ohe_sex.pkl','wb'))
pickle.dump(ohe_embarked, open('models/ohe_embarked.pkl','wb'))
pickle.dump(clf, open('models/clf.pkl','wb'))
