In [73]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings(category=FutureWarning,action='ignore')

In [74]:
df=pd.read_csv("ffiles/train.csv")

In [75]:
df.shape

(891, 12)

In [76]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [77]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [78]:
df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [79]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [80]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [81]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [82]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


# Imputation 

In [83]:
si1=SimpleImputer()
si2=SimpleImputer(strategy='most_frequent')

xtrain_impute_age=si1.fit_transform(X_train[["Age"]])
xtrain_impute_embarked=si2.fit_transform(X_train[["Embarked"]])

xtest_impute_age=si1.transform(X_test[["Age"]])
xtest_impute_embarked=si2.transform(X_test[["Embarked"]])


# clf=ColumnTransformer(transformers=[(trf_age),["Age"]])

In [84]:
X_train.isna().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [85]:
np.unique(xtrain_impute_embarked,return_counts=True)

(array(['C', 'Q', 'S'], dtype=object), array([125,  60, 527]))

# one hot encode

In [103]:
ohe_sex=OneHotEncoder(sparse_output=False,handle_unknown='ignore')#dropfirst for linear models - to avoid multicollinearity
ohe_embarked=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
xtrain_sex_encode=ohe_sex.fit_transform(X_train[["Sex"]])
xtest_sex_encode=ohe_sex.transform(X_test[["Sex"]])
xtrain_embarked_encode=ohe_embarked.fit_transform(xtrain_impute_embarked)
xtest_embarked_encode=ohe_embarked.transform(xtest_impute_embarked)

In [104]:
xtrain_embarked_encode

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [105]:
xtrain_sex_encode

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [106]:
X_train.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S


In [107]:
xtrain_rem=X_train.drop(columns=["Sex","Embarked","Age"])
xtest_rem=X_test.drop(columns=["Sex","Embarked","Age"])

In [108]:
xtrain_rem.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare
331,1,0,0,28.5
733,2,0,0,13.0
382,3,0,0,7.925
704,3,1,0,7.8542
813,3,4,2,31.275


In [109]:
xtrain_final = np.concatenate((xtrain_rem, xtrain_embarked_encode, xtrain_impute_age, xtrain_sex_encode), axis=1)
xtest_final = np.concatenate((xtest_rem, xtest_embarked_encode, xtest_impute_age, xtest_sex_encode), axis=1)

In [110]:
xtest_sex_encode.shape

(179, 2)

In [114]:
clf=DecisionTreeClassifier()
clf.fit(xtrain_final,y_train)

In [113]:
from sklearn import set_config
set_config(display='diagram')

In [115]:
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [116]:
print("accuracy_score:",accuracy_score(y_test,clf.predict(xtest_final)))
print("confusion_matrix:\n",confusion_matrix(y_test,clf.predict(xtest_final)))
print("classification_report:\n",classification_report(y_test,clf.predict(xtest_final)))

accuracy_score: 0.776536312849162
confusion_matrix:
 [[85 20]
 [20 54]]
classification_report:
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       105
           1       0.73      0.73      0.73        74

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



# Pickling the model 

In [117]:
import pickle

In [119]:
pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb')) # no impute here because it is assumed that  we give the correct data not an nan data for age  ie assumed age is mandatory 
pickle.dump(ohe_embarked,open('models/ohe_embarked.pkl','wb'))
pickle.dump(clf,open('models/clf.pkl','wb'))