In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [16]:
titanic_data = pd.read_csv('Titanic_original.csv')

In [17]:
y = titanic_data['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [26]:
X = titanic_data.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [27]:
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.9250,0,0,1
3,1,35.0,1,0,53.1000,0,0,1
4,3,35.0,0,0,8.0500,1,0,1
...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,1,0,1
887,1,19.0,0,0,30.0000,0,0,1
888,3,,1,2,23.4500,0,0,1
889,1,26.0,0,0,30.0000,1,0,0


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test

(     Pclass   Age  SibSp  Parch      Fare  Sex_male  Embarked_Q  Embarked_S
 331       1  45.5      0      0   28.5000         1           0           1
 733       2  23.0      0      0   13.0000         1           0           1
 382       3  32.0      0      0    7.9250         1           0           1
 704       3  26.0      1      0    7.8542         1           0           1
 813       3   6.0      4      2   31.2750         0           0           1
 ..      ...   ...    ...    ...       ...       ...         ...         ...
 106       3  21.0      0      0    7.6500         0           0           1
 270       1   NaN      0      0   31.0000         1           0           1
 860       3  41.0      2      0   14.1083         1           0           1
 435       1  14.0      1      2  120.0000         0           0           1
 102       1  21.0      0      1   77.2875         1           0           1
 
 [712 rows x 8 columns],
      Pclass   Age  SibSp  Parch     Fare  Sex_ma

In [29]:
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_train_imputed

array([[ 1. , 45.5,  0. , ...,  1. ,  0. ,  1. ],
       [ 2. , 23. ,  0. , ...,  1. ,  0. ,  1. ],
       [ 3. , 32. ,  0. , ...,  1. ,  0. ,  1. ],
       ...,
       [ 3. , 41. ,  2. , ...,  1. ,  0. ,  1. ],
       [ 1. , 14. ,  1. , ...,  0. ,  0. ,  1. ],
       [ 1. , 21. ,  0. , ...,  1. ,  0. ,  1. ]])

In [30]:
X_test_imputed = imputer.transform(X_test)
X_test_imputed

array([[ 3.        , 29.49884615,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 2.        , 31.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 3.        , 20.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 3.        , 38.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 2.        , 17.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 3.        ,  4.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

In [35]:
clf = DecisionTreeClassifier(random_state=42)

In [36]:
clf.fit(X_train_imputed, y_train)

In [37]:
y_pred = clf.predict(X_test_imputed)

In [25]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7932960893854749


In [38]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV

In [39]:
clf = tree.DecisionTreeClassifier()

In [55]:
parameters = {
    'max_features': ['log2', 'sqrt', 'auto'],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8]
}

In [41]:
grid_obj = GridSearchCV(clf, parameters, cv=5)

In [57]:
grid_obj = grid_obj.fit(X_train_imputed, y_train)
grid_obj

















In [43]:
clf = grid_obj.best_estimator_

In [46]:
feature_importance = clf.feature_importances_
print("Feature Importance:", feature_importance)

Feature Importance: [0.11203081 0.02481044 0.07165315 0.07404572 0.13141726 0.56312032
 0.0229223  0.        ]


In [47]:
best_features = len(feature_importance)
print("Number of Features:", best_features)

Number of Features: 8


In [48]:
from sklearn.metrics import confusion_matrix, f1_score

In [56]:
y_pred = clf.predict(X_test_imputed)
y_pred

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0], dtype=int64)

In [50]:
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[99  6]
 [28 46]]


In [52]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8100558659217877


In [53]:
f1 = f1_score(y_test, y_pred)
print("F1-Score:", f1)

F1-Score: 0.7301587301587302
