In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the Titanic passenger records from a CSV file in the working directory.
titanic_data = pd.read_csv(filepath_or_buffer='titanic.csv')

In [4]:
# Summarize each column's dtype and non-null count to spot missing values.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Preview the first two rows to see what the raw values look like.
titanic_data.head(n=2)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [7]:
# Remove the object-typed Name, Ticket and Cabin columns (Cabin is mostly
# missing: 204 of 891 non-null) so they are not passed to the model.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError on already-dropped columns.
# NOTE(review): PassengerId is kept and ends up in X; it looks like a row
# identifier rather than a real feature -- consider dropping it as well.
titanic_data = titanic_data.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [8]:
# One-hot encode the two remaining categorical columns. drop_first=True omits
# the first level of each, leaving k-1 indicator columns per variable.
titanic_data = pd.get_dummies(
    titanic_data,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

In [9]:
# Target vector: the Survived column.
y = titanic_data['Survived']
# Feature matrix: every remaining column except the target.
X = titanic_data.drop(columns=['Survived'])

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Baseline: a decision tree with library-default hyperparameters; only the
# random seed is fixed so the fitted tree is reproducible.
clf_baseline = DecisionTreeClassifier(random_state=42)

In [12]:
# Fit the baseline tree on the training split.
clf_baseline.fit(X=X_train, y=y_train)

In [13]:
# Predict survival labels for the held-out rows with the baseline tree.
y_pred_baseline = clf_baseline.predict(X=X_test)

In [14]:
# Fraction of held-out rows the baseline tree classified correctly.
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_baseline)
print("Baseline Model Accuracy: ", baseline_accuracy)

Baseline Model Accuracy:  0.7541899441340782


The baseline decision tree, trained with default hyperparameters, reaches about 75% accuracy on the held-out test set.

In [15]:
# Hyperparameter grid explored by the grid search below. GridSearchCV itself is
# imported in the first cell with the other dependencies, so everything the
# notebook needs is visible up front.
# 6 depths x 8 split sizes x 9 leaf sizes = 432 candidate settings.
parameters = {
    'max_depth': [2, 4, 6, 8, 10, 12],
    'min_samples_split': range(2, 10),  # 2..9 inclusive
    'min_samples_leaf': range(1, 10),   # 1..9 inclusive
}

In [16]:
# Exhaustive cross-validated search over `parameters`, using a decision tree
# with the same seed as the baseline and up to four parallel workers.
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=parameters,
    n_jobs=4,
)

In [17]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation.
clf.fit(X=X_train, y=y_train)

# Report the winning hyperparameter combination and keep the estimator that
# the search exposes for it.
print('Hyperparameters of the best model:', clf.best_params_)
best_tree = clf.best_estimator_

Hyperparameters of the best model: {'max_depth': 8, 'min_samples_leaf': 7, 'min_samples_split': 2}


In [18]:
# Predict survival labels for the held-out rows with the tuned tree.
y_pred_tuned = best_tree.predict(X=X_test)

In [19]:
# Fraction of held-out rows the tuned tree classified correctly.
tuned_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_tuned)
print('Tuned Model Accuracy:', tuned_accuracy)

Tuned Model Accuracy: 0.8379888268156425


After tuning, accuracy on the same held-out test set rises to about 84%. A small grid search over tree depth and the minimum split and leaf sizes gives a clear improvement over the untuned baseline.