# HyperParameter tuning 

## grid search - Titanic dataset 

### 1. Import the library

In [1]:
import numpy as np
import pandas as pd

### 2. Load the data

In [2]:
titanic_data= pd.read_csv(r'titanic-1.csv')

In [3]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 3. data preprocessing

In [4]:
# drop the unnecessary columns

In [5]:
titanic_data= titanic_data.drop(columns=['PassengerId','Name','Ticket','Cabin'],axis=1)

In [7]:
titanic_data.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S


# Handle missing values

In [8]:
titanic_data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
titanic_data['Age'].fillna(titanic_data['Age'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Age'].fillna(titanic_data['Age'].mean(),inplace=True)


In [10]:
titanic_data['Embarked']=titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0])

In [11]:
titanic_data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [12]:
from sklearn.preprocessing import LabelEncoder

In [15]:
label_encoder = LabelEncoder()

In [16]:
titanic_data['Sex']=label_encoder.fit_transform(titanic_data['Sex'])

In [17]:
titanic_data['Embarked']=label_encoder.fit_transform(titanic_data['Embarked'])

In [18]:
titanic_data.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2


In [19]:
# Identify X and y

In [21]:
X = titanic_data.drop(columns=['Survived'],axis=1)
y= titanic_data['Survived']

In [22]:
# split the data into training (80%) and testing (20%)

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2,random_state=42)

In [25]:
X_train.shape,y_train.shape

((712, 7), (712,))

In [26]:
X_test.shape,y_test.shape

((179, 7), (179,))

# Model Building 

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
model_dt = DecisionTreeClassifier(random_state=42)

# Hyper parameter Tuning using Grid searh

### *Grid Searcch
### *Random Searcch
### *Bayesian Searcch

#! . Define Hyper parameter

In [37]:
parameter_grid={
        'criterion':['gini','entropy'],
        'max_depth': [None, 10, 15,20,30,40 ],
        'min_samples_split':[2,5,7,10],
        'min_samples_leaf':[1,2,3,4]
}

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
grid_search = GridSearchCV(
    estimator=model_dt,
    param_grid=parameter_grid,
    cv=5,
    scoring='accuracy',
    n_jobs= -1,
    verbose=2
)

In [40]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


In [41]:
# get the best parameter 

In [43]:
best_params= grid_search.best_params_
print('best hyperparameter:',best_params)

best hyperparameter: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}


# use the best parameters to build the final decision tree

In [45]:
best_model= DecisionTreeClassifier(**best_params)


In [46]:
best_model.fit(X_train,y_train)

In [48]:
# predict the result
y_pred=best_model.predict(X_test)

In [49]:
# Modle evaluation

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
print('Model Accuracy is ',accuracy_score(y_test,y_pred))

Model Accuracy is  0.8268156424581006
