## Load the data

In [1]:
import pandas as pd
df = pd.read_csv('titanic.csv')
df.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


## Step:1 Pre-processing

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [13]:
col_to_use = list(df.columns)

In [15]:
col_to_use.remove('Cabin')

In [17]:
df=df[col_to_use]
df.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S


In [19]:
df=df.dropna()

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          712 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 66.8+ KB


## Step:2 Feature Engineering

In [21]:
df['sex_encoded'] = df['Sex'].astype('category').cat.codes
df['embarked_enc'] = df['Embarked'].astype('category').cat.codes

In [26]:
df.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,sex_encoded,embarked_enc
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,2


### 2.a Chi-Square test for feature selection

In [25]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [28]:
X_cat = df[['Pclass','SibSp','Parch','sex_encoded','embarked_enc']]
Y = df['Survived']

In [31]:
fs = SelectKBest(score_func = chi2, k=4)
X_selected = fs.fit_transform(X_cat,Y)

In [32]:
fs.get_support(True)

array([0, 2, 3, 4], dtype=int64)

In [33]:
selected_categorical_columns = ['Pclass','Parch','sex_encoded','embarked_enc']

In [34]:
selected_numerical_columns = ['Age','Fare']

### 2.b Feature scaling the numerical values

In [38]:
from sklearn.preprocessing import StandardScaler

In [39]:
scaler = StandardScaler()

In [41]:
X2=scaler.fit_transform(df[['Age','Fare']])

In [44]:
df['Scaled_Age'] = X2[:,0]
df['Scaled_Fare'] = X2[:,1]

In [45]:
df.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,sex_encoded,embarked_enc,Scaled_Age,Scaled_Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,1,2,-0.527669,-0.51638
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,0,0,0.577094,0.694046
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,2,-0.251478,-0.50362
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,2,0.369951,0.350326


In [46]:
final_column_to_use = selected_categorical_columns + ['Scaled_Age','Scaled_Fare']

In [47]:
final_column_to_use

['Pclass', 'Parch', 'sex_encoded', 'embarked_enc', 'Scaled_Age', 'Scaled_Fare']

In [48]:
X = df[final_column_to_use]

In [49]:
Y= df['Survived']

## Step 3: Modeling

In [51]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =  train_test_split(X,Y, test_size=0.8, shuffle=True)

### 3.a Hyperparameter tunning

In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [53]:
clf = DecisionTreeClassifier()

In [56]:
parameter = {
    
    "criterion":['gini','entropy'],
    "max_depth":[5,10,15,20],
    "min_samples_leaf": [2,4,8,10],
}

In [58]:
grid_search = GridSearchCV(clf,parameter)

In [59]:
grid_search.fit(X_train,y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20],
                         'min_samples_leaf': [2, 4, 8, 10]})

In [62]:
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)

In [63]:
final_model = DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)

In [64]:
final_model.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=5, min_samples_leaf=2)

In [65]:
y_pred = final_model.predict(X_test)

## Step 4: Model performance Evaluation

In [67]:
from sklearn.metrics import classification_report,accuracy_score

In [68]:
print(f"The Accurcay of the mode is {accuracy_score(y_test,y_pred)*100}")

The Accurcay of the mode is 74.3859649122807


In [70]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.76      0.78       340
           1       0.67      0.71      0.69       230

    accuracy                           0.74       570
   macro avg       0.73      0.74      0.74       570
weighted avg       0.75      0.74      0.74       570

