In [16]:
import pandas as pd

In [75]:
url = "https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv"
df = pd.read_csv(url)

In [76]:
df = df.drop(labels=["PassengerId", "Name", "Ticket"], axis=1)

In [77]:
## Independent and dependent features
X = df.drop(labels=['Survived'],axis=1)
Y = df[['Survived']]

In [78]:
X["Has_Cabin"] = (X["Cabin"].notnull().astype('int'))
X = X.drop(['Cabin'], axis=1)

In [79]:
X["no_of_persons"] = X["SibSp"] + X["Parch"]

In [80]:
X = X.drop(labels=["SibSp", "Parch"], axis=1)

In [81]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [82]:
print("Numerical columns:",numerical_cols)
print('Categorical Columns:',categorical_cols)

Numerical columns: Index(['Pclass', 'Age', 'Fare', 'Has_Cabin', 'no_of_persons'], dtype='object')
Categorical Columns: Index(['Sex', 'Embarked'], dtype='object')


In [83]:
# numerical_columns_list = list(numerical_cols)
# numerical_columns_list.remove('Has_Cabin')
# numerical_columns_list.remove('Pclass')
# # Convert the list back to Index
# numerical_cols = pd.Index(numerical_columns_list)

# # Now numerical_columns will have the modified values
# print(numerical_cols)

In [84]:
# categorical_columns_list = list(categorical_cols)
# categorical_columns_list.extend(['Has_Cabin', 'Pclass'])
# # Convert the list back to Index
# categorical_cols = pd.Index(categorical_columns_list)

# # Now numerical_columns will have the modified values
# print(categorical_cols)

In [85]:
# Define the custom ranking for each ordinal variable
sex_categories = ['male', 'female']
embarked_categories = ['S', 'C', 'Q']
# PClass_categories = [1,2,3]
# HasCabin_categories = [0,1]

In [86]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [87]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[sex_categories,embarked_categories])),
    ('scaler',StandardScaler())
    ]
)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [88]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [89]:
print(X_train.shape, y_train.shape)

(623, 7) (623, 1)


In [90]:
print(X_train.head())

     Pclass     Sex   Age     Fare Embarked  Has_Cabin  no_of_persons
862       1  female  48.0  25.9292        S          1              0
704       3    male  26.0   7.8542        S          0              1
525       3    male  40.5   7.7500        Q          0              0
234       2    male  24.0  10.5000        S          0              0
518       2  female  36.0  26.0000        S          0              1


In [91]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [92]:
X_train.head()

Unnamed: 0,num_pipeline__Pclass,num_pipeline__Age,num_pipeline__Fare,num_pipeline__Has_Cabin,num_pipeline__no_of_persons,cat_pipeline__Sex,cat_pipeline__Embarked
0,-1.610884,1.476292,-0.112476,1.9569,-0.551456,1.358215,-0.590696
1,0.798674,-0.221695,-0.477431,-0.511012,0.069805,-0.73626,-0.590696
2,0.798674,0.897433,-0.479535,-0.511012,-0.551456,-0.73626,2.58175
3,-0.406105,-0.376057,-0.424009,-0.511012,-0.551456,-0.73626,-0.590696
4,-0.406105,0.550118,-0.111046,-0.511012,0.069805,1.358215,-0.590696


In [94]:
## Model Training

from sklearn.tree import DecisionTreeClassifier

In [95]:
clf = DecisionTreeClassifier(max_depth =3, random_state = 42)

clf.fit(X_train, y_train)

In [102]:
test_pred_decision_tree = clf.predict(X_test)

In [111]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
confusion_matrix = confusion_matrix(y_test,  
                                            test_pred_decision_tree)
print(confusion_matrix)

[[143  17]
 [ 36  72]]


In [112]:
print("Model Score on Train data is:", clf.score(X_train, y_train))
print("Model Score on Test data is:", clf.score(X_test, y_test))


Model Score on Train data is: 0.8378812199036918
Model Score on Test data is: 0.8022388059701493
