In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Titanic training data into a DataFrame

TRAIN_CSV = '../input/titanic/train.csv'
train_df = pd.read_csv(TRAIN_CSV)

In [None]:
# Preview the first rows to see the available columns and sample values
train_df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts)
train_df.info()

In [None]:
# (number of rows, number of columns) of the loaded data
train_df.shape

In [None]:
# Descriptive statistics for the columns describe() covers by default
train_df.describe()

In [None]:
# Count how many rows fall into each class of the target column
train_df['Survived'].value_counts()

# survived = 1
# didn't survive = 0

In [None]:
# Bar chart of the number of rows per value of the target column
sns.countplot(data=train_df, x='Survived')

# survived = 1
# didn't survive = 0

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The frame still contains text columns at this point (e.g. Sex holds
# 'male'/'female' until it is encoded later). pandas >= 2.0 raises on
# non-numeric columns in DataFrame.corr() instead of silently skipping them,
# so restrict the frame to numeric dtypes explicitly.
sns.heatmap(train_df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Data preparation
# Count the null values per column so they can be taken care of

train_df.isnull().sum()

In [None]:
# Fill missing ages with the mean age.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which under pandas Copy-on-Write
# modifies a temporary and leaves train_df unchanged (pandas 2.x warns about it).
# NOTE(review): the mean is computed on the whole frame, before the train/test
# split, so hold-out rows influence the imputed value.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].mean())

In [None]:
# Encode Sex as a number: male -> 0, female -> 1
sex_codes = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].replace(sex_codes)

In [None]:
# Remove the columns that are not used as model inputs
unused_columns = ['Name', 'PassengerId', 'Fare', 'Ticket', 'Embarked', 'Cabin']
train_df = train_df.drop(columns=unused_columns)

In [None]:
# Re-check the per-column null counts after the preparation steps above
train_df.isnull().sum()

In [None]:
# Columns that still contain nulls, largest count first (empty when none remain)
missing = train_df.isnull().sum().sort_values(ascending=False)
missing = missing[missing != 0]
missing

In [None]:
# Separate the label (Survived) from the feature columns

y = train_df['Survived']
X = train_df.drop(columns='Survived')

In [None]:
# Splitting the dataset into a training part and a hold-out test part


from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.3  # share of rows held out for testing
SPLIT_SEED = 101     # fixed seed so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Feature scaling
# The scaler is fitted on the training split only; the test split is
# transformed with the same fitted scaler in the next cell.


from sklearn.preprocessing import StandardScaler

scaler= StandardScaler()
scaler.fit(X_train)

In [None]:
# Apply the scaler fitted on the training split to both splits
scaled_X_train, scaled_X_test = (
    scaler.transform(X_train),
    scaler.transform(X_test),
)

In [None]:
# Training the baseline model: a decision tree with default hyperparameters


from sklearn.tree import DecisionTreeClassifier

# Fix random_state: scikit-learn randomly permutes the features at each split,
# so without a seed equally good splits can be picked differently and the
# fitted tree (and its metrics) can change between runs.
# 101 matches the seed used for the train/test split.
decisiontree_model = DecisionTreeClassifier(random_state=101)
decisiontree_model.fit(scaled_X_train,y_train)

In [None]:
# Predict labels for the scaled test split with the baseline tree

y_pred = decisiontree_model.predict(scaled_X_test)

In [None]:
# Testing the model: confusion matrix of true test labels vs. predictions
# (true labels are passed first, predictions second)

from sklearn.metrics import confusion_matrix,classification_report

confusion_matrix(y_test,y_pred)

In [None]:
# Text report of the classification metrics for the baseline predictions
print(classification_report(y_test,y_pred))

In [None]:
# Raw feature-importance array of the fitted baseline tree (one value per feature)
decisiontree_model.feature_importances_

In [None]:
# Same importances as a one-column table labelled with the feature names
pd.DataFrame(
    {'Feature Importance': decisiontree_model.feature_importances_},
    index=X.columns,
)

In [None]:
# Visualization of the fitted baseline decision tree

from sklearn.tree import plot_tree


plt.figure(figsize=(12,8),dpi=150)
# Pass feature_names as a plain list: scikit-learn versions that validate this
# parameter as a list reject a pandas Index such as X.columns.
plot_tree(decisiontree_model,filled=True,feature_names=list(X.columns));

In [None]:
# Report back the classification results

def REP(tree_model, X_eval=None, y_eval=None, feature_names=None):
    """Print a classification report for a fitted tree and plot the tree.

    Parameters
    ----------
    tree_model
        Fitted tree estimator; must provide ``predict`` and be accepted by
        ``plot_tree``.
    X_eval : optional
        Features to predict on. Defaults to the notebook-level
        ``scaled_X_test``.
    y_eval : optional
        True labels matching ``X_eval``. Defaults to the notebook-level
        ``y_test``.
    feature_names : optional
        Names used to label the plotted tree. Defaults to the columns of the
        notebook-level ``X``.

    Returns
    -------
    None
        The report is printed and the tree is drawn on a new figure.
    """
    # Fall back to the notebook globals so existing REP(model) calls keep working.
    if X_eval is None:
        X_eval = scaled_X_test
    if y_eval is None:
        y_eval = y_test
    if feature_names is None:
        feature_names = X.columns

    preds = tree_model.predict(X_eval)
    print(classification_report(y_eval, preds))
    print('\n')
    plt.figure(figsize=(12,8),dpi=150)
    # plot_tree wants a plain list; scikit-learn versions that validate this
    # parameter as a list reject a pandas Index.
    plot_tree(tree_model, filled=True, feature_names=list(feature_names))

In [None]:
# Hyperparameters: limit the depth of the tree (max_depth=2)

# random_state is fixed so the fitted tree is reproducible between runs;
# 101 matches the seed used for the train/test split.
pruning = DecisionTreeClassifier(max_depth=2, random_state=101)
pruning.fit(scaled_X_train,y_train)

In [None]:
# Classification report and tree plot for the model currently bound to `pruning`
REP(pruning)

In [None]:
# max leaf nodes: limit the tree to at most 3 leaves
# (this rebinds `pruning`, replacing the model fitted in the previous section)

# random_state is fixed so the fitted tree is reproducible between runs;
# 101 matches the seed used for the train/test split.
pruning = DecisionTreeClassifier(max_leaf_nodes=3, random_state=101)
pruning.fit(scaled_X_train,y_train)

In [None]:
# Classification report and tree plot for the model currently bound to `pruning`
REP(pruning)

In [None]:
# criterion: use entropy as the split criterion

# random_state is fixed so the fitted tree is reproducible between runs;
# 101 matches the seed used for the train/test split.
entropy = DecisionTreeClassifier(criterion='entropy', random_state=101)
entropy.fit(scaled_X_train,y_train)

In [None]:
# Classification report and tree plot for the entropy-criterion tree
REP(entropy)