# Import all Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Import Dataset

In [None]:
# Load the Titanic training CSV from the relative input directory into a DataFrame.
df=pd.read_csv('../input/titanic/train.csv')

# Data Overview

In [None]:
# Preview the first rows to see the available columns.
df.head()

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# (number of rows, number of columns)
df.shape

# EDA

In [None]:
# Count how many rows fall into each value of the target column.
df['Survived'].value_counts()

In [None]:
# Bar chart of the number of rows per 'Survived' value.
sns.countplot(data=df, x='Survived')

In [None]:
# Correlation of every numeric column with the target, sorted ascending.
# Only numeric columns are correlated: the frame still holds text columns
# (e.g. Name, Sex, Ticket) at this point, and pandas >= 2.0 raises on them
# instead of silently skipping them.
df.select_dtypes(include='number').corr()['Survived'].sort_values()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# Text columns are excluded first because pandas >= 2.0 raises when asked to
# correlate non-numeric data.
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

In [None]:
# Grid of pairwise plots for the columns of df.
sns.pairplot(data=df)

# Data Preparation

In [None]:
# Percentage of missing values in each column (missing count / row count * 100).
((df.isnull().sum())/len(df))*100

In [None]:
# Remove the 'Cabin' column from df in place.
df.drop(columns=['Cabin'], inplace=True)

In [None]:
# Impute missing ages with the column mean. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the selection: that chained
# in-place form is deprecated and leaves df unchanged under pandas Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# Re-check the percentage of missing values per column after the cleanup above.
((df.isnull().sum())/len(df))*100

In [None]:
# Re-inspect column dtypes and non-null counts after the cleanup.
df.info()

In [None]:
def missing_percent(df):
    """Return the percentage of missing values for each column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.Series
        Percentages (0-100) indexed by column name and sorted ascending.
        Columns without missing values are left out.
    """
    row_count = len(df)
    missing_share = df.isna().sum() / row_count * 100
    has_missing = missing_share > 0
    return missing_share.loc[has_missing].sort_values()

In [None]:
# Percentages for the columns that still contain missing values.
nan_percent=missing_percent(df)

In [None]:
# Display the result.
nan_percent

In [None]:
# Remove the columns that are not used as model features, in place.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Embarked']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Encode Sex as integers: male -> 1, female -> 0.
# The column is rebuilt with map() rather than patched through .loc, because
# writing integers into a text column leaves it as object dtype on older pandas
# and is rejected outright by string-dtype columns. Values other than
# 'male'/'female' are passed through unchanged, which keeps the original
# behaviour and makes the cell safe to re-run.
df['Sex'] = df['Sex'].map(lambda value: {'male': 1, 'female': 0}.get(value, value))

In [None]:
# Re-check which columns, if any, still have missing values after the cleanup.
# This redefines the helper from earlier in the notebook with the same behaviour.
def missing_percent(df):
    """Return per-column missing-value percentages, keeping only non-zero ones.

    The result is a Series indexed by column name, sorted in ascending order.
    """
    pct = df.isnull().sum().div(len(df)).mul(100)
    return pct[pct.gt(0)].sort_values()

In [None]:
# Recompute the missing-value percentages on the cleaned frame.
nan_percent=missing_percent(df)

In [None]:
# Display the result; an empty Series means no column has missing values.
nan_percent

# Features & Label

In [None]:
# Separate the target column (y) from the predictor columns (X).
y = df['Survived']
X = df.drop(columns=['Survived'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

# Scaling the Features

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardising scaler with default settings.
scaler=StandardScaler()

In [None]:
# Learn the scaling statistics from the training split only.
scaler.fit(X_train)

In [None]:
# Apply the scaling learned from X_train to both splits.
# NOTE(review): the model cells visible below fit and predict on the unscaled
# X_train / X_test, so these scaled arrays appear unused - confirm whether
# that is intended.
scaled_X_train= scaler.transform(X_train)
scaled_X_test= scaler.transform(X_test)

# Train the Model

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree classifier with default hyperparameters.
model = DecisionTreeClassifier()

In [None]:
# Train on the (unscaled) training split.
model.fit(X_train,y_train)

# Predicting Test Data

In [None]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Evaluating Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Confusion matrix on the test split: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 score on the test split.
print(classification_report(y_test, y_pred))

In [None]:
# Importance score of each feature, as a bare array.
model.feature_importances_

In [None]:
# The same importances, labelled with the column names of X for readability.
pd.DataFrame(index=X.columns,data=model.feature_importances_,columns=['Feature Importance'])

#  Visualize the Tree

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the fitted tree without labels or colours.
# The trailing semicolon suppresses the notebook's text echo of the return value.
plt.figure(figsize=(12,8))
plot_tree(model);

In [None]:
# Draw the fitted tree with coloured nodes and feature names on the splits.
# feature_names is passed as a plain list: scikit-learn releases that validate
# this parameter as a list reject a pandas Index.
# The trailing semicolon suppresses the notebook's text echo of the return value.
plt.figure(figsize=(12,8),dpi=150)
plot_tree(model,filled=True,feature_names=list(X.columns));

# Reporting Model Results

In [None]:
def report_model(model):
    """Print a classification report for ``model`` and plot its decision tree.

    Reads the notebook-level names ``X_test`` and ``y_test`` (evaluation data)
    and ``X`` (for the feature names shown in the plot), so those cells must
    have been run first.

    Parameters
    ----------
    model : fitted tree estimator
        Must provide ``predict`` and be accepted by ``plot_tree``.

    Returns
    -------
    None
        The report is printed and the tree is drawn on a new figure.
    """
    model_preds = model.predict(X_test)
    print(classification_report(y_test, model_preds))
    print('\n')
    plt.figure(figsize=(12, 8), dpi=150)
    # Pass the names as a plain list: scikit-learn releases that validate
    # feature_names as a list reject a pandas Index.
    plot_tree(model, filled=True, feature_names=list(X.columns))

# Understanding Hyperparameters


In [None]:
# Print the classifier's documentation, including its hyperparameters.
help(DecisionTreeClassifier)

In [None]:
# Limit the tree to a maximum depth of two, then train it.
pruned_tree = DecisionTreeClassifier(max_depth=2)
pruned_tree.fit(X_train,y_train)

In [None]:
# Report test metrics and plot the depth-limited tree.
report_model(pruned_tree)

# Max Leaf Nodes

In [None]:
# Limit the tree to at most three leaf nodes, then train it.
# This rebinds pruned_tree, replacing the depth-limited model.
pruned_tree = DecisionTreeClassifier(max_leaf_nodes=3)
pruned_tree.fit(X_train,y_train)

In [None]:
# Report test metrics and plot the leaf-limited tree.
report_model(pruned_tree)

# Criterion

In [None]:
# Use entropy as the split criterion instead of the default, then train.
entropy_tree = DecisionTreeClassifier(criterion='entropy')
entropy_tree.fit(X_train,y_train)

In [None]:
# Report test metrics and plot the entropy-based tree.
report_model(entropy_tree)