In [None]:
import pandas as pd

# Load the training set from the project's data directory.
data = pd.read_csv(filepath_or_buffer='../data/U4_04_train.csv')

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
# Descriptive statistics of the dataset.
data.describe()

In [None]:
import seaborn as sb
# Number of passengers for each value of the Survived target.
sb.countplot(data=data, x="Survived")

In [None]:
# Same count of Survived values, split by Sex.
sb.countplot(data=data, x="Survived", hue="Sex")

In [None]:
# Count the missing (NaN) entries in every column.
data.isnull().sum()

In [None]:
# Distribution of passenger ages.
sb.displot(data=data, x="Age")

In [None]:
# Inspect the Age column as loaded.
data.loc[:, "Age"]

In [None]:
# Mean of the Age column.
data.loc[:, "Age"].mean()

In [None]:
# Replace missing ages with the mean age of the column.
mean_age = data["Age"].mean()
data["Age"] = data["Age"].fillna(mean_age)

In [None]:
# Inspect the Age column again after the missing values were filled.
data["Age"]

In [None]:
# Re-count missing values per column.
data.isna().sum()

In [None]:
# Drop the Cabin column.
data = data.drop(columns=["Cabin"])

In [None]:
# Frequency of each Embarked value.
data["Embarked"].value_counts()

In [None]:
# Remove only the rows whose Embarked value is missing.
# A bare dropna() would discard a row for a NaN in *any* column, which can
# silently delete far more data than this step intends.
data = data.dropna(subset=["Embarked"])

In [None]:
# Re-count missing values per column.
data.isna().sum()

In [None]:
# Drop the columns that are not needed for modelling.
data = data.drop(columns=["Name", "PassengerId", "Ticket"])

In [None]:
# Preview the first five rows after the column removal.
data.head(n=5)

In [None]:
# One-hot encode Sex.
# drop_first=True drops the first category's indicator column; the remaining
# 'male' column is renamed to 'Male'. Dropping one column avoids redundancy
# (multicollinearity): otherwise the model would see two perfectly correlated
# indicators and could spend its effort on that artificial relationship.
dummies_sex = pd.get_dummies(data["Sex"], drop_first=True).rename(
    columns={'male': 'Male'}
)
dummies_sex

In [None]:
# Attach the Male indicator column, then remove the original Sex column.
data = data.join(dummies_sex).drop(columns=["Sex"])

In [None]:
# Preview the frame with Sex replaced by the Male indicator.
data.head()

In [None]:
# Count of Survived values, split by Embarked.
sb.countplot(data=data, x="Survived", hue="Embarked")

In [None]:
# One-hot encode Embarked; drop_first=True omits the first category's column.
dummies_embarked = pd.get_dummies(data["Embarked"], drop_first=True)

In [None]:
# Append the Embarked indicator columns, then remove the original column.
data = data.join(dummies_embarked).drop(columns=["Embarked"])

# How to read the encoding:
#   Q is True (1)              -> Embarked was Q
#   S is True (1)              -> Embarked was S
#   Q and S are both False (0) -> Embarked was C

In [None]:
# Preview the frame with Embarked replaced by its indicator columns.
data.head()

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap.
correlations = data.corr()
sb.heatmap(correlations, cmap="YlGnBu", annot=True)

In [None]:
# Count of Survived values, split by Pclass.
sb.countplot(data=data, x="Survived", hue="Pclass")

In [None]:
# Target vector first, then the feature matrix (every column except the target).
y = data["Survived"]
X = data.drop(columns=["Survived"])

In [None]:
# A notebook cell only renders its final expression, so a bare `X` on its own
# line is evaluated and silently discarded. Show X explicitly with the
# notebook's built-in display(), then let `y` render as the cell's output.
display(X)
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Fixing random_state makes the split
# (and every metric derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hyperparameters of DecisionTreeClassifier discussed here:
#   criterion - "gini" by default; can also be "entropy" or "log_loss"
#   max_depth - how deep the tree may grow; deeper trees fit the training data
#               more closely, but a very deep tree can overfit
# Sweep max_depth from 1 to 9 and record the test accuracy for each depth.
accuracies = []
for i in range(1, 10):
    # random_state makes the fitted tree, and so the reported accuracy,
    # repeatable from run to run.
    model = DecisionTreeClassifier(max_depth=i, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    # The original message had a stray ")" after the accuracy value.
    print(f"Result for {i}: {accuracy}")
    accuracies.append(accuracy)

In [None]:
# Plot each accuracy against the max_depth that produced it. Passing the list
# alone labels the x axis with list positions (starting at 0), which is off by
# one from the depths (starting at 1) used to build `accuracies`.
depths = list(range(1, len(accuracies) + 1))
ax = sb.lineplot(x=depths, y=accuracies)
ax.set(xlabel="max_depth", ylabel="Accuracy");

In [None]:
import random

# Pick one depth at random from the same 1-9 range used by the depth sweep.
# randrange excludes its stop value, so the stop must be 10 for depth 9 to be
# selectable. A dedicated, seeded generator keeps the pick repeatable between
# runs without touching the global random state.
random_num = random.Random(42).randrange(1, 10)
print(f"Random depth: {random_num}")
# random_state makes the fitted tree repeatable as well.
model = DecisionTreeClassifier(max_depth=random_num, random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
# Show the score; previously it was computed but never displayed.
accuracy

In [None]:
from sklearn.metrics import classification_report

# Text summary of the classification metrics for the test predictions.
report = classification_report(y_test, predictions)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix as a labelled table: rows are the real classes,
# columns are the predicted classes.
conf_matrix = confusion_matrix(y_test, predictions)
row_labels = ["Real: No", "Real: Yes"]
col_labels = ["Pred: No", "Pred: Yes"]
pd.DataFrame(conf_matrix, index=row_labels, columns=col_labels)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Draw the fitted decision tree on a 100x80 inch canvas.
plt.figure(figsize=(100,80))
plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases validate this argument
    # strictly and reject a pandas Index.
    feature_names=list(X_train.columns),
    class_names=["Died", "Lived"],
    filled=True,
    label="none"
)
# Render the figure rather than echoing plot_tree's returned list of
# annotation objects as the cell output.
plt.show()