In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <font color='#6f7578'>Palmer Archipelago (Antarctica) penguin data EDA and decision tree model</font>

## <font color='#34b4eb'>Exploration of the data set and creation of a decision tree model to predict species from penguin features</font>

In [None]:
# Load the penguin measurements from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv"
)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Distinct labels in the target column.
df.loc[:, "species"].unique()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [None]:
# Re-run the summary after dropna() to confirm no missing values remain.
df.info()

In [None]:
# Distinct island names present in the data.
pd.unique(df["island"])

In [None]:
# Distinct values recorded in the sex column (used to spot invalid entries).
df.loc[:, "sex"].unique()

In [None]:
# Show the rows whose sex is recorded as the placeholder ".".
df.loc[df["sex"] == "."]

In [None]:
# Summary statistics of Gentoo penguins grouped by sex, transposed so each
# sex value becomes a column for side-by-side comparison.
(
    df.loc[df["species"] == "Gentoo"]
    .groupby("sex")
    .describe()
    .T
)

## <font color='#34b4eb'>Given the mean feature values, the penguin recorded with sex "." is most likely female</font>

In [None]:
# Relabel the placeholder "." as FEMALE. Selecting by value instead of the
# hard-coded row label 336 avoids silently appending a new, mostly-NaN row
# if that label is ever absent, and covers any other "." rows as well.
df.loc[df["sex"] == ".", "sex"] = "FEMALE"

In [None]:
# Display the row labelled 336 to check its sex value after the correction.
df.loc[336, :]

In [None]:
# Pairwise scatter plots of the numeric features, coloured by species.
sns.pairplot(data=df, hue="species", palette="Paired")

In [None]:
# Box plots of culmen length per species, one panel per sex.
sns.catplot(
    data=df,
    x="species",
    y="culmen_length_mm",
    col="sex",
    kind="box",
    palette="Paired",
)

In [None]:
# Preview the first five rows again before building the feature matrix.
df.head(n=5)

## <font color='#34b4eb'>Create dummy variables and drop the label</font>

In [None]:
# Feature matrix: every column except the label, with categorical columns
# one-hot encoded (first level dropped to avoid redundant columns).
X = pd.get_dummies(df.drop(columns="species"), drop_first=True)
X

In [None]:
# Target vector: the species label.
y = df.loc[:, "species"]
y

## <font color='#34b4eb'>NB - no scaling of data required for decision tree algorithm</font>

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

## <font color='#34b4eb'>Will just create the model with default hyperparameters</font>

In [None]:
# Default hyperparameters, but with a fixed seed: the tree permutes candidate
# features at each split, so without random_state the fitted tree (and the
# metrics reported below) can differ from run to run.
model = DecisionTreeClassifier(random_state=101)

In [None]:
# Train the tree on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict species for the held-out test rows.
base_preds = model.predict(X=X_test)

In [None]:
# Display the predicted labels for the test set.
base_preds

In [None]:
from sklearn.metrics import classification_report, plot_confusion_matrix

In [None]:
print(classification_report(y_test,base_preds))

In [None]:
plot_confusion_matrix(model,X_test,y_test,cmap="Accent")

## <font color='#34b4eb'>Only 3 penguins were misclassified</font>

In [None]:
# Raw feature-importance array of the fitted tree (one value per column of X).
model.feature_importances_

In [None]:
# Column names of X, in the same order as the importance values.
X.columns

In [None]:
# Pair each feature name with its importance value.
pd.DataFrame(model.feature_importances_, index=X.columns)

In [None]:
# Same table with a named column, sorted ascending by importance.
(
    pd.DataFrame(
        model.feature_importances_,
        index=X.columns,
        columns=["feature importance"],
    )
    .sort_values(by="feature importance")
)

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Draw the fitted tree on a large, high-resolution canvas so node text stays legible.
plt.figure(figsize=(11,11),dpi=150)
# feature_names is passed as a plain list: scikit-learn 1.2 validates this
# parameter as `list` and rejects a pandas Index. The trailing ';' suppresses
# the list-of-annotations repr in the notebook output.
plot_tree(model,feature_names=list(X.columns), filled=True);