In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Palmer Archipelago (Antarctica) penguin data.

## Data Source: Dr. Kristen Gorman and Palmer Station, Antarctica, LTER.
### Problem Statement: We compare physical measurements of penguins, such as body mass, flipper length, culmen (bill) length, and culmen (bill) depth, to predict the species of a penguin.

In [None]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [None]:
# First look at the file, read with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer="../input/penguins/penguins.csv")

In [None]:
# Display the loaded frame (the notebook rendering is abbreviated by pandas'
# display settings when the frame is long).
df

### The dataset has 344 rows and 9 columns

In [None]:
# Per-column summary: name, non-null count and dtype, plus memory usage.
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Check which dtype pandas assigned to each column.
df.dtypes

In [None]:
# Extra spellings that should be parsed as missing when the CSV is read.
missing_values = [
    "NA",
    "NAN",
    "n/a",
    "na",
    "Empty",
    "--",
]

In [None]:
# Re-read the file so the placeholder strings above are converted to NaN.
df = pd.read_csv(
    "../input/penguins/penguins.csv",
    na_values=missing_values,
)

In [None]:
# Per-column flag: True where the column holds at least one missing value.
# NOTE(review): the expected number of affected columns was noted as 7 --
# confirm against the output.
df.isna().any()

In [None]:
# Encode the sex column numerically without inventing data for missing entries.
def binarize_sex(val):
    """Map a sex label to a numeric indicator.

    Parameters
    ----------
    val : object
        Raw cell value from the ``sex`` column.

    Returns
    -------
    int or float
        ``1`` for male and ``0`` for female (matched case-insensitively,
        ignoring surrounding whitespace). Values that are already ``0`` or
        ``1`` are returned unchanged, so applying the function twice is
        harmless. Anything else -- a missing value or an unrecognised label --
        yields ``nan`` so that a later ``dropna`` removes the row instead of
        it being silently counted as female.
    """
    if isinstance(val, str):
        label = val.strip().lower()
        if label == 'male':
            return 1
        if label == 'female':
            return 0
    elif isinstance(val, (int, float)) and val in (0, 1):
        # Already encoded (e.g. the cell was re-run): keep the value.
        return int(val)
    # NaN / None / unknown label: no usable sex information.
    return float('nan')

In [None]:
# Replace the text labels in the sex column with their numeric encoding.
df = df.assign(sex=df['sex'].apply(binarize_sex))

In [None]:
# Keep only rows with no missing value in any column. Rebinding the name
# (instead of inplace=True) leaves this cell safe to re-run and matches the
# non-mutating style used later in the notebook.
df = df.dropna(axis=0, how="any")

In [None]:
# After dropping incomplete rows every column should report False here.
df.isna().any()

In [None]:
# Visual check for remaining gaps: the heatmap draws the boolean
# "is missing" mask of the frame, one cell per entry.
df = df.dropna(axis=0, how="any")
sns.heatmap(df.isnull())
plt.show()

In [None]:
### The heatmap visualization shows that there are no null values and the data is ready for further exploration.

# Summary Statistics

In [None]:
# Count, mean, std, min, quartiles and max of the numeric columns.
df.describe()

#### Describe column names

In [None]:
# Column labels of the cleaned frame.
df.columns

In [None]:
# Average bill depth (mm) over the cleaned rows.
df.loc[:, 'bill_depth_mm'].mean()

In [None]:
# Average bill length (mm) over the cleaned rows.
df.loc[:, 'bill_length_mm'].mean()

In [None]:
# Middle value of body mass (g) over the cleaned rows.
df.loc[:, 'body_mass_g'].median()

In [None]:
# Number of non-null entries in the species column, i.e. how many rows carry
# a species label -- not the number of distinct species and not a per-species
# tally.
df['species'].count()

In [None]:
# First five rows of the cleaned frame.
df.head()

# Data Visualization

In [None]:
# Share of each species in the cleaned data.
species_counts = df["species"].value_counts()
# Take the labels from the counts' own index: value_counts() orders by
# frequency while unique() orders by first appearance, so pairing the two can
# attach the wrong species name to a wedge.
plt.pie(species_counts, labels=species_counts.index, autopct="%1.1f%%")
plt.title("Penguins per species")
plt.show()

In [None]:
# Rows per species, indexed by species name.
c = (
    df
    .groupby(by='species')['species']
    .count()
)

In [None]:
# Number of rows for each penguin species (the grouped count computed above).
c

In [None]:
# Bar chart of the number of penguins per species.
# The column is passed as x=...: seaborn >= 0.12 accepts the data variables
# as keywords only, so the positional form fails there. This also matches the
# island count plot elsewhere in the notebook.
sns.countplot(x='species', data=df, palette=('DarkOrange', 'MediumOrchid', 'Teal'))
plt.show()

In [None]:
# Number of penguins recorded on each island.
sns.countplot(data=df, x="island")

In [None]:
# Body mass per island, with one bar per species on each island.
sns.barplot(
    data=df,
    x="island",
    y="body_mass_g",
    hue='species',
)

In [None]:
# Pairwise relationships between the numeric columns, coloured by species
# (default palette).
sns.pairplot(df, hue='species')

In [None]:
# Bill length against bill depth, coloured by species (default palette).
sns.scatterplot(
    x=df['bill_length_mm'],
    y=df['bill_depth_mm'],
    hue=df['species'],
)

In [None]:
# Body mass against flipper length, coloured by species (default palette).
sns.scatterplot(
    x=df['body_mass_g'],
    y=df['flipper_length_mm'],
    hue=df['species'],
)

In [None]:
# Correlation matrix of the numeric columns only. species and island hold
# text, and DataFrame.corr() raises on non-numeric columns from pandas 2.0 on;
# selecting the numeric columns first works on old and new versions alike.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr)

# Data Partition

In [None]:
# Split the penguin measurements themselves (not synthetic points) into a
# training part and a held-out test part.
feature_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']
X = df[feature_columns]
y = df['species']
# random_state makes the split identical on every run; stratify=y keeps the
# species proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Decision Tree Algorithm

In [None]:
# Input columns fed to the classifier (kept as a list so that df[parameters]
# selects several columns).
parameters = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'sex',
]
# Column the classifier learns to predict.
result = "species"

In [None]:
# Decision tree limited to 5 levels, fitted on every cleaned row.
# random_state pins the feature shuffling scikit-learn performs while
# searching for splits, so equally good splits are resolved the same way on
# every run and the fitted tree (and its rendering) is reproducible.
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(df[parameters], df[result])

In [None]:
# Accuracy measured on the rows the tree was fitted on is optimistic, so it
# is reported next to a 5-fold cross-validated estimate computed on held-out
# folds. cross_val_score fits clones of `model`; the fitted tree itself is
# left untouched.
cv_scores = cross_val_score(model, df[parameters], df[result], cv=5)
pd.Series({
    'train_accuracy': model.score(df[parameters], df[result]),
    'cv_mean_accuracy': cv_scores.mean(),
    'cv_std': cv_scores.std(),
})

In [None]:
# Export the fitted tree as DOT text (out_file=None returns the string rather
# than writing a file) and let graphviz render it as the cell output.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=parameters,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph