In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load seaborn's "penguins" example dataset into a DataFrame.
df = sns.load_dataset(name='penguins')

In [3]:
# Dimensions of the raw dataset as (rows, columns).
df.shape

(344, 7)

In [4]:
# Never truncate the column list when a frame is displayed (applies to all later cells).
pd.options.display.max_columns = None

# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [5]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [6]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [8]:
# Sanity check: every column should now report zero missing values.
df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [9]:
# Preview the data after rows with missing values were removed
# (the original index labels are kept, so gaps in the index are expected).
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [10]:
# Convert the categorical `sex` column into a single numeric indicator (1 = Male, 0 = Female).
#
# pd.get_dummies creates one indicator (dummy) column per unique category and
# returns a DataFrame; drop_first=True drops the first category so that a single
# column remains for a two-level variable.
#   * .iloc[:, 0] selects that one column explicitly instead of assigning a
#     DataFrame to a column.
#   * dtype=int keeps the result as 0/1 integers; newer pandas versions would
#     otherwise produce True/False.
df['sex'] = pd.get_dummies(df['sex'], drop_first=True, dtype=int).iloc[:, 0]

In [11]:
# Preview the data with `sex` now encoded numerically.
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,1


In [12]:
# Convert `island` into numeric indicator columns, one per island.
# drop_first=True drops the first island level, which becomes the baseline
# (a row with 0 in every remaining indicator column belongs to it).
# dtype=int keeps the indicators as 0/1 integers; newer pandas versions would
# otherwise produce True/False.
island = pd.get_dummies(df['island'], drop_first=True, dtype=int)

In [13]:
# Append the island indicator columns to df side by side (aligned on the row index).
# The encoding produced new columns, so they have to be joined back onto the frame.
df2 = pd.concat(objs=[df, island], axis='columns')

In [14]:
# Dimensions after cleaning and adding the island indicator columns, as (rows, columns).
df2.shape

(333, 9)

In [15]:
# Preview the frame with the island indicator columns appended.
df2.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,1,0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,0,0,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,0,0,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,0,0,1
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,1,0,1


In [16]:
# The original text column is redundant now that the indicator columns exist.
df2.drop(columns='island', inplace=True)

In [17]:
# The target is the species label. It will be mapped to integers by hand
# rather than with a label encoder, so list the distinct labels first.
y = df2['species']

y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [18]:
# Encode the species labels as integer class ids.
# The mapping reads from df['species'], which is never modified and shares its
# rows and index with df2, instead of from `y` itself. Re-running this cell
# therefore gives the same result rather than mapping already-encoded values to NaN.
species_codes = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}
y = df['species'].map(species_codes)

# A label missing from the mapping would silently become NaN; fail loudly instead.
assert y.notna().all(), "species column contains a label missing from species_codes"

In [19]:
# Remove the target from the feature frame so the model cannot see the answer.
df2.drop(columns='species', inplace=True)

In [20]:
# Preview the final feature matrix (all columns numeric, target removed).
df2.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen
0,39.1,18.7,181.0,3750.0,1,0,1
1,39.5,17.4,186.0,3800.0,0,0,1
2,40.3,18.0,195.0,3250.0,0,0,1
4,36.7,19.3,193.0,3450.0,0,0,1
5,39.3,20.6,190.0,3650.0,1,0,1


In [21]:
# Split features and target into training and test sets.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df2,
    y,
    test_size=0.3,
    random_state=42,
)

In [22]:
# Train a random forest classifier and predict the species of the test rows.
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fixing random_state makes the fitted model, and every metric computed from it
# below, reproducible across kernel restarts.
rfc = RandomForestClassifier(random_state=42)

rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)

In [23]:
# Confusion matrix of true vs. predicted classes on the test set.
from sklearn.metrics import confusion_matrix

# In scikit-learn's layout, rows are the true class and columns the predicted class.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(cm)

[[48  0  0]
 [ 1 17  0]
 [ 0  0 34]]


In [24]:
# Accuracy score: fraction of test rows whose predicted class matches the true class.
from sklearn.metrics import accuracy_score

ac = accuracy_score(y_true=y_test, y_pred=y_pred)

print(ac)

0.99


In [25]:
# Over-/under-fitting check: compare accuracy on the training split with accuracy
# on the held-out test split.
# NOTE: despite their names, `bias` and `variance` hold plain accuracy scores
# (training and test respectively), not statistical bias or variance. A large gap
# between the two would suggest over-fitting.
bias, variance = rfc.score(x_train, y_train), rfc.score(x_test, y_test)

print(bias, variance)

1.0 0.99
