In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load seaborn's "penguins" sample dataset and preview the first five rows.
df = sns.load_dataset(name="penguins")
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [None]:
# List the column names of the loaded frame.
df.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex'],
      dtype='object')

In [None]:
# (rows, columns) of the dataset before any cleaning.
df.shape

(344, 7)

In [None]:
# Count rows per species to see how balanced the target classes are.
df['species'].value_counts()

Adelie       152
Gentoo       124
Chinstrap     68
Name: species, dtype: int64

In [None]:
# Number of missing values in each column of the raw data.
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [None]:
# Drop every row that has at least one missing value.
# Rebinding `df` instead of using `inplace=True` avoids in-place mutation and
# keeps the call chainable.
df = df.dropna()

In [None]:
# Confirm that no missing values remain after dropping incomplete rows.
df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

Feature engineering:

One-hot encoding transforms categorical columns into numerical indicator (0/1) columns.

In [None]:
# Distinct values of `sex` once rows with missing data have been dropped.
df["sex"].unique()

array(['Male', 'Female'], dtype=object)

In [None]:
# One-hot encoding turns each distinct category into its own indicator column,
# so it is best suited to columns with few unique values.
# `dtype=int` pins 0/1 output; pandas >= 2.0 would otherwise return booleans.
pd.get_dummies(df["sex"], dtype=int).head()

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


In [None]:
# Keep a single indicator column: `drop_first=True` drops the redundant first
# level, leaving only `Male`.
# `dtype=int` pins 0/1 values; pandas >= 2.0 would otherwise return booleans.
sex = pd.get_dummies(df["sex"], drop_first=True, dtype=int)

In [None]:
# Preview the encoded `sex` indicator column.
sex.head()

Unnamed: 0,Male
0,1
1,0
2,0
4,0
5,1


In [None]:
# Apply one-hot encoding to `island` and preview the indicator columns.
# `dtype=int` pins 0/1 output; pandas >= 2.0 would otherwise return booleans.
pd.get_dummies(df["island"], dtype=int).head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,0,0,1
1,0,0,1
2,0,0,1
4,0,0,1
5,0,0,1


In [None]:
# Encode `island`, dropping the first level so the remaining indicator columns
# are not redundant.
# `dtype=int` pins 0/1 values; pandas >= 2.0 would otherwise return booleans.
island = pd.get_dummies(df["island"], drop_first=True, dtype=int)

In [None]:
# Preview the encoded `island` indicator columns.
island.head()

Unnamed: 0,Dream,Torgersen
0,0,1
1,0,1
2,0,1
4,0,1
5,0,1


Concatenate the two dummy-encoded dataframes above (`island` and `sex`) with the original dataframe.

In [None]:
# Place the indicator columns next to the cleaned frame; rows are aligned on
# the shared index.
new_data = pd.concat(objs=[df, island, sex], axis="columns")

In [None]:
# Preview the combined frame (original columns plus indicator columns).
new_data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen,Male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,0,1,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,1,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,1,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0,1,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,0,1,1


In [None]:
# Remove the raw categorical columns now that their indicator columns exist.
# Rebinding with `errors="ignore"` keeps the cell idempotent: re-running it no
# longer raises KeyError once the columns are already gone.
new_data = new_data.drop(columns=["sex", "island"], errors="ignore")

In [None]:
# Preview the frame after removing the raw `sex` and `island` columns.
new_data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,Adelie,39.1,18.7,181.0,3750.0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,1,1


In [None]:
# Target vector: the species label of each remaining row.
y = new_data["species"]
y.head(n=5)

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [None]:
# Distinct species labels that need an integer encoding.
y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [None]:
# Encode the species labels as integer class ids.
# Map from `df["species"]` rather than from `y` itself: re-running
# `y = y.map(...)` would feed the already-encoded integers through a
# string-keyed dict and silently turn every label into NaN.
y = df["species"].map({"Adelie": 0, "Chinstrap": 1, "Gentoo": 2})
# Fail loudly if a label is missing from the mapping instead of training on NaN.
assert y.notna().all(), "unmapped species label"
y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

Drop the target column (`species`) from the feature dataframe.

In [None]:
# Remove the target column so the frame holds features only.
# Rebinding with `errors="ignore"` keeps the cell idempotent: re-running it no
# longer raises KeyError once `species` is already gone.
new_data = new_data.drop(columns="species", errors="ignore")

In [None]:
# Preview the feature-only frame.
new_data.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,39.1,18.7,181.0,3750.0,0,1,1
1,39.5,17.4,186.0,3800.0,0,1,0
2,40.3,18.0,195.0,3250.0,0,1,0
4,36.7,19.3,193.0,3450.0,0,1,0
5,39.3,20.6,190.0,3650.0,0,1,1


In [None]:
# Feature matrix passed to the train/test split (same object as `new_data`, not a copy).
x = new_data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.3,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# First model: a 5-tree random forest using the entropy split criterion,
# seeded for reproducibility.
classifier = RandomForestClassifier(
    criterion="entropy",
    n_estimators=5,
    random_state=0,
)

In [None]:
# Train the first forest on the training split.
classifier.fit(X=xtrain, y=ytrain)

In [None]:
# Predict class ids for the held-out rows.
y_pred = classifier.predict(X=xtest)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=ytest, y_pred=y_pred)

array([[48,  0,  0],
       [ 2, 14,  0],
       [ 0,  0, 36]])

In [None]:
# Fraction of held-out rows the first forest classified correctly.
accuracy_score(y_true=ytest, y_pred=y_pred)

0.98

In [None]:
# Per-class precision, recall and F1 for the first forest.
print(classification_report(y_true=ytest, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        48
           1       1.00      0.88      0.93        16
           2       1.00      1.00      1.00        36

    accuracy                           0.98       100
   macro avg       0.99      0.96      0.97       100
weighted avg       0.98      0.98      0.98       100



In [None]:
# Second model for comparison: 7 trees with the Gini split criterion, same seed.
classifier2 = RandomForestClassifier(
    criterion="gini",
    n_estimators=7,
    random_state=0,
)

In [None]:
# Train the second forest on the same split and predict the held-out rows.
classifier2.fit(X=xtrain, y=ytrain)
y_pred2 = classifier2.predict(X=xtest)

In [None]:
# Fraction of held-out rows the second forest classified correctly.
accuracy_score(y_true=ytest, y_pred=y_pred2)

0.99

In [None]:
# Per-class precision, recall and F1 for the second forest.
print(classification_report(y_true=ytest, y_pred=y_pred2))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        48
           1       1.00      0.94      0.97        16
           2       1.00      1.00      1.00        36

    accuracy                           0.99       100
   macro avg       0.99      0.98      0.99       100
weighted avg       0.99      0.99      0.99       100

