# Step-1: Business problem

Our goal is create a model that can help predict a species of a penguin based on physical attributes, then we can use that model to help reserchers classify penguins in the field, insted of nedding an experienced blologist.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [6]:
df=pd.read_csv("penguins_size.csv")
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


# step-2: Data Understanding

Penguins_size.csv: simplified data from original prnguin data set

* species: penguin species (Adelie, chinstrap or Gentoo)
* culmen_lenth_mm:culmen lenth(mm)
* culmen_depth_mm: culmen depth(mm)
* Flipper_lenth_mm:flipper lenth in mm
* body_mass_g:body mass(g)
* island : island name(Dream,Trgersen, or Biscoe)
* Sex: Penguin sex

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


# step-3: Data Preprocessing

In [12]:
df.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [13]:
#Dropping the missing values

df=df.dropna()

In [17]:
df=df[df['sex']!='.']
df.shape

(333, 7)

# X & Y

In [20]:
x=pd.get_dummies(df.drop('species',axis=1),drop_first=True)
y=df['species']

# Train test split

In [22]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=3,random_state=0)

# step-4,5:Modelling and Evaluation

In [24]:
#Random forest classifier with default parameters
from sklearn.ensemble import RandomForestClassifier

model=RandomForestClassifier(random_state=0)
model.fit(x_train,y_train)

#prediction

ypred_train=model.predict(x_train)
ypred_test=model.predict(x_test)

#Evaluation

#1.Train test accuracy

from sklearn.metrics import accuracy_score
print("Train accuracy:",accuracy_score(ypred_train,y_train))
print("Test accuracy:",accuracy_score(ypred_test,y_test))

#2 .Cross validation score

from sklearn.model_selection import cross_val_score
scores=cross_val_score(model,x,y,cv=5)
print("Cross Validation Score:",scores.mean())

Train accuracy: 1.0
Test accuracy: 1.0
Cross Validation Score: 0.9849841700587969


# Importans of each feature given by this model

In [25]:
model.feature_importances_

array([0.33184081, 0.17872902, 0.27204886, 0.08056588, 0.1093975 ,
       0.02240916, 0.00500878])

In [26]:
pd.DataFrame(index=x.columns,data=model.feature_importances_,columns=["Feature Importance"])



Unnamed: 0,Feature Importance
culmen_length_mm,0.331841
culmen_depth_mm,0.178729
flipper_length_mm,0.272049
body_mass_g,0.080566
island_Dream,0.109397
island_Torgersen,0.022409
sex_MALE,0.005009


# Hyper Parameter tunning

In [29]:
from sklearn.model_selection import GridSearchCV

#model 
estimator = RandomForestClassifier(random_state=0)

#parameters (which you want to tune and identify the best)

param_grid = {'n_estimators':list(range(1,101))}

grid = GridSearchCV(estimator,param_grid,scoring='accuracy',cv=5)

grid.fit(x_train,y_train)

grid.best_params_

{'n_estimators': 7}

# Random Forest model With best Hyperparameters

In [33]:
model=RandomForestClassifier(n_estimators=8,random_state=0)

model.fit(x_train,y_train)

ypred_train=model.predict(x_train)
ypred_test=model.predict(x_test)

print("Train Accuracy",accuracy_score(ypred_train,y_train))
print("Test Accuracy",accuracy_score(ypred_test,y_test))

scores=cross_val_score(model,x,y,cv=5)
print("Cross_val_score:",scores.mean())

Train Accuracy 0.9939393939393939
Test Accuracy 1.0
Cross_val_score: 0.9819990954319312
