# Problem Type:
Multiclass classification: predict the penguin species from its sex, body measurements, and island.

### Predictors:
- Sex
- Culmen Length (mm)
- Culmen Depth (mm)
- Flipper Length (mm)
- Body Mass (g)
- Island

### Target - Species:
- Adelie
- Chinstrap
- Gentoo

<img src="https://allisonhorst.github.io/palmerpenguins/reference/figures/lter_penguins.png" alt="Drawing" width="400">

# Importing libraries and loading data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# Load the penguin measurements CSV into a DataFrame.
data = pd.read_csv("../input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv")

In [None]:
# Preview the first three rows.
data.head(3)

In [None]:
# Report the (rows, columns) shape of the frame.
print("Shape of Dataset is", data.shape)

In [None]:
# Descriptive statistics as computed by DataFrame.describe().
data.describe()

In [None]:
# Print the column summary produced by DataFrame.info().
data.info()

In [None]:
# For each column, flag whether it contains at least one missing value.
data.isnull().any()

In [None]:
# Missing values
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints the number of columns in ``df`` and how many of them contain
    missing values, then returns the per-column breakdown.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with the columns
        ``'Missing Values'`` (count) and ``'% of Total Values'`` (rounded to
        one decimal). Only columns with at least one missing value are
        included, sorted by percentage in descending order.
    """
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    # Filter on the count, not the percentage: for a frame with zero rows the
    # percentage is 0/0 = NaN, and NaN != 0 would report every column as missing.
    has_missing = mis_val_table['Missing Values'] != 0
    mis_val_table_cols = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.")
    print("There are " + str(mis_val_table_cols.shape[0])+" columns that have missing values.")
    return mis_val_table_cols

In [None]:
# Tabulate the columns that contain missing values and shade the table
# with the plasma colour map.
miss_values= missing_values_table(data)
miss_values.style.background_gradient(cmap='plasma')

# EDA and Missing Values Treatment

In [None]:
# Class balance of the species column: print the counts, then plot them.
# Only the last expression of a cell is displayed, so the counts are printed
# explicitly instead of being silently discarded.
print(data['species'].value_counts())
# Name the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='species', data=data, palette="gist_ncar")

In [None]:
# Number of records per island: print the counts, then plot them.
# Only the last expression of a cell is displayed, so the counts are printed
# explicitly instead of being silently discarded.
print(data['island'].value_counts())
# Name the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x='island', data=data, palette="cubehelix_r")

In [None]:
# Frequency of each label recorded in the sex column.
data['sex'].value_counts()

In [None]:
# Fill missing sex values with the most frequent label, then recode the
# stray "." entry as "FEMALE".
# The results are assigned back to the column: calling fillna/replace with
# inplace=True on data['sex'] is chained assignment, which operates on a
# temporary object and leaves `data` unchanged under pandas Copy-on-Write.
data['sex'] = data['sex'].fillna(data['sex'].mode()[0])
data['sex'] = data['sex'].replace({".": "FEMALE"})
data['sex'].value_counts()

In [None]:
# Count plot of the cleaned sex column. The column is named by keyword because
# newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x='sex', data=data, palette="cubehelix")

In [None]:
# Mean culmen length for each sex label.
data.groupby('sex')['culmen_length_mm'].mean()

In [None]:
# Median culmen length for each (sex, species) combination.
data.groupby(['sex', 'species'])['culmen_length_mm'].median()

In [None]:
# Replace missing measurements with the median of their own column.
col_to_imput = ['culmen_length_mm', 'culmen_depth_mm','flipper_length_mm', 'body_mass_g']
for item in col_to_imput:
    # Assign the filled column back: fillna(..., inplace=True) on data[item]
    # is chained assignment and does not update `data` under pandas
    # Copy-on-Write.
    data[item] = data[item].fillna(data[item].median())

In [None]:
# Recompute the missing-value summary on the current state of the frame and
# shade it with the Reds colour map.
missing_values= missing_values_table(data)
missing_values.style.background_gradient(cmap='Reds')

In [None]:
# Heatmap of the null mask (True where a value is missing); row tick labels
# are turned off.
sns.heatmap(data.isnull(), yticklabels= False)

In [None]:
# Annotated correlation matrix of the numeric columns.
# species, island and sex are still strings at this point, and recent pandas
# versions raise when corr() meets non-numeric columns, so restrict the frame
# to numeric dtypes explicitly.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap="magma")

In [None]:
# Pair plot of the frame's variables, coloured by species.
sns.pairplot(data,hue='species')

In [None]:
# Same pair plot, coloured by sex with the Dark2 palette.
sns.pairplot(data,hue='sex', palette="Dark2" )

In [None]:
# One box plot per measurement on a 2x2 grid: species on the x axis, split by sex.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
measurements = ['flipper_length_mm', 'culmen_length_mm', 'culmen_depth_mm', 'body_mass_g']
# axes.flat walks the grid row by row, giving the panel order
# [0,0], [0,1], [1,0], [1,1].
for panel, measurement in zip(axes.flat, measurements):
    sns.boxplot(x=data.species, y=data[measurement], hue=data.sex, ax=panel)

In [None]:
# Distribution plot of each measurement, one panel per column on a 2x2 grid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; switch
# to histplot/displot once the minimum seaborn version is confirmed.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
plotted_columns = ['flipper_length_mm', 'culmen_length_mm', 'culmen_depth_mm', 'body_mass_g']
# axes.flat walks the grid row by row, giving the panel order
# [0,0], [0,1], [1,0], [1,1].
for panel, column_name in zip(axes.flat, plotted_columns):
    sns.distplot(data[column_name], ax=panel)

In [None]:
# Histogram of every measurement, faceted by species (columns) and sex (rows).
col_list = ['culmen_length_mm', 'culmen_depth_mm','flipper_length_mm', 'body_mass_g']
col = 'species'
row = 'sex'
for i in col_list:
    # `height` is the current name of the per-facet size argument; the old
    # `size` spelling was deprecated in seaborn 0.9 and is not accepted by
    # newer releases.
    grid = sns.FacetGrid(data, col=col, row=row, height=2.2, aspect=1.6)
    grid.map(plt.hist, i, alpha=.5, bins=20)
    grid.add_legend()

In [None]:
# Flipper length against body mass, one scatter panel per (island, sex) pair.
flipper_mass_grid = sns.FacetGrid(data=data, row="island", col="sex")
flipper_mass_grid = flipper_mass_grid.map(plt.scatter, "flipper_length_mm", "body_mass_g")
flipper_mass_grid.add_legend()

In [None]:
# Culmen length against culmen depth, one scatter panel per (island, sex) pair.
culmen_grid = sns.FacetGrid(data=data, row="island", col="sex")
culmen_grid = culmen_grid.map(plt.scatter, 'culmen_length_mm', 'culmen_depth_mm')
culmen_grid.add_legend()

# Importing Model Libraries and Model Building

In [None]:
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Label Encoding the Data

In [None]:
# Encode each categorical column as integers.
# Every column gets its own fitted encoder so its integer -> label mapping
# stays available afterwards (e.g. encoders['island'].inverse_transform(...)).
# Refitting one shared encoder keeps only the mapping of the last column.
encoders = {}
for column in ['sex', 'island', 'species']:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# `le` stays defined and, as before, ends up fitted on the species column.
le = encoders['species']

In [None]:
# Preview the frame after label encoding.
data.head()

In [None]:
#defining logistic regression model
def logreg(X, y):
    """Fit a logistic regression on a hold-out split of ``(X, y)`` and print its scores.

    The data is split 75/25 with a fixed seed, the model is fitted on the
    training part, and accuracy plus weighted F1, precision and recall on the
    validation part are printed.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target labels aligned with ``X``.

    Returns:
        None. The metrics are only printed.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
    # The default max_iter=100 can stop the solver before it converges on
    # features of very different magnitude, and this notebook silences the
    # resulting warning, so give the solver more room.
    # NOTE(review): standardising the features would be the more thorough fix.
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    print('Accuracy : ', accuracy_score(y_val, y_pred))
    print('F1 Score : ', f1_score(y_val, y_pred, average = 'weighted'))
    print('Precision : ', precision_score(y_val, y_pred, average = 'weighted'))
    print('Recall : ', recall_score(y_val, y_pred, average = 'weighted'))

In [None]:
#defining decision tree model
def DesTre(X, y):
    """Fit a decision tree on a hold-out split of ``(X, y)`` and print its scores.

    The data is split 75/25 with a fixed seed, an entropy-criterion tree is
    fitted on the training part, and accuracy plus weighted F1, precision and
    recall on the validation part are printed.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target labels aligned with ``X``.

    Returns:
        None. The metrics are only printed.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
    # Seed the tree as well as the split: features are permuted randomly at
    # each split, so an unseeded tree can score differently from run to run.
    dtc = DecisionTreeClassifier(criterion='entropy', random_state=42)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_val)
    print('Accuracy : ', accuracy_score(y_val, y_pred))
    print('F1 Score : ', f1_score(y_val, y_pred, average = 'weighted'))
    print('Precision : ', precision_score(y_val, y_pred, average = 'weighted'))
    print('Recall : ', recall_score(y_val, y_pred, average = 'weighted'))

In [None]:
#defining random forest classifier model
def rfc(X, y):
    """Fit a random forest on a hold-out split of ``(X, y)`` and print its scores.

    The data is split 75/25 with a fixed seed, the forest is fitted on the
    training part, and accuracy plus weighted F1, precision and recall on the
    validation part are printed.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target labels aligned with ``X``.

    Returns:
        None. The metrics are only printed.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
    # Seed the forest as well as the split so the printed metrics are
    # reproducible. The local is named `forest` so it does not shadow this
    # function's own name.
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_val)
    print('Accuracy : ', accuracy_score(y_val, y_pred))
    print('F1 Score : ', f1_score(y_val, y_pred, average = 'weighted'))
    print('Precision : ', precision_score(y_val, y_pred, average = 'weighted'))
    print('Recall : ', recall_score(y_val, y_pred, average = 'weighted'))

In [None]:
#defining k-nearest neighbours model
def knn(X, y):
    """Fit a k-nearest-neighbours classifier on a hold-out split and print its scores.

    The data is split 75/25 with a fixed seed, a distance-weighted classifier
    with 10 neighbours is fitted on the training part, and accuracy plus
    weighted F1, precision and recall on the validation part are printed.

    Args:
        X: Feature matrix accepted by scikit-learn estimators.
        y: Target labels aligned with ``X``.

    Returns:
        None. The metrics are only printed.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
    # n_jobs=-1 uses the available processors; a hard-coded 100 requests far
    # more parallel workers than this small search can use.
    # The local is named `neighbours` so it does not shadow this function's
    # own name.
    # NOTE(review): the features are not scaled, so the distances are likely
    # dominated by the largest-valued column. Confirm whether that is intended.
    neighbours = KNeighborsClassifier(n_neighbors=10, weights='distance', n_jobs=-1)
    neighbours.fit(X_train, y_train)
    y_pred = neighbours.predict(X_val)
    print('Accuracy : ', accuracy_score(y_val, y_pred))
    print('F1 Score : ', f1_score(y_val, y_pred, average = 'weighted'))
    print('Precision : ', precision_score(y_val, y_pred, average = 'weighted'))
    print('Recall : ', recall_score(y_val, y_pred, average = 'weighted'))

## Species Model

In [None]:
# Species is the target; every other column is a predictor.
y = data['species']
X = data.drop(columns='species')

In [None]:
# Evaluate each of the four classifiers with species as the target.
logreg(X,y)

In [None]:
DesTre(X,y)

In [None]:
rfc(X,y)

In [None]:
knn(X,y)

# Gender prediction

In [None]:
# Sex is the target; every other column is a predictor.
y_gender = data['sex']
X_gender = data.drop(columns='sex')

In [None]:
# Evaluate each of the four classifiers with sex as the target.
logreg(X_gender,y_gender)

In [None]:
DesTre(X_gender,y_gender)

In [None]:
rfc(X_gender,y_gender)

In [None]:
knn(X_gender,y_gender)

# Island prediction

In [None]:
# Island is the target; every other column is a predictor.
y_island = data['island']
X_island = data.drop(columns='island')

In [None]:
# Evaluate each of the four classifiers with island as the target.
logreg(X_island,y_island)

In [None]:
DesTre(X_island,y_island)

In [None]:
rfc(X_island,y_island)

In [None]:
knn(X_island,y_island)

# The End