In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#importing data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
#reading the penguins CSV into the penguin_data DataFrame and previewing its first five rows
penguin_data = pd.read_csv(
    '/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv'
)
penguin_data.head(5)

In [None]:
#dimensions of the dataset: shape is (number of rows, number of columns)
rows = penguin_data.shape[0]
columns = penguin_data.shape[1]
print('No. of rows:', rows, ' Columns:', columns)

In [None]:
#column names, non-null counts and dtypes of the dataset
penguin_data.info()

In [None]:
#Descriptive Statistics: summary statistics as reported by DataFrame.describe()
#(by default only the numeric columns are summarised)
penguin_data.describe()

In [None]:
#per-column count of missing values, keeping only the columns that actually have some
penguin_data.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Most common value of the 'sex' column; it replaces the '.' placeholder entry here
# and is reused later to fill the missing values of the same column.
most_frequent = penguin_data['sex'].mode()[0]
# Assign the result back to the column instead of calling replace(..., inplace=True) on
# `penguin_data.sex`: that form is a chained in-place operation on a temporary Series,
# which pandas 2.x warns about and which leaves the DataFrame unchanged under Copy-on-Write.
penguin_data['sex'] = penguin_data['sex'].replace('.', most_frequent)

In [None]:
#replacing nan values in the sex column with its most frequent value (most_frequent)
# Assign back rather than using fillna(..., inplace=True) on `penguin_data.sex`: the chained
# in-place call operates on a temporary Series and does not update the DataFrame under
# pandas Copy-on-Write, which would leave the NaNs in place.
penguin_data['sex'] = penguin_data['sex'].fillna(most_frequent)

In [None]:
#mean-impute the missing entries of every int64/float64 column
from sklearn.impute import SimpleImputer
numerical_features = [
    name
    for name, dtype in penguin_data.dtypes.items()
    if dtype in ['int64', 'float64']
]
imputer = SimpleImputer(strategy='mean')
# fit_transform returns an array with the same column order, so it can be assigned straight back
penguin_data[numerical_features] = imputer.fit_transform(penguin_data[numerical_features])

In [None]:
#now checking if we have any missing values left: per-column null counts after the imputation steps
penguin_data.isnull().sum()

**We have imputed the missing values; now let's explore the data.**

In [None]:
#number of penguins of each sex, with one bar per species
sns.countplot(data=penguin_data, x='sex', hue='species')
#all species have a similar number of male and female penguins

In [None]:
#correlation between the numeric columns in the dataset
# The frame still holds text columns at this point (e.g. species, island, sex) and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so keep numeric columns only.
sns.heatmap(penguin_data.select_dtypes(include='number').corr(), annot=True)

In [None]:
#pairwise scatter plots of the numeric columns, coloured by species
# `palette` only colours the levels of a `hue` variable; without `hue` it is silently ignored,
# so map hue to the species column for the cubehelix palette to take effect.
sns.pairplot(penguin_data, kind='scatter', hue='species', palette='cubehelix', dropna=True)

In [None]:
#countplot: number of penguins recorded for each species
sns.countplot(data=penguin_data, x='species', palette='cubehelix')

In [None]:
#dummy-encode the sex column; drop_first=True drops the indicator of the first category
encoding_sex = pd.get_dummies(penguin_data['sex'], drop_first=True)
encoding_sex
#in the remaining column 0 refers to female and 1 to male

In [None]:
#swap the text sex column for its dummy encoding: drop it, then append encoding_sex column-wise
penguin_data = pd.concat(
    [penguin_data.drop(columns='sex'), encoding_sex],
    axis=1,
)

In [None]:
#dummy-encode the island column; drop_first=True drops the indicator of the first category
encoding_island = pd.get_dummies(penguin_data['island'], drop_first=True)
encoding_island
#(Dream, Torgersen) = (0,1) for Torgersen, (1,0) for Dream; (0,0) is the dropped first island

In [None]:
#swap the text island column for its dummy encoding: drop it, then append encoding_island column-wise
penguin_data = pd.concat(
    [penguin_data.drop(columns='island'), encoding_island],
    axis=1,
)

In [None]:
#model inputs: the four body measurements followed by the dummy columns for sex and island
features = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'MALE',
    'Dream',
    'Torgersen',
]

In [None]:
#Finally it's time to train and test the model: copy out the input columns (X) and the target (y)
X = penguin_data.loc[:, features].copy()
y = penguin_data.loc[:, 'species'].copy()

In [None]:
from sklearn.model_selection import train_test_split
#hold out 25% of the rows for testing; random_state=0 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so that the predictions and every score derived from them are identical on
# each run; random_state=0 matches the seed already used for the train/test split.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
# Predicted species for the held-out rows, used by the evaluation cells that follow
predictions = rfc.predict(X_test)

In [None]:
#confusion matrix and classification report comparing the test labels with the predictions
from sklearn.metrics import confusion_matrix,classification_report
cl_report = classification_report(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

In [None]:
#printing classification report (print renders the report's line breaks as a readable table)
print(cl_report)

In [None]:
#using heatmap to see the confusion matrix
# confusion_matrix orders its rows/columns by the sorted labels seen in y_test and predictions,
# so rebuild that order to put class names on the axes instead of bare indices.
class_labels = sorted(set(y_test) | set(predictions))
# fmt='d' prints the counts as plain integers; the default '.2g' switches to scientific
# notation once a cell reaches three digits.
ax = sns.heatmap(cm, annot=True, fmt='d', cbar=False,
                 xticklabels=class_labels, yticklabels=class_labels)
# Rows of the matrix are the true labels, columns are the predicted labels
ax.set(xlabel='Predicted species', ylabel='Actual species');

In [None]:
#in the end checking the score of our model: mean accuracy of rfc on the held-out test set
rfc.score(X_test,y_test)