### This is an extremely interesting dataset for data visualization

### Basic Import statements

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

### Additional imports

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [None]:
# Load the two CSV files shipped with the Kaggle dataset.
# NOTE(review): `df2` is not referenced by any other cell visible in this notebook.
size_csv_path = '/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv'
lter_csv_path = '/kaggle/input/palmer-archipelago-antarctica-penguin-data/penguins_lter.csv'
df = pd.read_csv(size_csv_path)
df2 = pd.read_csv(lter_csv_path)

# Preview the first rows of the frame used for the rest of the analysis.
df.head()

<img src="https://pbs.twimg.com/media/EaAXQn8U4AAoKUj.jpg:large" alt="penguin beak culmen" style='width: 450px;'></img>

In [None]:
# Scatter of culmen length (x) against flipper length (y), drawn through the
# explicit Axes interface.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(
    df['culmen_length_mm'],
    df['flipper_length_mm'],
    color='orange',
    alpha=0.75,
)
scatter_ax.set_xlabel("Culmen Length", size=16)
scatter_ax.set_ylabel("Flipper Length", size=16)
scatter_ax.set_title("Scatter Plot", size=22)

## Seaborn Heatmap

In [None]:
# Annotated correlation heatmap.
# The frame still contains string columns (species, island, sex); pandas >= 2.0
# raises when `corr()` meets non-numeric data, so restrict to numeric columns
# explicitly. This behaves the same on older pandas versions.
fig, ax = plt.subplots(figsize=(12, 6))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, ax=ax)

In [None]:
# Histogram of culmen length with vertical reference lines for the median (red)
# and the mean (default colour).
# The column is selected by name rather than by position (`df.columns[2]`) so the
# plot cannot silently drift away from its "Culmen Length" axis label, and missing
# values are dropped because this cell runs before the data-cleaning section.
culmen_length = df['culmen_length_mm'].dropna()

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(culmen_length, rwidth=0.8, alpha=0.55)
ax.axvline(culmen_length.median(), color='red', label='Median')
ax.axvline(culmen_length.mean(), label='Mean')
ax.legend()
ax.set_xlabel("Culmen Length")
ax.set_ylabel("Frequency")
ax.set_title("Histogram", size=22)

## Seaborn Pairplot

In [None]:
# Pairwise relationship grid for the frame, coloured by penguin species.
sns.pairplot(data=df, hue='species')

> ### Does a longer culmen correlate with body mass?

### From the above plot, we can see some good opportunities for clustering applications
#### Target:
- **Split the data in 85:15 ratio**

- **Work on training data with cross validation**

- **Design a classification model to check predictions**

## Data cleaning

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the distinct levels of every object-typed column so that unexpected
# category values stand out.
for name, series in df.items():
    if series.dtype != 'object':
        continue
    print('Levels of', name, ':', series.value_counts().keys())

### Sex has an erroneous value

In [None]:
# Treat the '.' entry in the `sex` column as a missing value.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `df['sex']` is chained assignment, which warns on pandas 2.x and leaves `df`
# unchanged when Copy-on-Write is enabled.
df['sex'] = df['sex'].replace('.', np.nan)

# Re-check the per-column missing-value counts after the replacement.
df.isnull().sum()

### **Filling Missing values**
1. ### Filling with median in case of numerical feature
2. ### Filling with mode in case of categorical feature

In [None]:
# Impute missing values column by column: the mode for object (categorical)
# columns, the median for everything else.
# Each filled column is assigned back to `df`: `df[col].fillna(..., inplace=True)`
# is chained assignment, which warns on pandas 2.x and does not modify `df` when
# Copy-on-Write is enabled.
for col in df.columns:
    if df[col].dtypes == 'object':
        fill_value = df[col].mode()[0]
    else:
        fill_value = df[col].median()
    df[col] = df[col].fillna(fill_value)

print(df.isnull().sum())
print("\nAll Null values eliminated")

### Splitting data

In [None]:
# Class balance of the target: number of rows per species.
species_counts = df['species'].value_counts()
species_counts

### **Must ensure stratified splitting of data**

In [None]:
# The species label is the target; every remaining column is a candidate feature.
Y = df['species']
X = df.drop(columns=['species'])
print(X.shape, Y.shape)

In [None]:
# Stratified hold-out split: 85% of the rows go to training, with a fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    train_size=0.85,
    stratify=Y,
    random_state=42,
)

# Dummy-encoded copies of the labels (first level dropped).
# NOTE(review): ytrain2 / ytest2 are not used by any other cell visible here.
ytrain2, ytest2 = (
    pd.get_dummies(labels.values, drop_first=True) for labels in (ytrain, ytest)
)

print(xtrain.shape, ytest.shape)

### Building KNN model on "culmen length" only

In [None]:
# Sweep k over the odd values 3..13 for a k-NN classifier trained on culmen
# length alone, recording the test-set accuracy for each k.
# The single-feature matrices do not depend on k, so they are built once.
train_culmen = xtrain['culmen_length_mm'].values.reshape(-1, 1)
test_culmen = xtest['culmen_length_mm'].values.reshape(-1, 1)

acc_score = []
for i in range(3, 15, 2):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(train_culmen, ytrain.values)
    # `ypreds` keeps the predictions of the last k tried; a later cell prints them.
    ypreds = knn.predict(test_culmen)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc_score.append(accuracy_score(ytest, ypreds))

In [None]:
# Test accuracy as a function of the number of neighbours tried above.
knn_fig, knn_ax = plt.subplots(figsize=(12, 8))
knn_ax.plot(list(range(3, 15, 2)), acc_score, color='orange')
knn_ax.set_xlabel("Neighbors considered", size=16)
knn_ax.set_ylabel("Accuracy score", size=16)
knn_ax.set_title("Elbow plot", size=22)

In [None]:
# Show the first 25 predictions next to the true labels.
# `ypreds` holds the output of the most recently fitted model.
first_predictions = ypreds[:25]
first_actuals = ytest.values[:25]
for predicted, actual in zip(first_predictions, first_actuals):
    print(f"Predicted: {predicted}\t Actual: {actual}")

### Let's try using a Decision Tree

In [None]:
# Depth-limited decision tree. A fixed `random_state` (the same seed as the
# train/test split) makes the fitted tree reproducible across kernel restarts.
dtree = DecisionTreeClassifier(max_depth=5, random_state=42)
dtree

In [None]:
# Integer codes for the three species labels.
mapping = {'Adelie':1,
          'Gentoo':2,
          'Chinstrap':3}

# Encode the labels. Values that are not keys of `mapping` are passed through
# unchanged, so re-running this cell on already-encoded labels keeps them intact
# instead of turning every label into NaN.
ytrain = ytrain.map(lambda label: mapping.get(label, label))
ytest = ytest.map(lambda label: mapping.get(label, label))

In [None]:
def resolve_index_cols(df):
    """Reset the row index and drop the categorical feature columns, in place.

    The existing index is discarded and replaced by a fresh 0..n-1 index, and
    the 'island' and 'sex' columns are removed when present. Calling the
    function again on an already-processed frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenient reassignment.
    """
    # drop=True discards the old index directly instead of materialising it as a
    # column and deleting it afterwards; this also works when the index has a name.
    df.reset_index(drop=True, inplace=True)
    # errors='ignore' keeps a repeated call from raising KeyError once the
    # columns are gone.
    df.drop(columns=['island', 'sex'], errors='ignore', inplace=True)
    return df

In [None]:
# Apply the same index/column clean-up to the training and test feature frames.
xtrain, xtest = (resolve_index_cols(frame) for frame in (xtrain, xtest))

In [None]:
# Train the decision tree on the cleaned training features and encoded labels.
dtree.fit(X=xtrain, y=ytrain)

In [None]:
# Predict species codes for the held-out rows with the fitted tree.
ypreds = dtree.predict(X=xtest)

### Let's test on a varying number of depth levels

In [None]:
# Sweep the maximum tree depth from 3 to 10 and record the test accuracy of each
# fitted tree.
xr = range(3, 11)
scores = []
for i in xr:
    # A fixed seed makes every tree, and therefore the curve, reproducible.
    dt = DecisionTreeClassifier(max_depth=i, random_state=42)
    dt.fit(xtrain, ytrain)
    print(f"Fit DTree with depth {i}")
    preds = dt.predict(xtest)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    scores.append(accuracy_score(ytest, preds))


In [None]:
# Test accuracy of the decision tree as a function of its maximum depth.
depth_fig, depth_ax = plt.subplots(figsize=(12, 6))
depth_ax.plot(xr, scores, color='orange')
depth_ax.set_xlabel("Max Depth", size=16)
depth_ax.set_ylabel("Accuracy score", size=16)
depth_ax.set_title("Elbow plot", size=22)

### Can LightGBM improve upon this?

In [None]:
# LightGBM classifier with 100 estimators.
# NOTE(review): `lgc` is not fitted or used by any other cell visible here.
n_boost_rounds = 100
lgc = LGBMClassifier(n_estimators=n_boost_rounds)

In [None]:
# Sweep the number of boosting estimators (5, 15, ..., 145) and record the test
# accuracy of each LightGBM classifier.
# Note: this rebinds `xr`, which the decision-tree sweep also uses.
xr = range(5, 151, 10)
lgc_scores = []

for i in xr:
    lgc2 = LGBMClassifier(n_estimators=i)
    lgc2.fit(xtrain, ytrain)
    print(f"Fit classifier with estimators {i}")
    lg_preds = lgc2.predict(xtest)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    lgc_scores.append(accuracy_score(ytest, lg_preds))

In [None]:
# Test accuracy of LightGBM as a function of the number of estimators.
# The x axis shows `n_estimators` (the values in `xr`), not a tree depth, so it
# is labelled accordingly.
plt.figure(figsize=(12, 6))
plt.plot(xr, lgc_scores, color='orange')
plt.xlabel("Number of estimators", size=16)
plt.xticks(np.arange(25, 151, 25))
plt.ylabel("Accuracy score", size=16)
plt.title("Elbow plot", size=22)

In [None]:
# Display the raw accuracy values collected in the LightGBM sweep.
lgc_scores

### <font color='green'>Observations:</font>
- **LGBM and Decision Tree both reach the same peak accuracy score**
- **Ensemble is easily comparable to a Decision Tree given the nature of the dataset**