**PREDICTING BODY MASS WITH KNN REGRESSOR**

In [None]:
#Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotnine import *
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
#Load the Palmer penguins measurements into a DataFrame
penguin = pd.read_csv(
    filepath_or_buffer="../input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv"
)

**GENERAL ANALYSIS**

In [None]:
#Preview of the first ten rows
penguin.iloc[:10]

In [None]:
#Summary statistics of the numeric columns (quartiles spelled out explicitly)
penguin.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
#Column dtypes and non-null counts for every variable
penguin.info()

In [None]:
#function showing null per column
def nulator(df):
    """Return the number of missing values found in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        One entry per column holding the count of null cells in it.
    """
    missing_mask = df.isna()
    return missing_mask.sum()
    
    

In [None]:
#Null count per column before cleaning
nulator(df=penguin)

In [None]:
#Discard every row that has at least one missing value
penguin = penguin.dropna(axis=0, how="any")

In [None]:
#Null count per column after cleaning (every entry should now be zero)
nulator(df=penguin)

**VISUALIZING VARIABLE DISTRIBUTIONS**

In [None]:
#Column names as a plain list, so the plotting loop can iterate over them
variables = list(penguin.columns)
variables

In [None]:
#Loop drawing the distribution of every variable.
#Numeric columns get a histogram: a bar chart would count each distinct value separately
#and show spikes instead of a distribution. Categorical columns keep the bar chart of counts.
for value in variables:
    if pd.api.types.is_numeric_dtype(penguin[value]):
        layer = geom_histogram(bins=20, color="black", fill="skyblue")
    else:
        layer = geom_bar(color="black", fill="skyblue")
    graph = ggplot(penguin) + layer + aes(x=value) + theme_bw() + labs(title= "variable distribution" + " " + value)
    print(graph)

In [None]:
#Distinct species labels present in the data
penguin["species"].unique()

In [None]:
#Distinct island labels present in the data
penguin["island"].unique()

In [None]:
#Encode the species labels as integer codes (kept as strings here; cast to int later)
#so they can be used by the numeric steps that follow.
dict_species = {"Adelie":"1",
               "Chinstrap":"2",
               "Gentoo":"3"}
#Assign the result back instead of calling replace(..., inplace=True) on the selected column:
#the in-place form acts on a temporary Series and is not guaranteed to update the frame.
penguin["species"] = penguin["species"].replace(dict_species)

In [None]:
#Encode the island labels as integer codes (kept as strings here; cast to int later)
dict_island = {'Torgersen': "1", 'Biscoe': "2", 'Dream':"3"}
#Assign the result back: replace(..., inplace=True) on a selected column acts on a
#temporary Series and is not guaranteed to update the frame.
penguin["island"] = penguin["island"].replace(dict_island)

In [None]:
#Encode sex as 0/1 (kept as strings here; cast to int later).
#Values not listed in the mapping are left untouched by replace().
dict_sex = {"FEMALE":"0", "MALE":"1"}
#Assign the result back: replace(..., inplace=True) on a selected column acts on a
#temporary Series and is not guaranteed to update the frame.
penguin["sex"] = penguin["sex"].replace(dict_sex)


In [None]:
#The int conversion of the sex column failed in a later step, so we inspect which values it actually holds:
penguin["sex"].value_counts()

In [None]:
#Locate the rows whose sex value is the invalid placeholder "."
penguin.loc[penguin["sex"].eq(".")]

In [None]:
#Remove the rows whose sex is the invalid placeholder ".".
#Filtering by value avoids depending on a hard-coded index label.
penguin = penguin[penguin["sex"] != "."]

In [None]:
#int type is applied to the encoded (non-float) columns
penguin = penguin.astype({"species": "int", "island": "int"})

In [None]:
#Same int conversion for the sex column
penguin = penguin.astype({"sex": "int"})

In [None]:
#Based on the data, we decide to build a predictor for body mass.
#Y is the target column; X holds every remaining column.
X = penguin.drop(columns=["body_mass_g"])
Y = penguin.loc[:, "body_mass_g"]

In [None]:
#Pearson correlation matrix drawn as a heatmap.
#If two variables are too correlated with one another, one of them should be removed.
matriz_correlaciones = penguin.corr(method="pearson")
n_ticks = len(penguin.columns)
fig_corr, ax_corr = plt.subplots(figsize=(9, 9))
ax_corr.set_xticks(range(n_ticks))
ax_corr.set_xticklabels(penguin.columns, rotation="vertical")
ax_corr.set_yticks(range(n_ticks))
ax_corr.set_yticklabels(penguin.columns)
mapa_calor = ax_corr.imshow(matriz_correlaciones, interpolation="nearest", vmin=-1., vmax=1, cmap=plt.get_cmap("Oranges"))
fig_corr.colorbar(mapa_calor, ax=ax_corr)
ax_corr.set_title("Matriz de correlación de Pearson")

In [None]:
#Further look at correlations: correlation of every feature with the target (body_mass_g),
#sorted by absolute value, strongest first.
#The target column is selected by name and the labels come from the same Series,
#so names and values cannot get misaligned regardless of column order.
correlaciones_target = matriz_correlaciones["body_mass_g"].drop("body_mass_g")
indices_inversos = correlaciones_target.abs().to_numpy().argsort()[::-1]
diccionario = dict(zip(correlaciones_target.index[indices_inversos],
                       correlaciones_target.to_numpy()[indices_inversos]))
pd.DataFrame.from_dict(diccionario, orient='index', columns=['Correlación con la target'])

In [None]:
#Species and island were judged to have very low correlation with the target (body mass), so they are removed
X = X.drop(columns=["species", "island"])

In [None]:
#Standardize the features: fit the scaler on X, then transform X with it
obj_escalar = StandardScaler().fit(X)
X_standardized = obj_escalar.transform(X)

**MODEL DESIGN**

In [None]:
#Train-test division on the unscaled features. The scaler is then fitted on the
#training rows only, so no statistics of the test rows leak into the model inputs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.3, random_state=4)
escalador_train = StandardScaler().fit(X_train)
X_train = escalador_train.transform(X_train)
X_test = escalador_train.transform(X_test)

In [None]:
#Baseline KNN regressor with 5 neighbours, created and fitted in one step (fit returns the estimator)
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

#Score of the baseline on the held-out test set
knn.score(X_test, y_test)

In [None]:
def plot_complexity_curve(k_list, knn_model, x_train, x_test, y_train, y_test, score_label="Score"):
    """Plot training and testing scores of a k-NN model for several values of k.

    For every k in ``k_list`` a model is built with ``knn_model(k)``, fitted on the
    training data and scored on both the training and the testing data with the
    model's own ``score`` method. The two score curves are drawn on one figure.

    Parameters
    ----------
    k_list : iterable of int
        Numbers of neighbours to evaluate; also used as the x-axis values.
    knn_model : callable
        Called as ``knn_model(k)``; must return an object with ``fit`` and ``score``.
    x_train, x_test : array-like
        Feature data for training and testing.
    y_train, y_test : array-like
        Target data for training and testing.
    score_label : str, optional
        Name of the plotted metric, used in the legend and the y-axis label.
        The default is neutral because the meaning of ``score`` depends on the model
        (for scikit-learn regressors it is R^2, not an accuracy).

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    train_scores = []
    test_scores = []

    # Fit one model per k and record how it scores on both data sets
    for k in k_list:
        knn = knn_model(k)
        knn.fit(x_train, y_train)

        train_scores.append(knn.score(x_train, y_train))
        test_scores.append(knn.score(x_test, y_test))

    # Plot both curves against k
    fig, ax = plt.subplots()

    ax.plot(k_list, train_scores, label='Training ' + score_label, color='red')
    ax.plot(k_list, test_scores, label='Testing ' + score_label, color='black')

    ax.set(title='k-NN with Different Values for $k$',
           xlabel='Number of Neighbors',
           ylabel=score_label)

    ax.legend()

In [None]:
#We inspect the score curves for k = 1..49 to find out which value of k works best.
neighbors = np.arange(start=1, stop=50)
plot_complexity_curve(
    k_list=neighbors,
    knn_model=KNeighborsRegressor,
    x_train=X_train,
    x_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
#We record the test score obtained for each k from 1 to 49 (range(1, 50)).
#neighbors_value maps score -> k, so the best score can be looked up with max().
neighbors_value = {}
for n in range(1, 50):
    # Initialize kNN with n neighbours
    knn = KNeighborsRegressor(n_neighbors=n)

    # Fit and score
    knn.fit(X_train, y_train)
    neighbors_value[knn.score(X_test, y_test)] = n

In [None]:
#Highest test score found (the dict keys are the scores)
print(max(neighbors_value.keys()))

In [None]:
#Number of neighbours that produced the highest test score
print(neighbors_value[max(neighbors_value)])

In [None]:
#We build the final model with the k that achieved the highest test score.
#k is read from neighbors_value (score -> k) rather than hard-coded, so this cell
#stays consistent with the search results whenever the data or the split change.
best_k = neighbors_value[max(neighbors_value)]

#KNN model starting
knn_best = KNeighborsRegressor(n_neighbors=best_k)

# Fitting
knn_best.fit(X_train, y_train)

#Score checking
knn_best.score(X_test, y_test)

**THE MODEL RETURNED AN R² SCORE OF ABOUT 0.85 ON THE TEST SET (THE REGRESSOR'S `score` IS R², NOT AN ACCURACY), WHICH CAN BE CONSIDERED GOOD**