In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, learning_curve, LearningCurveDisplay
from sklearn.metrics import mean_squared_error, f1_score

from numpy import linspace, mean
import plotly.express as px

### Carregar e preparar dados - Classificação

In [2]:
# Load the fruit-quality dataset from the local datasets folder
df = pd.read_csv(filepath_or_buffer='./datasets/fruit_quality.csv')

In [3]:
# Inspect the frame's structure: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   A_id         4000 non-null   int64  
 1   Size         4000 non-null   float64
 2   Weight       4000 non-null   float64
 3   Sweetness    4000 non-null   float64
 4   Crunchiness  4000 non-null   float64
 5   Juiciness    4000 non-null   float64
 6   Ripeness     4000 non-null   float64
 7   Acidity      4000 non-null   float64
 8   Quality      4000 non-null   object 
dtypes: float64(7), int64(1), object(1)
memory usage: 281.4+ KB


In [4]:
# Preview the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,A_id,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good
1,1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good
2,2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3,3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good
4,4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good


In [5]:
# Prepare the DataFrame for modelling.
# Every step is written so the cell can be re-run safely: the original in-place
# version raised KeyError on a second run (column already dropped) and, worse,
# re-encoding the already-numeric 'Quality' column turned every label into 0.

# Drop the identifier column 'A_id'; errors='ignore' makes a re-run a no-op
df = df.drop(columns='A_id', errors='ignore')

# Encode 'Quality' as 1 for 'good' and 0 for any other value.
# Skipped when the column is already numeric, i.e. the cell has already run.
if not pd.api.types.is_numeric_dtype(df['Quality']):
    df['Quality'] = (df['Quality'] == 'good').astype(int)
df.head(5)

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,1
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,1
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,0
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,1
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,1


### Treinar modelo de KNN com mudança de K

In [6]:
# Split the frame into the target vector (y) and the feature matrix (X)
y = df['Quality']
X = df.drop(columns=['Quality'])

In [7]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=51,
)

In [8]:
# Train one KNN classifier per neighbour count K (odd values 1, 3, ..., 19 --
# presumably odd to avoid tied votes between the two classes) and record the
# F1 score on both the training and the test split.
# Each F1 score is computed once and reused for the stored list and the log
# line; the original evaluated f1_score twice per split on every iteration.
scores_train = []
scores_test = []
for i in range(1, 20, 2):
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_train, y_train)
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    f1_train = f1_score(y_train, y_train_pred)
    f1_test = f1_score(y_test, y_test_pred)
    scores_train.append(f1_train)
    scores_test.append(f1_test)
    print(f'{i}: F1_Train: {f1_train} F1_Test: {f1_test}')

1: F1_Train: 1.0 F1_Test: 0.8740617180984154
3: F1_Train: 0.9452103216684341 F1_Test: 0.8964941569282137
5: F1_Train: 0.9262266148958701 F1_Test: 0.8985985160758451
7: F1_Train: 0.921935711762628 F1_Test: 0.8929460580912864
9: F1_Train: 0.9186704384724187 F1_Test: 0.8913043478260869
11: F1_Train: 0.9178130511463845 F1_Test: 0.8887029288702929
13: F1_Train: 0.9115983026874116 F1_Test: 0.8881469115191987
15: F1_Train: 0.9133969600565571 F1_Test: 0.890728476821192
17: F1_Train: 0.9099258212645708 F1_Test: 0.8883333333333333
19: F1_Train: 0.9115885875308207 F1_Test: 0.885


In [9]:
# Gather the per-K train/test F1 scores into a single table (one row per K)
df_results = pd.DataFrame(
    data={
        'k': range(1, 20, 2),
        'train': scores_train,
        'test': scores_test,
    }
)

In [10]:
# Display the table of train/test F1 scores for each K
df_results

Unnamed: 0,k,train,test
0,1,1.0,0.874062
1,3,0.94521,0.896494
2,5,0.926227,0.898599
3,7,0.921936,0.892946
4,9,0.91867,0.891304
5,11,0.917813,0.888703
6,13,0.911598,0.888147
7,15,0.913397,0.890728
8,17,0.909926,0.888333
9,19,0.911589,0.885


In [11]:
# Plot train vs. test F1 score as a function of K.
# Tick positions are read from df_results['k'] instead of a separately
# hard-coded range, so the axis ticks always match the K values being plotted.
xaxis = x = df_results['k'].tolist()
fig = px.line(df_results, x='k', y=['train', 'test'], title='KNN Performance - Mudando K')
# Show exactly one labelled tick per evaluated K
fig.update_xaxes(tickvals=xaxis, ticktext=[str(k) for k in x])
fig.show()