Building a predictive model to classify Iris plants into three species

In [76]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the Iris training data from its CSV file and take a first look at it
iris_csv_path = 'iris.csv'
dataset = pd.read_csv(iris_csv_path)

# Print column dtypes and non-null counts
# (the author's run showed no column with NA values)
dataset.info()

In [78]:
# Preview the first rows of the dataset.
# According to Kaggle, `species` is the target column the predictive model is trained on.
first_rows = dataset.head()
print(first_rows)

# Count the non-null entries of every feature per species
dataset.groupby('species').count()

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,50,50,50,50
versicolor,50,50,50,50
virginica,50,50,50,50


In [79]:
# Split the dataset into X (every column except the target) and y (the target values, i.e. the species)
X = dataset.drop(columns=['species']).values
y = dataset['species'].values

# Hold out 30% of the rows as the test set, since this is a small amount of data.
# random_state fixes the shuffle so the split, and every score computed from it, is reproducible;
# stratify=y keeps the species proportions the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

Choosing the number of neighbors for the KNeighborsClassifier model and then creating an instance of it.

In [80]:
# Getting the best number of neighbors with 5-fold cross-validation.
# Only the training split is cross-validated, so the held-out test set
# does not influence the choice of hyperparameter.
k_values = range(1, 21)  # Try n_neighbors from 1 to 20

# Mean cross-validated accuracy for each candidate k
scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5, scoring='accuracy'
    ).mean()
    for k in k_values
]

# Find the k value with the best performance
# (np.argmax returns the first maximum, so ties resolve to the smallest k)
best_k = k_values[np.argmax(scores)]
print(f"Best value for n_neighbors: {best_k}")

Best value for n_neighbors: 6


In [81]:
# Creating the KNeighborsClassifier with the number of neighbors selected by cross-validation,
# rather than a hard-coded value that goes stale whenever the data or the split changes
knn = KNeighborsClassifier(n_neighbors=best_k)

# Fitting the model on the training split
knn.fit(X_train, y_train)

# Predicting the species of the held-out test split
knn_prediction = knn.predict(X_test)

In [82]:
# Fraction of test samples whose predicted species matches the true one
knn_accuracy = accuracy_score(y_test, knn_prediction)

# Report it as a percentage rounded to two decimal places
accuracy_pct = np.round(knn_accuracy * 100, 2)
print(f'Accuracy for KNeighbors: {accuracy_pct}%')

Accuracy for KNeighbors: 95.56%


We can see that KNeighbors reached 95.56% accuracy on the held-out test split! Let's now try it on a new dataset.

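Since Kaggle didn't give us a test dataset, let's create one! Note that these generated samples have no true species labels, so we can inspect the predictions but cannot score them.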

In [83]:
# Observed minimum and maximum of each feature column
sepal_length_min, sepal_length_max = dataset['sepal_length'].min(), dataset['sepal_length'].max()

sepal_width_min, sepal_width_max = dataset['sepal_width'].min(), dataset['sepal_width'].max()

petal_length_min, petal_length_max = dataset['petal_length'].min(), dataset['petal_length'].max()

petal_width_min, petal_width_max = dataset['petal_width'].min(), dataset['petal_width'].max()

In [84]:
# Generating one random array per feature to use as a test dataframe.
# Each value is drawn uniformly between the observed min and max of its column,
# independently of the other columns, with as many samples as the dataset has rows.
# A seeded generator makes the generated data (and the predictions made on it) reproducible.
rng = np.random.default_rng(42)
n_samples = len(dataset)

new_sepal_length = rng.uniform(low=sepal_length_min, high=sepal_length_max, size=n_samples)
new_sepal_width = rng.uniform(low=sepal_width_min, high=sepal_width_max, size=n_samples)
new_petal_length = rng.uniform(low=petal_length_min, high=petal_length_max, size=n_samples)
new_petal_width = rng.uniform(low=petal_width_min, high=petal_width_max, size=n_samples)

In [85]:
# Assemble the generated feature arrays into a single test DataFrame,
# one column per feature
iris_test = pd.DataFrame({
    'sepal_length': new_sepal_length,
    'sepal_width': new_sepal_width,
    'petal_length': new_petal_length,
    'petal_width': new_petal_width,
})

In [None]:
# Predicting with the KNeighbors model previously trained.
# Only the four feature columns are selected, so this cell still works when re-run
# after a prediction column has been added to iris_test. `.values` passes a plain
# array, like the one the model was fitted on, which avoids scikit-learn's
# feature-name mismatch warning.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
predict = knn.predict(iris_test[feature_columns].values)

In [87]:
# Attach the predicted species to the test dataset as a `prediction` column and display the result
iris_test = iris_test.assign(prediction=predict)
iris_test

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,prediction
0,5.242118,3.207176,3.846029,1.937121,versicolor
1,4.338446,4.267189,3.998901,2.300124,versicolor
2,5.692844,4.209714,1.352423,1.104617,setosa
3,4.976937,4.000640,1.257103,0.539279,setosa
4,7.820371,3.332315,4.194216,1.341267,versicolor
...,...,...,...,...,...
145,5.369977,2.848395,4.530157,1.084069,versicolor
146,7.796791,2.021726,6.272152,1.115030,virginica
147,6.062522,3.999782,1.727949,1.493216,setosa
148,4.322686,3.075497,3.004296,0.973652,versicolor
