In [210]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split

In [211]:
df = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")

In [212]:
#Printing first 5 rows
df.head(60)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [213]:
# data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


## Here there are 5 columns with no null values and their data types are given

In [214]:
# checking for Null Values
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [215]:
# Data Encoding, converting categorical values to numerical
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])

In [216]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [217]:
# Split Data
x = df.drop(columns = ['species'])
y = df['species']

In [218]:
#Scaling Data Features
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [219]:
# train test split
x_train,x_test,y_train,y_test = train_test_split(x_scaled,y,train_size=0.2,random_state=42)

In [220]:
#Model
model = KNeighborsClassifier()
model.fit(x_train,y_train)


In [221]:
y_pred = model.predict(x_test)

In [222]:
#Evaluation on testing
report = classification_report(y_test,y_pred)
print("Classification report :\n",report)

Classification report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.92      0.87      0.89        39
           2       0.88      0.92      0.90        38

    accuracy                           0.93       120
   macro avg       0.93      0.93      0.93       120
weighted avg       0.93      0.93      0.93       120



In [223]:
#Evaluation on training
y_pred2 = model.predict(x_train)
report = classification_report(y_train,y_pred2)
print("Classification report :\n",report)

Classification report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.92      1.00      0.96        11
           2       1.00      0.92      0.96        12

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



# Accuracy

here the accuracy score is 93% for the testing
and 97% for training, this shows the model has slighty under performed


In [224]:
# testing manually

In [225]:
values = [5.2,2.7,3.9,1.4]

y_pred3 = model.predict([values])

In [226]:
print(le.inverse_transform(y_pred3))

['virginica']


In [229]:
from sklearn.ensemble import RandomForestClassifier

#Model random forest 
model = RandomForestClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
#Evaluation on testing
report = classification_report(y_test,y_pred)
print("Classification report :\n",report)


Classification report :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.92      0.87      0.89        39
           2       0.88      0.92      0.90        38

    accuracy                           0.93       120
   macro avg       0.93      0.93      0.93       120
weighted avg       0.93      0.93      0.93       120



In [230]:
values = [5.2,2.7,3.9,1.4]

y_pred3 = model.predict([values])

print(le.inverse_transform(y_pred3))

['virginica']
