In [131]:
import pandas as pd
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
# plot
import matplotlib.pyplot as plt


In [132]:
# Load the Iris flower-species measurements from the local CSV file.
df = pd.read_csv(filepath_or_buffer='iris.csv')

In [133]:
# Preview the first five rows to see the columns and typical values.
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [134]:
# Summarize the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [135]:
# Samples per species: every class is equally represented.
df['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [136]:
# Number of distinct classes in the target column.
df['Species'].nunique()

3

In [137]:
# The distinct class names found in the target column.
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [138]:
# Total count of missing values across the whole frame (0 means there are none).
df.isna().sum().sum()

0

1. There are 3 species classes.
2. All classes have equal representation (50 samples each).
3. There are no missing values.

In [140]:
# We don't require the Id column as a feature, so remove it.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it again after Id has already been
# removed is a no-op rather than a KeyError.
df = df.drop(columns=['Id'], errors='ignore')

### Train and test Split

In [142]:
# Separate the target column (Species) from the feature columns.
y = df['Species']
X = df.drop(columns=['Species'])

In [143]:
# Learn the species -> integer mapping, then apply it to the target column.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
y_encoded

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

0. 'Iris-setosa', 
1. 'Iris-versicolor'
2. 'Iris-virginica'

In [145]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.25,
    random_state=21,
)

In [146]:
# Sanity-check the sizes of the four pieces produced by the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(112, 4) (38, 4) (112,) (38,)


**KNN**

In [148]:
from sklearn.neighbors import KNeighborsClassifier

In [149]:
# n_neighbors is the K value (how many nearest neighbours are consulted);
# its default is 5.
# The dataset has 3 classes, so K is set to 4 here.
knn_model = KNeighborsClassifier(n_neighbors=4)

In [150]:
# Train the K=4 model on the training split; the return value of fit() is kept in `res`.
res = knn_model.fit(X_train, y_train)

In [151]:
# Display what fit() returned.
res

In [152]:
# Predict on both splits so train and test accuracy can be compared.
y_train_pred = knn_model.predict(X_train)
y_test_pred = knn_model.predict(X_test)

**Evaluation of Model**

In [154]:
# Fraction of correct predictions on each split for the K=4 model.
knn_test_accuracy = accuracy_score(y_test, y_test_pred)
knn_train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Accuracy of Knn Model on test dataset is: {knn_test_accuracy}")
print(f"Accuracy of Knn Model on train dataset is: {knn_train_accuracy}")

Accuracy of Knn Model on test dataset is: 0.9473684210526315
Accuracy of Knn Model on train dataset is: 0.9642857142857143


Train and test accuracy are close to each other (about 0.96 vs 0.95), so this is an appropriate model.

**Let's try a higher k value (k values that are too high are bad because they distort the model)**

In [157]:
# Second KNN model with a much larger K (20 neighbours) for comparison with K=4.
knn_model_20 = KNeighborsClassifier(n_neighbors=20)

In [158]:
# Train the K=20 model on the same training split that the K=4 model used.
knn_model_20.fit(X_train, y_train)

In [159]:
# Predictions of the K=20 model, stored under separate *_20 names so the
# K=4 predictions stay available.
y_train_pred_20 = knn_model_20.predict(X_train)
y_test_pred_20 = knn_model_20.predict(X_test)

In [160]:
# Fraction of correct predictions on each split for the K=20 model.
knn20_test_accuracy = accuracy_score(y_test, y_test_pred_20)
knn20_train_accuracy = accuracy_score(y_train, y_train_pred_20)
print(f"Accuracy of Knn Model on test dataset is: {knn20_test_accuracy}")
print(f"Accuracy of Knn Model on train dataset is: {knn20_train_accuracy}")

Accuracy of Knn Model on test dataset is: 0.868421052631579
Accuracy of Knn Model on train dataset is: 0.9821428571428571


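**Higher K values can reduce model performance: with K=20 the test accuracy drops (about 0.95 → 0.87) while the train accuracy stays high, so the model generalizes worse. Note that, in general, a very large K over-smooths the decision boundary (underfitting), whereas it is a very small K that usually leads to overfitting.**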

In [162]:
# Per-class precision, recall and F1 on the test split.
# y_test_pred still holds the K=4 predictions here (the K=20 run wrote to
# y_test_pred_20), so this report describes the K=4 model.
knn_report = classification_report(y_test, y_test_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.93      0.93      0.93        15
           2       0.90      0.90      0.90        10

    accuracy                           0.95        38
   macro avg       0.94      0.94      0.94        38
weighted avg       0.95      0.95      0.95        38



# SVM (Support Vector Machine)

In [164]:
# imports
from sklearn.svm import SVC

In [165]:
# Support-vector classifier created with no arguments, i.e. all hyperparameters at their defaults.
svm = SVC()

In [166]:
# Train the SVM on the same training split that the KNN models used.
svm.fit(X_train, y_train)

In [167]:
# Predict with the SVM on both splits.
# NOTE: y_test_pred / y_train_pred were also used for the KNN (K=4)
# predictions; from this cell on they hold the SVM predictions instead.
y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

In [168]:
# Fraction of correct predictions on each split for the SVM.
svm_test_accuracy = accuracy_score(y_test, y_test_pred)
svm_train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Accuracy of SVM Model on test dataset is: {svm_test_accuracy}")
print(f"Accuracy of SVM Model on train dataset is: {svm_train_accuracy}")

Accuracy of SVM Model on test dataset is: 0.9210526315789473
Accuracy of SVM Model on train dataset is: 0.9910714285714286


In [169]:
# Per-class precision, recall and F1 for the SVM predictions on the test split.
svm_report = classification_report(y_test, y_test_pred)
print(svm_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.93      0.87      0.90        15
           2       0.82      0.90      0.86        10

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38



**After class work**
1. explore different test_size.
2. explore different random_state
3. explore different k values for knn