In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
sns.set()
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

1.**Loading** **dataset**

In [2]:
data_iris = pd.read_excel("iris (1).xls")
data_iris.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


**2.Preprocessing**

In [3]:
data_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
data_iris.isna().sum()

SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

In [5]:
#Handling Missing Values
data_iris['SL'] = data_iris['SL'].fillna(data_iris['PL'].mean())
data_iris['SW'] = data_iris['SW'].fillna(data_iris['PL'].mean())
data_iris['PL'] = data_iris['PL'].fillna(data_iris['PL'].mean())

In [6]:
data_iris.isna().sum()

SL                0
SW                0
PL                0
PW                0
Classification    0
dtype: int64

In [7]:
#Label Encoding
data_iris.Classification.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [9]:
label_en = LabelEncoder()
data_iris['Classification'] = label_en.fit_transform(data_iris['Classification'])

In [10]:
data_iris

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.10000,3.5,1.40000,0.2,0
1,4.90000,3.0,1.40000,0.2,0
2,3.75625,3.2,1.30000,0.2,0
3,4.60000,3.1,1.50000,0.2,0
4,5.00000,3.6,1.40000,0.2,0
...,...,...,...,...,...
145,6.70000,3.0,5.20000,2.3,2
146,6.30000,2.5,5.00000,1.9,2
147,6.50000,3.0,3.75625,2.0,2
148,6.20000,3.4,5.40000,2.3,2


In [11]:
data_iris.columns

Index(['SL', 'SW', 'PL', 'PW', 'Classification'], dtype='object')

In [12]:
y = data_iris['Classification']
x = data_iris.drop('Classification',axis=1)

In [13]:
#Feature engineering
x['mean'] = x[['SW', 'SL', 'PW', 'PL']].mean(axis = 1)
x['sum'] = x[['SW', 'SL', 'PW', 'PL']].sum(axis = 1)
x['skew'] = x[['SW', 'SL', 'PW', 'PL']].skew(axis = 1)
x['kurt'] = x[['SW', 'SL', 'PW', 'PL']].kurt(axis = 1)

In [14]:
x.columns

Index(['SL', 'SW', 'PL', 'PW', 'mean', 'sum', 'skew', 'kurt'], dtype='object')

In [16]:
#Splitting the data
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 2, test_size = 0.3)

In [19]:
#Standard scalimg
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

**3. Find out which classification model gives the best result to predict iris species.(also do random forest algorithm).**

In [21]:
#Logistic Regression Algorithm
logit_model = LogisticRegression()
logit_model.fit(x_train, y_train)
y_pred = logit_model.predict(x_test)

In [22]:
print("Accuracy is: ", accuracy_score(y_test,y_pred))
print("Precision is: ", precision_score(y_test,y_pred, average = 'micro'))
print("Recall is: ", recall_score(y_test,y_pred,  average = 'micro'))
print("F1 score is: ", f1_score(y_test,y_pred,  average = 'micro'))

Accuracy is:  0.9333333333333333
Precision is:  0.9333333333333333
Recall is:  0.9333333333333333
F1 score is:  0.9333333333333333


In [23]:
confusion_matrix(y_test,y_pred)

array([[15,  2,  0],
       [ 0, 14,  1],
       [ 0,  0, 13]])

In [25]:
#KNN
acc_values = []
neighbors = np.arange(3,15)
for k in neighbors:
    classifier = KNeighborsClassifier(n_neighbors = k, metric='minkowski')
    classifier.fit(x_train, y_train)
    y_pred =  classifier.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    acc_values.append(acc)

In [26]:
acc_values


[0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333,
 0.9333333333333333]

In [28]:
classifier = KNeighborsClassifier(n_neighbors = 5, metric='minkowski')
classifier.fit(x_train, y_train)
y_pred =  classifier.predict(x_test)   

In [29]:
print("Accuracy is: ", accuracy_score(y_test,y_pred))
print("Precision is: ", precision_score(y_test,y_pred, average = 'micro'))
print("Recall is: ", recall_score(y_test,y_pred,  average = 'micro'))
print("F1 score is: ", f1_score(y_test,y_pred,  average = 'micro'))

Accuracy is:  0.9333333333333333
Precision is:  0.9333333333333333
Recall is:  0.9333333333333333
F1 score is:  0.9333333333333333


In [30]:
confusion_matrix(y_test,y_pred)

array([[16,  1,  0],
       [ 0, 14,  1],
       [ 0,  1, 12]])

In [31]:
#Decision Tree Algorithm
dt_model =DecisionTreeClassifier()
dt_model.fit(x_train,y_train)
y_pred=dt_model.predict(x_test)

In [32]:
print("Accuracy is: ", accuracy_score(y_test,y_pred))
print("Precision is: ", precision_score(y_test,y_pred, average = 'micro'))
print("Recall is: ", recall_score(y_test,y_pred,  average = 'micro'))
print("F1 score is: ", f1_score(y_test,y_pred,  average = 'micro'))

Accuracy is:  0.9555555555555556
Precision is:  0.9555555555555556
Recall is:  0.9555555555555556
F1 score is:  0.9555555555555556


In [33]:
confusion_matrix(y_test,y_pred)

array([[17,  0,  0],
       [ 0, 14,  1],
       [ 0,  1, 12]])

In [34]:
#SVM
svm_linear = SVC(kernel = 'linear')
svm_linear.fit(x_train, y_train)
y_pred = svm_linear.predict(x_test)

In [35]:
print("Accuracy is: ", accuracy_score(y_test,y_pred))
print("Precision is: ", precision_score(y_test,y_pred, average = 'micro'))
print("Recall is: ", recall_score(y_test,y_pred,  average = 'micro'))
print("F1 score is: ", f1_score(y_test,y_pred,  average = 'micro'))

Accuracy is:  0.9777777777777777
Precision is:  0.9777777777777777
Recall is:  0.9777777777777777
F1 score is:  0.9777777777777777


In [36]:
confusion_matrix(y_test,y_pred)

array([[17,  0,  0],
       [ 0, 14,  1],
       [ 0,  0, 13]])

In [37]:
#Random forest
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)

In [38]:
print("Accuracy is: ", accuracy_score(y_test,y_pred))
print("Precision is: ", precision_score(y_test,y_pred,  average = 'micro'))
print("Recall is: ", recall_score(y_test,y_pred,  average = 'micro'))
print("F1 score is: ", f1_score(y_test,y_pred, average = 'micro'))

Accuracy is:  0.9777777777777777
Precision is:  0.9777777777777777
Recall is:  0.9777777777777777
F1 score is:  0.9777777777777777


In [39]:
confusion_matrix (y_test, y_pred)

array([[17,  0,  0],
       [ 0, 14,  1],
       [ 0,  0, 13]])

In [40]:
pd.Series(rf.feature_importances_, index = x.columns).sort_values(ascending=False)*100

PW      23.474078
PL      22.078540
mean    15.078834
sum     14.735774
kurt    14.009530
SL       4.568870
SW       3.816041
skew     2.238333
dtype: float64