### Reading the dataset into Python

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the Iris workbook.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# cannot be re-run on another machine without editing it.
iris_path = r"C:\Users\sree0\Downloads\iris.xls"
data = pd.read_excel(iris_path)

In [58]:
# Preview the first rows of the frame.
# NOTE(review): this cell's execution count (58) is out of sequence, and its saved
# output already shows an imputed SL value and integer-encoded Classification
# labels, so it was re-run after the later cells. Restart the kernel and run all
# cells to see the raw rows here.
data.head()

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,5.855944,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(150, 5)

In [5]:
# Column dtypes and non-null counts; a non-null count below the number of rows
# means the column has missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [6]:
# Number of missing values in each column.
data.isna().sum()

SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

### Handling missing values

In [7]:
# Impute missing measurements with the mean of each numeric column.
# numeric_only=True restricts the mean to numeric columns, so the string-valued
# 'Classification' column is skipped explicitly. Without it, older pandas emits
# a warning and pandas >= 2.0 raises TypeError on the non-numeric column.
# NOTE(review): the means are computed on the full dataset before the
# train/test split, so test rows influence the imputed values. Consider
# imputing after the split using training-set statistics only.
data = data.fillna(data.mean(numeric_only=True))

  data = data.fillna(data.mean())


In [8]:
# Display the frame (pandas truncates the middle rows in the rendered output).
data

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.100000,3.5,1.40000,0.2,Iris-setosa
1,4.900000,3.0,1.40000,0.2,Iris-setosa
2,5.855944,3.2,1.30000,0.2,Iris-setosa
3,4.600000,3.1,1.50000,0.2,Iris-setosa
4,5.000000,3.6,1.40000,0.2,Iris-setosa
...,...,...,...,...,...
145,6.700000,3.0,5.20000,2.3,Iris-virginica
146,6.300000,2.5,5.00000,1.9,Iris-virginica
147,6.500000,3.0,3.75625,2.0,Iris-virginica
148,6.200000,3.4,5.40000,2.3,Iris-virginica


In [9]:
# Re-count missing values per column; every count should now be zero.
data.isna().sum()

SL                0
SW                0
PL                0
PW                0
Classification    0
dtype: int64

### Encoding Categorical variables

In [10]:
# Number of distinct labels in the target column.
data['Classification'].nunique()

3

In [11]:
# Number of samples per label, to check how balanced the classes are.
data['Classification'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Classification, dtype: int64

In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the target column, then overwrite
# the column with the integer codes.
Le = LabelEncoder()
Le.fit(data['Classification'])
data['Classification'] = Le.transform(data['Classification'])

In [13]:
# Display the frame again to inspect the Classification column after encoding.
data

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.100000,3.5,1.40000,0.2,0
1,4.900000,3.0,1.40000,0.2,0
2,5.855944,3.2,1.30000,0.2,0
3,4.600000,3.1,1.50000,0.2,0
4,5.000000,3.6,1.40000,0.2,0
...,...,...,...,...,...
145,6.700000,3.0,5.20000,2.3,2
146,6.300000,2.5,5.00000,1.9,2
147,6.500000,3.0,3.75625,2.0,2
148,6.200000,3.4,5.40000,2.3,2


### Splitting into features and target

In [33]:
# Features: every column except the target.
x = data.drop(columns='Classification')

In [34]:
# Target: the Classification column on its own.
y = data.loc[:, 'Classification']

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


### Scaling the feature variables

In [38]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both splits so the test set never influences the scaling.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


### Comparison of Classification Models

In [39]:
# Create and train the classification models

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [48]:
# Instantiate one classifier of each kind with default hyperparameters.
# The decision tree and random forest are seeded (same seed as the train/test
# split) so that the reported accuracies are reproducible across runs; the
# other estimators are left exactly as they were.
logistic_regression = LogisticRegression()
decision_tree = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier()
svm = SVC()
random_forest = RandomForestClassifier(random_state=42)


In [49]:
# Fit every classifier on the scaled training split, in a fixed order.
for classifier in (logistic_regression, decision_tree, knn, svm, random_forest):
    classifier.fit(x_train, y_train)

In [50]:
# Predict the test-split labels with each fitted classifier.
y_pred_lr, y_pred_dt, y_pred_knn, y_pred_svm, y_pred_rf = (
    classifier.predict(x_test)
    for classifier in (logistic_regression, decision_tree, knn, svm, random_forest)
)


In [51]:
# Accuracy of each classifier's predictions against the true test labels.
accuracy_lr, accuracy_dt, accuracy_knn, accuracy_svm, accuracy_rf = (
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_lr, y_pred_dt, y_pred_knn, y_pred_svm, y_pred_rf)
)

In [52]:
# Report the test accuracy of every model, one line per model.
for label, score in (
    ("Accuracy - Logistic Regression:", accuracy_lr),
    ("Accuracy - Decision Tree:", accuracy_dt),
    ("Accuracy - K-Nearest Neighbors:", accuracy_knn),
    ("Accuracy - Support Vector Machine:", accuracy_svm),
    ("Accuracy - Random Forest:", accuracy_rf),
):
    print(label, score)

Accuracy - Logistic Regression: 0.9666666666666667
Accuracy - Decision Tree: 1.0
Accuracy - K-Nearest Neighbors: 0.9666666666666667
Accuracy - Support Vector Machine: 0.9666666666666667
Accuracy - Random Forest: 1.0


### Classification report

In [57]:
from sklearn.metrics import classification_report

# True labels
y_true = y_test


def _weighted_avg_metrics(y_true, y_pred):
    """Return (precision, recall, f1) from the report's 'weighted avg' row.

    The report is requested as a dictionary rather than scraped from its
    printed text, so the values are floats and do not depend on the report's
    whitespace layout.

    Parameters:
        y_true: true labels of the evaluated samples.
        y_pred: predicted labels for the same samples.

    Returns:
        Tuple of floats (precision, recall, f1-score), each the
        support-weighted average over the classes.
    """
    weighted = classification_report(y_true, y_pred, output_dict=True)["weighted avg"]
    return weighted["precision"], weighted["recall"], weighted["f1-score"]


def _print_metrics(model_name, precision, recall, f1):
    """Print one model's metrics with two decimals, as the text report shows them."""
    print(f"Metrics - {model_name}:")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")


# Logistic Regression
precision_logistic_regression, recall_logistic_regression, f1_logistic_regression = _weighted_avg_metrics(y_true, y_pred_lr)

# Decision Tree
precision_decision_tree, recall_decision_tree, f1_decision_tree = _weighted_avg_metrics(y_true, y_pred_dt)

# K-Nearest Neighbors
precision_knn, recall_knn, f1_knn = _weighted_avg_metrics(y_true, y_pred_knn)

# Support Vector Machine
precision_svm, recall_svm, f1_svm = _weighted_avg_metrics(y_true, y_pred_svm)

# Random Forest
precision_random_forest, recall_random_forest, f1_random_forest = _weighted_avg_metrics(y_true, y_pred_rf)

# Printing the Metrics (a blank line separates consecutive models)
_print_metrics("Logistic Regression", precision_logistic_regression, recall_logistic_regression, f1_logistic_regression)
print()

_print_metrics("Decision Tree", precision_decision_tree, recall_decision_tree, f1_decision_tree)
print()

_print_metrics("K-Nearest Neighbors", precision_knn, recall_knn, f1_knn)
print()

_print_metrics("Support Vector Machine", precision_svm, recall_svm, f1_svm)
print()

_print_metrics("Random Forest", precision_random_forest, recall_random_forest, f1_random_forest)


Metrics - Logistic Regression:
Precision: 0.97
Recall: 0.97
F1 Score: 0.97

Metrics - Decision Tree:
Precision: 1.00
Recall: 1.00
F1 Score: 1.00

Metrics - K-Nearest Neighbors:
Precision: 0.97
Recall: 0.97
F1 Score: 0.97

Metrics - Support Vector Machine:
Precision: 0.97
Recall: 0.97
F1 Score: 0.97

Metrics - Random Forest:
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


### Results

In [None]:
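# Among all the models, the decision tree and random forest achieved perfect scores on the held-out test split (30 of the 150 samples); logistic regression, K-nearest neighbors and the support vector machine each scored about 0.97. A perfect score on such a small test set shows that these models classified every test sample correctly, not that they are flawless in general.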

# The accuracy scores indicate the performance of each model in correctly classifying the iris dataset. A higher accuracy score suggests that the model has achieved a better classification performance on the dataset.

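# Precision measures the proportion of true positive predictions out of all positive predictions. It indicates how trustworthy the model's positive predictions are. For this three-class problem, the reported value is the average of the per-class precision, weighted by the number of samples in each class.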

# The recall, also known as sensitivity or true positive rate, measures the proportion of true positive predictions out of all actual positive samples. It indicates how well the model captures positive samples.

# The F1 score is the harmonic mean of precision and recall. It provides a balanced measure of the model's performance, considering both precision and recall.