In [162]:
import numpy as np
import pandas as pd


# Function to load the dataset

In [163]:
def load_data(filepath):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filepath : str or file-like
        Location of the CSV; passed straight through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    dataset = pd.read_csv(filepath)
    return dataset


# function to process the dataset

In [164]:
def preprocess_data(df):
    """Drop the ``Id`` column and clip outliers in the four measurement columns.

    For each of ``SepalLengthCm``, ``SepalWidthCm``, ``PetalLengthCm`` and
    ``PetalWidthCm`` the values are clipped to
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where Q1/Q3 are that column's
    25th/75th percentiles and ``IQR = Q3 - Q1``.

    The input frame is left untouched: all work happens on a copy, so the
    cell calling this function can be re-run without side effects. A missing
    ``Id`` column is tolerated for the same reason (feeding the function its
    own output is a no-op for the drop step).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four measurement columns, optionally ``Id``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Id`` and with the measurement columns clipped.

    Notes
    -----
    Prints ``"Missing values found!"`` if the result contains any nulls; the
    nulls themselves are not modified.
    """
    # .copy() guarantees the column assignments below never write through
    # to the caller's data, regardless of pandas version / copy-on-write mode.
    cleaned = df.drop(columns="Id", errors="ignore").copy()

    outlier_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
    for col in outlier_columns:
        q1 = cleaned[col].quantile(0.25)
        q3 = cleaned[col].quantile(0.75)
        iqr = q3 - q1
        upper_limit = q3 + 1.5 * iqr
        lower_limit = q1 - 1.5 * iqr
        cleaned[col] = cleaned[col].clip(lower=lower_limit, upper=upper_limit)

    if cleaned.isnull().values.any():
        print("Missing values found!")

    return cleaned


# Function to encode the Species labels

In [165]:
from sklearn.preprocessing import LabelEncoder


def encode_species(df):
    """Replace the ``Species`` column with label-encoded values, in place.

    A fresh ``LabelEncoder`` is fit on ``df['Species']`` and its output
    overwrites that column on the frame that was passed in. The same frame
    object is returned, so the caller's frame is modified. The fitted
    encoder is not kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Species`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``Species`` encoded.
    """
    species_codes = LabelEncoder().fit_transform(df['Species'])
    df['Species'] = species_codes
    return df


# Function to calculate accuracy


In [166]:
def calculate_accuracy(predicted_y, y):
    """Return classification accuracy as a percentage (0-100).

    Accuracy is the fraction of positions where the predicted label equals
    the true label, i.e. ``(TP + TN) / (TP + FP + FN + TN)``.

    Both inputs are converted to flat NumPy arrays and compared by position,
    so lists, arrays and pandas Series (whatever their index) all work, and
    an ``(n, 1)`` column is treated the same as a length-``n`` vector.

    Parameters
    ----------
    predicted_y : array-like
        Predicted labels.
    y : array-like
        True labels, same number of elements as ``predicted_y``.

    Returns
    -------
    numpy.float64
        Percentage of matching labels.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    # Plain Python lists compare as a single bool with ==, so coerce first.
    predicted = np.asarray(predicted_y).ravel()
    actual = np.asarray(y).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            f"predicted_y and y must have the same length "
            f"(got {predicted.size} and {actual.size})"
        )
    if actual.size == 0:
        raise ValueError("cannot compute accuracy on empty input")

    return np.mean(predicted == actual) * 100

### Load and preprocess the dataset

In [167]:
# Read the raw Iris CSV; the path is relative, so it is resolved against the current working directory.
iris = load_data("Iris.csv")


In [168]:
# Drop the Id column and clip outliers; the name `iris` is rebound to the returned frame.
iris = preprocess_data(iris)


In [169]:
# encode_species overwrites Species on the frame it receives and returns that same frame,
# so `iris_encoded` and `iris` refer to one object from here on.
iris_encoded=encode_species(iris)

## Inspect the processed dataset

In [170]:
# Column dtypes and non-null counts after preprocessing and encoding.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [171]:
# Summary statistics, transposed so that each column of the frame becomes a row.
iris.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SepalLengthCm,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
SepalWidthCm,150.0,3.050667,0.423036,2.05,2.8,3.0,3.3,4.05
PetalLengthCm,150.0,3.758667,1.76442,1.0,1.6,4.35,5.1,6.9
PetalWidthCm,150.0,1.198667,0.763161,0.1,0.3,1.3,1.8,2.5
Species,150.0,1.0,0.819232,0.0,0.0,1.0,2.0,2.0


In [172]:
# Pairwise correlations between numeric columns (Species is included because it is integer-encoded by now).
iris.corr(numeric_only = True)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
SepalLengthCm,1.0,-0.110343,0.871754,0.817954,0.782561
SepalWidthCm,-0.110343,1.0,-0.419823,-0.355582,-0.419264
PetalLengthCm,0.871754,-0.419823,1.0,0.962757,0.949043
PetalWidthCm,0.817954,-0.355582,0.962757,1.0,0.956464
Species,0.782561,-0.419264,0.949043,0.956464,1.0


In [173]:
# Number of missing values in each column.
iris.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [174]:
# Distinct values of the encoded Species column.
print(iris["Species"].unique())

[0 1 2]


In [175]:
# Last five rows of the encoded frame.
iris_encoded.tail()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2
149,5.9,3.0,5.1,1.8,2


In [176]:
# Feature matrix: the four measurement columns.
X = iris_encoded[['SepalLengthCm', 'SepalWidthCm','PetalLengthCm','PetalWidthCm']]

# Target vector: the encoded Species column.
y = iris_encoded['Species']

In [177]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gaussian Naive Bayes classifier.
gaussian_nb = GaussianNB()

# Fit on the training split only.
gaussian_nb.fit(X_train, y_train)

In [178]:
# Predict labels for the held-out test features.
y_pred = gaussian_nb.predict(X_test)

In [179]:
# calculate_accuracy takes (predicted_y, y): predictions first, then the true labels.
accuracy = calculate_accuracy(y_pred, y_test)
print(f'Accuracy: {accuracy}')

Accuracy: 100.0
