In [21]:
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [22]:
# Read the Iris measurements from the CSV file in the working directory.
df = pd.read_csv('iris.csv')

# Render the loaded frame as a table.
display(df)

# Last expression of the cell: the column labels of the frame.
df.columns

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


Index(['sepal.length', 'sepal.width', 'petal.length', 'petal.width',
       'variety'],
      dtype='object')

In [23]:
# a. Data cleaning (remove NA, '?', negative values, etc.)

# Turn the '?' placeholder into a proper missing-value marker.
df = df.replace('?', pd.NA)

# Coerce every column except the label to numeric; anything that cannot be
# parsed becomes NaN so that it is removed together with the missing values.
coerced = {
    name: pd.to_numeric(df[name], errors='coerce')
    for name in df.columns
    if name != "variety"
}
df = df.assign(**coerced)

# Drop every row that still holds a missing value.
df = df.dropna()

# Keep only the rows whose numeric columns are all non-negative.
numeric_cols = df.select_dtypes(include='number').columns
non_negative_rows = (df[numeric_cols] >= 0).all(axis=1)
df = df[non_negative_rows]

display(df)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [24]:
# b. Error correcting (outlier detection and removal)

# A row is treated as an outlier when the absolute z-score of any numeric
# column reaches this value.
Z_THRESHOLD = 3

# Absolute column-wise z-scores of the numeric columns selected during cleaning.
abs_z = np.abs(zscore(df[numeric_cols]))

# A zero-variance column produces NaN z-scores, and `NaN < threshold` is False,
# which would discard every row. NaN is therefore treated as "not an outlier".
within_limits = (abs_z < Z_THRESHOLD) | np.isnan(abs_z)

# Keep the rows that are within the limit in every numeric column.
df = df[within_limits.all(axis=1)]

In [25]:
# c. Data transformation

# Feature matrix: every column of the cleaned frame except the class label.
features = df.drop(columns=['variety'])

# Fit a standard scaler on these features and transform them in one step.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [26]:
# d. Build data models using regression and Naïve Bayes methods and compare accuracy of Iris species prediction.

# Split the raw (unscaled) features first. Fitting the scaler on the whole
# dataset before splitting would leak the held-out rows' statistics into the
# training features, so X holds the unscaled feature columns here.
X = df.drop(columns=['variety'])
y = df['variety']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Learn the scaling parameters from the training rows only, then apply that
# same transform to both the training and the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

# Accuracy of both classifiers on the same held-out rows.
print('Logistic Regression Accuracy:', accuracy_score(y_test, lr_pred))
print('Naive Bayes Accuracy:', accuracy_score(y_test, nb_pred))

Logistic Regression Accuracy: 0.9666666666666667
Naive Bayes Accuracy: 0.9333333333333333


In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('iris.csv')  # or provide the correct path

# Display initial rows
print("Initial Data:")
display(df.head())

# e. Data Cleaning
# Treat '?' as missing, then coerce every feature column to numeric so that any
# other unparseable entry also becomes NaN and is dropped with the missing rows.
# errors='coerce' replaces the deprecated errors='ignore', which would leave
# such a column as object dtype and exclude it from the numeric filters below.
df = df.replace('?', np.nan)
for feature in [name for name in df.columns if name != 'variety']:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')
df = df.dropna()  # Drop missing values

# Remove negative values (biologically not valid in Iris data)
df = df[(df.select_dtypes(include=[np.number]) >= 0).all(axis=1)]

# f. Error Correcting: Outlier Detection using IQR
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the filters of the preceding columns.
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[df[col].between(lower, upper)]  # bounds are inclusive

# g. Data Transformation (Scaling)
X = df.drop('variety', axis=1)
y = df['variety']

# h. Model Building and Accuracy Comparison
# Split before scaling: fitting the scaler on the whole dataset would leak the
# held-out rows' statistics into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scaling parameters come from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under its original name, scaled with the training-set
# parameters rather than parameters fitted on all rows.
X_scaled = scaler.transform(X)

# Logistic Regression
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_pred)

# Print Results
print("\nLogistic Regression Accuracy:", lr_acc)
print("Naive Bayes Accuracy:", nb_acc)

print("\nClassification Report for Logistic Regression:\n", classification_report(y_test, lr_pred))
print("Classification Report for Naive Bayes:\n", classification_report(y_test, nb_pred))


Initial Data:


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa



Logistic Regression Accuracy: 0.9333333333333333
Naive Bayes Accuracy: 0.9333333333333333

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        12
  Versicolor       0.88      0.88      0.88         8
   Virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.93      0.93      0.93        30

Classification Report for Naive Bayes:
               precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00        12
  Versicolor       0.88      0.88      0.88         8
   Virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.92      0.92      0.92        30
weighted avg       0.93      0.93      0.93        30

