In [1]:
import pandas as pd

df = pd.read_excel('/content/iris (2).xls')

print("First 5 rows of the DataFrame:")
print(df.head())

print("\nDataFrame Info (data types and non-null values):")
df.info()

First 5 rows of the DataFrame:
    SL   SW   PL   PW Classification
0  5.1  3.5  1.4  0.2    Iris-setosa
1  4.9  3.0  1.4  0.2    Iris-setosa
2  NaN  3.2  1.3  0.2    Iris-setosa
3  4.6  3.1  1.5  0.2    Iris-setosa
4  5.0  3.6  1.4  0.2    Iris-setosa

DataFrame Info (data types and non-null values):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [2]:
print("Missing values before imputation:")
print(df.isnull().sum())

# Impute missing values with the mean of each column
for column in ['SL', 'SW', 'PL']:
    if df[column].isnull().any():
        df[column].fillna(df[column].mean(), inplace=True)

print("\nMissing values after imputation:")
print(df.isnull().sum())

print("\nFirst 5 rows of the DataFrame after imputation:")
print(df.head())

Missing values before imputation:
SL                7
SW                6
PL                6
PW                0
Classification    0
dtype: int64

Missing values after imputation:
SL                0
SW                0
PL                0
PW                0
Classification    0
dtype: int64

First 5 rows of the DataFrame after imputation:
         SL   SW   PL   PW Classification
0  5.100000  3.5  1.4  0.2    Iris-setosa
1  4.900000  3.0  1.4  0.2    Iris-setosa
2  5.855944  3.2  1.3  0.2    Iris-setosa
3  4.600000  3.1  1.5  0.2    Iris-setosa
4  5.000000  3.6  1.4  0.2    Iris-setosa


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode the 'Classification' column
le = LabelEncoder()
df['Classification_encoded'] = le.fit_transform(df['Classification'])

# Define features (X) and target (y)
X = df.drop(['Classification', 'Classification_encoded'], axis=1)
y = df['Classification_encoded']

print("First 5 rows of features (X):")
print(X.head())
print("\nFirst 5 rows of target (y):")
print(y.head())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nShape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

First 5 rows of features (X):
         SL   SW   PL   PW
0  5.100000  3.5  1.4  0.2
1  4.900000  3.0  1.4  0.2
2  5.855944  3.2  1.3  0.2
3  4.600000  3.1  1.5  0.2
4  5.000000  3.6  1.4  0.2

First 5 rows of target (y):
0    0
1    0
2    0
3    0
4    0
Name: Classification_encoded, dtype: int64

Shape of X_train: (120, 4)
Shape of X_test: (30, 4)
Shape of y_train: (120,)
Shape of y_test: (30,)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a dictionary of models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=200),
    'SVC': SVC(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Iterate through models, train, and evaluate
for name, model in models.items():
    print(f"\n----- {name} -----")
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))



----- Random Forest -----
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


----- Logistic Regression -----
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30


----- SVC -----
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-sco