In [None]:
# Develop a program that demonstrates bias and variance, removes duplicate rows, and performs cross-validation.

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Purpose: load the Iris CSV, drop the identifier column and duplicate rows,
# fit a logistic regression and a decision tree on a stratified hold-out split,
# then score both models with shuffled K-fold cross-validation and print a
# short summary.

# --- Configuration -----------------------------------------------------------
# Tunable values are named once here instead of being repeated inline.
DATA_PATH = 'Iris.csv'
SEED = 42          # shared by the hold-out split, the decision tree and the CV shuffle
TEST_SIZE = 0.2    # fraction of rows held out as the test set
N_SPLITS = 5       # number of cross-validation folds
LR_MAX_ITER = 200  # iteration cap passed to LogisticRegression

# Step 1: Load the dataset
iris_raw = pd.read_csv(DATA_PATH)

# Step 2: Remove the 'Id' column (not useful for analysis).
# errors='ignore' keeps this step working on copies of the CSV that ship
# without an 'Id' column instead of raising KeyError.
iris_no_id = iris_raw.drop(columns=['Id'], errors='ignore')

# Step 3: Count, then remove, duplicate rows.
# The count is taken after 'Id' is dropped; an identifier column would
# presumably make otherwise-identical measurements look distinct.
duplicates_count = iris_no_id.duplicated().sum()
iris_df = iris_no_id.drop_duplicates()

# Step 4: Prepare features (X) and target (y)
X = iris_df.drop(columns=['Species'])  # every remaining column is a feature
y = iris_df['Species']                 # class label to predict

# Hold out TEST_SIZE of the rows; stratify=y keeps the class proportions of y
# in both the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=y,
)

# Step 5: Initialize models for Bias and Variance analysis
# Logistic regression: a linear model (typically lower variance, potentially higher bias).
lr_model = LogisticRegression(max_iter=LR_MAX_ITER)
# Decision tree: grown without a depth limit here (typically higher variance, potentially lower bias).
dt_model = DecisionTreeClassifier(random_state=SEED)

# Train both models on the training partition only.
lr_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)

# Predict on the held-out rows.
lr_pred = lr_model.predict(X_test)
dt_pred = dt_model.predict(X_test)

# Hold-out accuracy: fraction of test rows whose predicted label matches y_test.
lr_accuracy = accuracy_score(y_test, lr_pred)
dt_accuracy = accuracy_score(y_test, dt_pred)

# Step 6: Perform Cross Validation
# Cross-validation runs on the full de-duplicated data (X, y), i.e. it also
# covers the rows that were held out above. cross_val_score fits clones of the
# estimators, so the models trained above are not modified.
# NOTE(review): plain KFold does not stratify by class, unlike the hold-out
# split above; consider StratifiedKFold if per-fold class balance matters.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Logistic Regression Cross Validation
cv_scores_lr = cross_val_score(lr_model, X, y, cv=kf, scoring='accuracy')

# Decision Tree Cross Validation
cv_scores_dt = cross_val_score(dt_model, X, y, cv=kf, scoring='accuracy')

# Step 7: Print Results
print(f"Duplicates Removed: {duplicates_count}")
print(f"Logistic Regression Test Accuracy: {lr_accuracy:.2f}")
print(f"Decision Tree Test Accuracy: {dt_accuracy:.2f}")
print(f"Logistic Regression Cross-Validation Mean Accuracy: {cv_scores_lr.mean():.2f}")
print(f"Decision Tree Cross-Validation Mean Accuracy: {cv_scores_dt.mean():.2f}")

Duplicates Removed: 3
Logistic Regression Test Accuracy: 0.97
Decision Tree Test Accuracy: 0.93
Logistic Regression Cross-Validation Mean Accuracy: 0.96
Decision Tree Cross-Validation Mean Accuracy: 0.96
