### AI/ML – Improving Model Performance with Clean Data

**Task 1**: Data Preprocessing for Models

**Objective**: Enhance data quality for better AI/ML outcomes.

**Steps**:
1. Choose a dataset for training an AI/ML model.
2. Identify common data issues like null values, redundant features, or noisydata.
3. Apply preprocessing methods such as imputation, normalization, or feature engineering.

In [4]:
# Write your code from here
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset
iris = sns.load_dataset('iris')

# Display the first few rows
print("Original Iris Dataset:")
print(iris.head())

Original Iris Dataset:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [5]:
# Introduce null values
iris_with_na = iris.copy()
np.random.seed(42)
na_indices = np.random.choice(iris.index, size=15, replace=False)
for i in na_indices:
    random_col = np.random.choice(iris.columns[:-1]) # Avoid introducing NA in the target variable
    iris_with_na.loc[i, random_col] = None

# Introduce outliers
iris_with_outliers = iris_with_na.copy()
iris_with_outliers.loc[20, 'sepal_length'] = 15.0
iris_with_outliers.loc[35, 'petal_width'] = 0.1

# Introduce a redundant feature (highly correlated with sepal_length)
iris_with_redundant = iris_with_outliers.copy()
iris_with_redundant['sepal_length_plus_noise'] = iris_with_redundant['sepal_length'] + np.random.normal(0, 0.1, len(iris))

# Display the dataset with introduced issues
print("\nIris Dataset with Introduced Issues (First few rows):")
print(iris_with_redundant.head())

# Check for null values
print("\nNumber of Null Values per Column:")
print(iris_with_redundant.isnull().sum())

# Visualize potential outliers (simple box plots)
import matplotlib.pyplot as plt
sns.boxplot(data=iris_with_redundant.drop(columns=['species']))
plt.title("Box Plots of Features (with potential outliers)")
plt.show()

# Check correlation matrix to see potential redundancy
correlation_matrix = iris_with_redundant.drop(columns=['species']).corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

NameError: name 'np' is not defined

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Handle null values using imputation (mean strategy)
imputer = SimpleImputer(strategy='mean')
iris_imputed = pd.DataFrame(imputer.fit_transform(iris_with_redundant.drop(columns=['species'])), columns=iris.columns[:-1])
iris_imputed['species'] = iris_with_redundant['species']

print("\nIris Dataset After Imputation (First few rows):")
print(iris_imputed.head())
print("\nNumber of Null Values After Imputation:")
print(iris_imputed.isnull().sum())

# Handle outliers (simple capping for demonstration - more sophisticated methods exist)
def cap_outliers(series, lower_quantile=0.01, upper_quantile=0.99):
    lower_bound = series.quantile(lower_quantile)
    upper_bound = series.quantile(upper_quantile)
    return series.clip(lower=lower_bound, upper=upper_bound)

iris_no_outliers = iris_imputed.copy()
for col in iris_no_outliers.columns[:-1]:
    iris_no_outliers[col] = cap_outliers(iris_no_outliers[col])

# Visualize after outlier capping
sns.boxplot(data=iris_no_outliers.drop(columns=['species']))
plt.title("Box Plots After Outlier Capping")
plt.show()

# Handle redundant features (simple removal - more advanced techniques like PCA exist)
iris_no_redundant = iris_no_outliers.drop(columns=['sepal_length_plus_noise'])
print("\nIris Dataset After Redundant Feature Removal (First few rows):")
print(iris_no_redundant.head())

# Feature Scaling (Normalization/Standardization)
scaler = StandardScaler()
X = iris_no_redundant.drop(columns=['species'])
y = iris_no_redundant['species']
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
iris_scaled = pd.concat([X_scaled, y], axis=1)

print("\nScaled Iris Dataset (First few rows):")
print(iris_scaled.head())

# For inconsistent categorical data (not prominent here, but an example):
# iris_processed['species'] = iris_processed['species'].str.lower().str.strip()

print("\nProcessed Iris Dataset (Ready for Model Training - First few rows):")
print(iris_scaled.head())

**Task 2**: Evaluate Model Performance

**Objective**: Assess the impact of data quality improvements on model performance.

**Steps**:
1. Train a simple ML model with and without preprocessing.
2. Analyze and compare model performance metrics to evaluate the impact of data quality strategies.

In [None]:
# Write your code from here
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the Iris dataset again to have a clean starting point for the 'original'
iris_original = sns.load_dataset('iris')

# Introduce the same data quality issues as before
iris_with_issues = iris_original.copy()
np.random.seed(42)
na_indices = np.random.choice(iris_original.index, size=15, replace=False)
for i in na_indices:
    random_col = np.random.choice(iris_original.columns[:-1])
    iris_with_issues.loc[i, random_col] = None
iris_with_outliers = iris_with_issues.copy()
iris_with_outliers.loc[20, 'sepal_length'] = 15.0
iris_with_outliers.loc[35, 'petal_width'] = 0.1
iris_with_redundant = iris_with_outliers.copy()
iris_with_redundant['sepal_length_plus_noise'] = iris_with_redundant['sepal_length'] + np.random.normal(0, 0.1, len(iris_original))

# Preprocess the dataset (as done in the previous task)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Impute missing values
imputer = SimpleImputer(strategy='mean')
iris_imputed = pd.DataFrame(imputer.fit_transform(iris_with_redundant.drop(columns=['species'])), columns=iris_original.columns[:-1])
iris_imputed['species'] = iris_with_redundant['species']

# Cap outliers
def cap_outliers(series, lower_quantile=0.01, upper_quantile=0.99):
    lower_bound = series.quantile(lower_quantile)
    upper_bound = series.quantile(upper_quantile)
    return series.clip(lower=lower_bound, upper=upper_bound)

iris_no_outliers = iris_imputed.copy()
for col in iris_no_outliers.columns[:-1]:
    iris_no_outliers[col] = cap_outliers(iris_no_outliers[col])

# Remove redundant feature
iris_preprocessed = iris_no_outliers.drop(columns=['sepal_length_plus_noise'])

# Scale features
scaler = StandardScaler()
X_processed = iris_preprocessed.drop(columns=['species'])
y_processed = iris_preprocessed['species']
X_scaled = pd.DataFrame(scaler.fit_transform(X_processed), columns=X_processed.columns)
iris_scaled = pd.concat([X_scaled, y_processed], axis=1)

# 1. Train a simple ML model with and without preprocessing

# Model 1: Trained on data with issues
X_original = iris_with_redundant.drop(columns=['species'])
y_original = iris_with_redundant['species']
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_original, y_original, test_size=0.3, random_state=42)

model_original = LogisticRegression(solver='liblinear', multi_class='ovr')
model_original.fit(X_train_orig, y_train_orig)
y_pred_orig = model_original.predict(X_test_orig)
accuracy_original = accuracy_score(y_test_orig, y_pred_orig)

print("\n--- Model Performance on Data with Issues ---")
print(f"Accuracy: {accuracy_original:.4f}")
print("Classification Report:")
print(classification_report(y_test_orig, y_pred_orig))

# Model 2: Trained on preprocessed data
X_processed_scaled = iris_scaled.drop(columns=['species'])
y_processed_scaled = iris_scaled['species']
X_train_processed, X_test_processed, y_train_processed, y_test_processed = train_test_split(X_processed_scaled, y_processed_scaled, test_size=0.3, random_state=42)

model_processed = LogisticRegression(solver='liblinear', multi_class='ovr')
model_processed.fit(X_train_processed, y_train_processed)
y_pred_processed = model_processed.predict(X_test_processed)
accuracy_processed = accuracy_score(y_test_processed, y_pred_processed)

print("\n--- Model Performance on Preprocessed Data ---")
print(f"Accuracy: {accuracy_processed:.4f}")
print("Classification Report:")
print(classification_report(y_test_processed, y_pred_processed))

# 2. Analyze and compare model performance metrics

print("\n--- Comparison of Model Performance ---")
print(f"Accuracy on Data with Issues:     {accuracy_original:.4f}")
print(f"Accuracy on Preprocessed Data: {accuracy_processed:.4f}")

if accuracy_processed > accuracy_original:
    improvement = (accuracy_processed - accuracy_original) / accuracy_original * 100
    print(f"\nAccuracy improved by {improvement:.2f}% after preprocessing.")
elif accuracy_processed < accuracy_original:
    decline = (accuracy_original - accuracy_processed) / accuracy_original * 100
    print(f"\nAccuracy declined by {decline:.2f}% after preprocessing (this could happen due to aggressive preprocessing or the nature of the issues).")
else:
    print("\nAccuracy remained the same after preprocessing.")

print("\nObservations from Classification Reports:")
print("- Look at precision, recall, and F1-score for each class in both reports.")
print("- See if preprocessing helped in better classifying specific species (e.g., higher recall for a previously poorly classified species).")
print("- Note any changes in the support (number of samples) for each class in the test sets (should be similar due to the same split).")

NameError: name 'np' is not defined