In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [19]:


data = pd.read_excel('DryBeanDataSet.xlsx')

# Data Quality Issues Handling

# 1. Missing Values
data = data.replace('?', np.nan)

# Handle missing values for numerical features
numerical_features = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 
                      'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 
                      'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 
                      'ShapeFactor4', 'ShapeFactor5', 'ShapeFactor6']

data_knn = data.copy()
for feature in numerical_features:
    data_knn[feature] = data_knn[feature].fillna(data_knn[feature].median())

data_dt = data.copy()

data_knn = data_knn.dropna(subset=['Class'])
data_dt = data_dt.dropna(subset=['Class'])

# 2. Outliers
def handle_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
        df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    return df

data_knn = handle_outliers(data_knn, numerical_features)
data_dt = handle_outliers(data_dt, numerical_features)

# Remove negative values in ConvexArea
data_knn = data_knn[data_knn['ConvexArea'] >= 0]
data_dt = data_dt[data_dt['ConvexArea'] >= 0]

for df in [data_knn, data_dt]:
    df['Extent'] = pd.to_numeric(df['Extent'], errors='coerce')
    df['Compactness'] = pd.to_numeric(df['Compactness'], errors='coerce')
    df['ShapeFactor6'] = pd.to_numeric(df['ShapeFactor6'], errors='coerce')


  data = data.replace('?', np.nan)


In [20]:

X_knn = data_knn.drop('Class', axis=1)
y_knn = data_knn['Class']
X_dt = data_dt.drop('Class', axis=1)
y_dt = data_dt['Class']

# splitter
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X_knn, y_knn, test_size=0.2, random_state=42, stratify=y_knn)
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X_dt, y_dt, test_size=0.2, random_state=42, stratify=y_dt)

In [21]:
numerical_columns = data_knn.select_dtypes(include=[np.number]).columns
categorical_columns = data_knn.select_dtypes(exclude=[np.number]).columns

# Encode categorical variables
le = LabelEncoder()
for col in categorical_columns:
    data_knn[col] = le.fit_transform(data_knn[col].astype(str))

def plot_pca_variance(X):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)
    cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
    
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(cumulative_variance_ratio) + 1), cumulative_variance_ratio, 'bo-')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('PCA Cumulative Explained Variance Ratio')
    plt.grid(True)
    plt.savefig('pca_cumulative_variance.png')
    plt.close()
    
    print("PCA plot saved as 'pca_cumulative_variance.png'")
    
    # Return the number of components that explain 95% of the variance
    return np.argmax(cumulative_variance_ratio >= 0.95) + 1

# Apply PCA to numerical features only
X_knn_numerical = data_knn[numerical_columns]
n_components = plot_pca_variance(X_knn_numerical)
print(f"Number of components explaining 95% variance: {n_components}")

# Now, create the final feature set for k-NN
X_knn_pca = pd.DataFrame(
    PCA(n_components=n_components).fit_transform(StandardScaler().fit_transform(X_knn_numerical)),
    columns=[f'PC{i+1}' for i in range(n_components)]
)

# Add back the encoded categorical features
for col in categorical_columns:
    X_knn_pca[col] = data_knn[col]

PCA plot saved as 'pca_cumulative_variance.png'
Number of components explaining 95% variance: 8


In [23]:

numerical_columns = X_train_knn.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X_train_knn.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_columns),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_columns)
    ])

# k-NN specific pipeline
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('knn', KNeighborsClassifier())
])

# Decision Tree specific pipeline
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('dt', DecisionTreeClassifier())
])

knn_param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

dt_param_grid = {
    'dt__max_depth': [5, 10, 15, 20, None],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
    'dt__criterion': ['gini', 'entropy']
}


cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

knn_grid_search = GridSearchCV(knn_pipeline, knn_param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
knn_grid_search.fit(X_train_knn, y_train_knn)

dt_grid_search = GridSearchCV(dt_pipeline, dt_param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
dt_grid_search.fit(X_train_dt, y_train_dt)

#best modlle
best_knn = knn_grid_search.best_estimator_
best_dt = dt_grid_search.best_estimator_

In [24]:
def print_results(grid_search, model_name):
    print(f"\n{model_name} Results:")
    print("Best Parameters:", grid_search.best_params_)
    print("Best Cross-Validation Score:", grid_search.best_score_)
    
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    
    print(f"\nCross-validation results:")
    print(f"Mean Accuracy: {means.mean():.4f} (+/- {stds.mean()*2:.4f})")
    print(f"Best Accuracy: {means.max():.4f}")
    print(f"Worst Accuracy: {means.min():.4f}")

print_results(knn_grid_search, "K-Nearest Neighbors")
print_results(dt_grid_search, "Decision Tree")


K-Nearest Neighbors Results:
Best Parameters: {'knn__metric': 'euclidean', 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
Best Cross-Validation Score: 0.9241307756717667

Cross-validation results:
Mean Accuracy: 0.9198 (+/- 0.0053)
Best Accuracy: 0.9241
Worst Accuracy: 0.9109

Decision Tree Results:
Best Parameters: {'dt__criterion': 'gini', 'dt__max_depth': 20, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 10}
Best Cross-Validation Score: 0.9791243307705969

Cross-validation results:
Mean Accuracy: 0.9683 (+/- 0.0037)
Best Accuracy: 0.9791
Worst Accuracy: 0.8997


In [26]:

knn_predictions = best_knn.predict(X_test_knn)
dt_predictions = best_dt.predict(X_test_dt)
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    
    print(f"{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_true, y_pred)
    print(cm)
    
    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'{model_name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()
    
    print(f"Confusion matrix plot saved as '{model_name.lower().replace(' ', '_')}_confusion_matrix.png'")
    print("\n")

evaluate_model(y_test_knn, knn_predictions, "k-NN")
evaluate_model(y_test_dt, dt_predictions, "Decision Tree")

# Print best parameters
print("Best k-NN Parameters:")
print(knn_grid_search.best_params_)
print("\nBest Decision Tree Parameters:")
print(dt_grid_search.best_params_)


k-NN Performance:
Accuracy: 0.9187
Precision: 0.9204
Recall: 0.9187
F1-score: 0.9189

Confusion Matrix:
[[225   0  29   0   1   1   7]
 [  0 104   0   0   0   0   0]
 [  4   4 308   0   5   1   4]
 [  0   0   0 655   1   9  44]
 [  0   0   8   3 365   0   9]
 [  2   0   0   8   0 377  18]
 [  0   0   2  51   6   4 464]]
Confusion matrix plot saved as 'k-nn_confusion_matrix.png'


Decision Tree Performance:
Accuracy: 0.9812
Precision: 0.9812
Recall: 0.9812
F1-score: 0.9812

Confusion Matrix:
[[240   1  18   0   0   0   4]
 [  0 104   0   0   0   0   0]
 [ 15   0 307   0   0   0   4]
 [  0   0   0 708   0   0   1]
 [  0   0   0   0 385   0   0]
 [  0   0   0   0   0 404   1]
 [  7   0   0   0   0   0 520]]
Confusion matrix plot saved as 'decision_tree_confusion_matrix.png'


Best k-NN Parameters:
{'knn__metric': 'euclidean', 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}

Best Decision Tree Parameters:
{'dt__criterion': 'gini', 'dt__max_depth': 20, 'dt__min_samples_leaf': 4, 'dt__min

In [27]:

print("\nk-NN Cross-validation Results:")
print(f"Mean Accuracy: {knn_grid_search.best_score_:.4f}")
print(f"Standard Deviation: {knn_grid_search.cv_results_['std_test_score'][knn_grid_search.best_index_]:.4f}")

print("\nDecision Tree Cross-validation Results:")
print(f"Mean Accuracy: {dt_grid_search.best_score_:.4f}")
print(f"Standard Deviation: {dt_grid_search.cv_results_['std_test_score'][dt_grid_search.best_index_]:.4f}")

# Learning curves
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, title):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1, 
        train_sizes=np.linspace(0.1, 1.0, 5), scoring='accuracy')
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy")
    plt.grid()
    
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    
    plt.legend(loc="best")
    plt.savefig(f'{title.lower().replace(" ", "_")}_learning_curve.png')
    plt.close()
    print(f"Learning curve saved as '{title.lower().replace(' ', '_')}_learning_curve.png'")

plot_learning_curve(best_knn, X_knn, y_knn, "k-NN Learning Curve")
plot_learning_curve(best_dt, X_dt, y_dt, "Decision Tree Learning Curve")


k-NN Cross-validation Results:
Mean Accuracy: 0.9241
Standard Deviation: 0.0014

Decision Tree Cross-validation Results:
Mean Accuracy: 0.9791
Standard Deviation: 0.0015
Learning curve saved as 'k-nn_learning_curve_learning_curve.png'
Learning curve saved as 'decision_tree_learning_curve_learning_curve.png'
