In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# load dataset#1
file_path = "/Users/maissanafisa/Desktop/data.csv"
data = pd.read_csv(file_path)
data = pd.read_csv(file_path, delimiter=';')
data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [15]:
from sklearn.preprocessing import LabelEncoder

# Graduate is encoded as 1, and Dropout as 0
label_encoder = LabelEncoder()
data['Target'] = label_encoder.fit_transform(data['Target'])  

X = data.drop(columns=['Target'])
y = data['Target']

missing_values = X.isnull().sum().sum()

X.head(), y.head(), f"Missing values in features: {missing_values}"


(   Marital status  Application mode  Application order  Course  \
 0               1                17                  5     171   
 1               1                15                  1    9254   
 2               1                 1                  5    9070   
 3               1                17                  2    9773   
 4               2                39                  1    8014   
 
    Daytime/evening attendance\t  Previous qualification  \
 0                             1                       1   
 1                             1                       1   
 2                             1                       1   
 3                             1                       1   
 4                             0                       1   
 
    Previous qualification (grade)  Nacionality  Mother's qualification  \
 0                           122.0            1                      19   
 1                           160.0            1                       1   
 2       

In [16]:
test_sizes = [0.2, 0.5, 0.8]

In [17]:
# Random Forest Classifier
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20,50]
}

def hyperparameter_tuning_rf(X_train, y_train):
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid_rf,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.cv_results_

# Evaluate Random Forest Classifier
partition_results_rf = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params, cv_results = hyperparameter_tuning_rf(X_train, y_train)
    preds_train = tuned_clf.predict(X_train)
    preds_test = tuned_clf.predict(X_test)

    train_accuracy = accuracy_score(y_train, preds_train)
    test_accuracy = accuracy_score(y_test, preds_test)
    validation_accuracy = max(cv_results['mean_test_score'])  

    results = [{
        'Classifier': 'Random Forest',
        'Train Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(validation_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Best Parameters': best_params
    }]

    partition_results_rf[f"Random Forest - Test size: {test_size}"] = pd.DataFrame(results)

# Display Random Forest results
for partition, df in partition_results_rf.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


Random Forest - Test size: 0.2 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                        |
|:--------------|-----------------:|----------------------:|----------------:|:---------------------------------------|
| Random Forest |           0.9994 |                 0.768 |           0.774 | {'max_depth': 20, 'n_estimators': 200} |


Random Forest - Test size: 0.5 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                          |
|:--------------|-----------------:|----------------------:|----------------:|:-----------------------------------------|
| Random Forest |                1 |                0.7731 |          0.7685 | {'max_depth': None, 'n_estimators': 300} |


Random Forest - Test size: 0.8 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                        |
|:--------------|-------------

In [21]:
# Logistic Regression Classifier

# Hyperparameter grid for Logistic Regression
partition_results = {}
test_sizes = [0.2, 0.5, 0.8]
param_grid_lr = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__solver': ['lbfgs', 'liblinear', 'saga']
}

# Hyperparameter tuning function
def hyperparameter_tuning_lr(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(max_iter=2000, random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_lr,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.cv_results_

# Evaluate Logistic Regression Classifier
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params, cv_results = hyperparameter_tuning_lr(X_train, y_train)
    preds_train = tuned_clf.predict(X_train)
    preds_test = tuned_clf.predict(X_test)

    train_accuracy = accuracy_score(y_train, preds_train)
    validation_accuracy = max(cv_results['mean_test_score'])
    test_accuracy = accuracy_score(y_test, preds_test)

    results = [{
        'Classifier': 'Logistic Regression',
        'Train Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(validation_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Best Parameters': best_params
    }]

    partition_results[f"Logistic Regression - Test size: {test_size}"] = pd.DataFrame(results)

# Display Logistic Regression results
for partition, df in partition_results.items():
    if "Logistic Regression" in partition:
        print(f"{partition} Results:")
        print(df.to_markdown(index=False))
        print("\n")


Logistic Regression - Test size: 0.2 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                                 |
|:--------------------|-----------------:|----------------------:|----------------:|:------------------------------------------------|
| Logistic Regression |           0.7765 |                0.7624 |          0.7627 | {'model__C': 100, 'model__solver': 'liblinear'} |


Logistic Regression - Test size: 0.5 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                               |
|:--------------------|-----------------:|----------------------:|----------------:|:----------------------------------------------|
| Logistic Regression |           0.7749 |                0.7572 |          0.7654 | {'model__C': 1, 'model__solver': 'liblinear'} |


Logistic Regression - Test size: 0.8 Results:
| Classifier          |   Train Accuracy |   Validatio

In [22]:
# Hyperparameter grid for SVM
param_grid_svm = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

# Hyperparameter tuning function for SVM
def hyperparameter_tuning_svm(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_svm,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate SVM Classifier
partition_results_svm = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_svm(X_train, y_train)

    # Training Accuracy
    train_accuracy = tuned_clf.score(X_train, y_train)

    # Validation Accuracy (Cross-Validation)
    val_accuracy = cross_val_score(tuned_clf, X_train, y_train, cv=3, scoring='accuracy').mean()

    # Test Accuracy
    preds = tuned_clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, preds)

    # Classification report
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Support Vector Machine',
        'Training Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(val_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_svm[f"SVM - Test size: {test_size}"] = pd.DataFrame(results)

# Display SVM results
for partition, df in partition_results_svm.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


SVM - Test size: 0.2 Results:
| Classifier             |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                       |
|:-----------------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:----------------------------------------------------------------------|
| Support Vector Machine |              0.7723 |                0.7626 |          0.7537 |      0.7466 |   0.7537 |     0.7452 | {'model__C': 100, 'model__gamma': 'scale', 'model__kernel': 'linear'} |


SVM - Test size: 0.5 Results:
| Classifier             |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                       |
|:-----------------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:-----

In [23]:
def evaluate_naive_bayes(X_train, X_test, y_train, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)

    # Training Accuracy
    train_accuracy = model.score(X_train, y_train)

    # Validation Accuracy (Cross-Validation)
    val_accuracy = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

    # Test Accuracy
    preds = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, preds)

    # Classification report
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Naive Bayes',
        'Training Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(val_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4)
    }]

    return results

# Evaluate Naive Bayes Classifier
partition_results_nb = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    results = evaluate_naive_bayes(X_train, X_test, y_train, y_test)
    partition_results_nb[f"Naive Bayes - Test size: {test_size}"] = pd.DataFrame(results)

# Display Naive Bayes results
for partition, df in partition_results_nb.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


Naive Bayes - Test size: 0.2 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|
| Naive Bayes  |               0.683 |                0.6734 |          0.6599 |      0.6347 |   0.6599 |     0.6432 |


Naive Bayes - Test size: 0.5 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|
| Naive Bayes  |              0.6849 |                0.6763 |          0.6759 |      0.6564 |   0.6759 |     0.6617 |


Naive Bayes - Test size: 0.8 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|--------------------:|-----------

In [24]:
# Hyperparameter grid for KNN
param_grid_knn = {
    'model__n_neighbors': [3, 5, 7, 10],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2]  # Minkowski p=1 (Manhattan), p=2 (Euclidean)
}

# Hyperparameter tuning function for KNN
def hyperparameter_tuning_knn(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier())
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_knn,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate KNN Classifier
partition_results_knn = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_knn(X_train, y_train)

    # Training Accuracy
    train_accuracy = tuned_clf.score(X_train, y_train)

    # Validation Accuracy (Cross-Validation)
    val_accuracy = cross_val_score(tuned_clf, X_train, y_train, cv=3, scoring='accuracy').mean()

    # Test Accuracy
    preds = tuned_clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, preds)

    # Classification report
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'KNN',
        'Training Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(val_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_knn[f"KNN - Test size: {test_size}"] = pd.DataFrame(results)

# Display KNN results
for partition, df in partition_results_knn.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


KNN - Test size: 0.2 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                        |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:-----------------------------------------------------------------------|
| KNN          |              0.7689 |                0.7172 |          0.7175 |      0.7037 |   0.7175 |     0.6987 | {'model__n_neighbors': 10, 'model__p': 1, 'model__weights': 'uniform'} |


KNN - Test size: 0.5 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                        |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:---------------------------------------------------

In [25]:
#load dataset 2
file_path = "/Users/maissanafisa/Downloads/online_shoppers_intention.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [26]:
label_encoder = LabelEncoder()

# Encode 'VisitorType' (e.g., Returning_Visitor -> 0, Others -> 1)
data['VisitorType'] = label_encoder.fit_transform(data['VisitorType'])

# Convert 'Weekend' and 'Revenue' from boolean to integer (True -> 1, False -> 0)
data['Weekend'] = data['Weekend'].astype(int)
data['Revenue'] = data['Revenue'].astype(int)
data['Month'] = label_encoder.fit_transform(data['Month'])

# Prepare features (X) and target (y)
X = data.drop(columns=['Revenue'])  # Features
y = data['Revenue']  # Target (binary: 0 for no purchase, 1 for purchase)

missing_values = X.isnull().sum().sum()
X.head(), y.head(), f"Missing values in features: {missing_values}"


(   Administrative  Administrative_Duration  Informational  \
 0               0                      0.0              0   
 1               0                      0.0              0   
 2               0                      0.0              0   
 3               0                      0.0              0   
 4               0                      0.0              0   
 
    Informational_Duration  ProductRelated  ProductRelated_Duration  \
 0                     0.0               1                 0.000000   
 1                     0.0               2                64.000000   
 2                     0.0               1                 0.000000   
 3                     0.0               2                 2.666667   
 4                     0.0              10               627.500000   
 
    BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  \
 0         0.20       0.20         0.0         0.0      2                 1   
 1         0.00       0.10         0.0  

In [27]:
# Random Forest Classifier
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20,50]
}

def hyperparameter_tuning_rf(X_train, y_train):
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid_rf,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.cv_results_

# Evaluate Random Forest Classifier
partition_results_rf = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params, cv_results = hyperparameter_tuning_rf(X_train, y_train)
    preds_train = tuned_clf.predict(X_train)
    preds_test = tuned_clf.predict(X_test)

    train_accuracy = accuracy_score(y_train, preds_train)
    test_accuracy = accuracy_score(y_test, preds_test)
    validation_accuracy = max(cv_results['mean_test_score'])  

    results = [{
        'Classifier': 'Random Forest',
        'Train Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(validation_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Best Parameters': best_params
    }]

    partition_results_rf[f"Random Forest - Test size: {test_size}"] = pd.DataFrame(results)

# Display Random Forest results
for partition, df in partition_results_rf.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


Random Forest - Test size: 0.2 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                          |
|:--------------|-----------------:|----------------------:|----------------:|:-----------------------------------------|
| Random Forest |                1 |                 0.905 |          0.8994 | {'max_depth': None, 'n_estimators': 100} |


Random Forest - Test size: 0.5 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                          |
|:--------------|-----------------:|----------------------:|----------------:|:-----------------------------------------|
| Random Forest |                1 |                0.9067 |          0.9019 | {'max_depth': None, 'n_estimators': 200} |


Random Forest - Test size: 0.8 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                        |
|:--------------|-------

In [28]:
# Logistic Regression Classifier

# Hyperparameter grid for Logistic Regression
partition_results = {}
test_sizes = [0.2, 0.5, 0.8]
param_grid_lr = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__solver': ['lbfgs', 'liblinear', 'saga']
}

# Hyperparameter tuning function
def hyperparameter_tuning_lr(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(max_iter=2000, random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_lr,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.cv_results_

# Evaluate Logistic Regression Classifier
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params, cv_results = hyperparameter_tuning_lr(X_train, y_train)
    preds_train = tuned_clf.predict(X_train)
    preds_test = tuned_clf.predict(X_test)

    train_accuracy = accuracy_score(y_train, preds_train)
    validation_accuracy = max(cv_results['mean_test_score'])
    test_accuracy = accuracy_score(y_test, preds_test)

    results = [{
        'Classifier': 'Logistic Regression',
        'Train Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(validation_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Best Parameters': best_params
    }]

    partition_results[f"Logistic Regression - Test size: {test_size}"] = pd.DataFrame(results)

# Display Logistic Regression results
for partition, df in partition_results.items():
    if "Logistic Regression" in partition:
        print(f"{partition} Results:")
        print(df.to_markdown(index=False))
        print("\n")


Logistic Regression - Test size: 0.2 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                           |
|:--------------------|-----------------:|----------------------:|----------------:|:------------------------------------------|
| Logistic Regression |           0.8854 |                0.8849 |          0.8832 | {'model__C': 1, 'model__solver': 'lbfgs'} |


Logistic Regression - Test size: 0.5 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                           |
|:--------------------|-----------------:|----------------------:|----------------:|:------------------------------------------|
| Logistic Regression |           0.8868 |                0.8863 |          0.8822 | {'model__C': 1, 'model__solver': 'lbfgs'} |


Logistic Regression - Test size: 0.8 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |   Test Accuracy |

In [29]:
# Hyperparameter grid for SVM
param_grid_svm = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

# Hyperparameter tuning function for SVM
def hyperparameter_tuning_svm(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_svm,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate SVM Classifier
partition_results_svm = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_svm(X_train, y_train)

    # Training Accuracy
    train_accuracy = tuned_clf.score(X_train, y_train)

    # Validation Accuracy (Cross-Validation)
    val_accuracy = cross_val_score(tuned_clf, X_train, y_train, cv=3, scoring='accuracy').mean()

    # Test Accuracy
    preds = tuned_clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, preds)

    # Classification report
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Support Vector Machine',
        'Training Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(val_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_svm[f"SVM - Test size: {test_size}"] = pd.DataFrame(results)

# Display SVM results
for partition, df in partition_results_svm.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


SVM - Test size: 0.2 Results:
| Classifier             |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                  |
|:-----------------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:-----------------------------------------------------------------|
| Support Vector Machine |              0.9074 |                 0.892 |          0.8844 |      0.8727 |   0.8844 |     0.8724 | {'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'rbf'} |


SVM - Test size: 0.5 Results:
| Classifier             |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                  |
|:-----------------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:-------------------------

In [30]:
from sklearn.model_selection import cross_val_score

def evaluate_naive_bayes(X_train, X_test, y_train, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)

    # Training Accuracy
    train_accuracy = model.score(X_train, y_train)

    # Validation Accuracy (Cross-Validation)
    val_accuracy = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()

    # Test Accuracy
    preds = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, preds)

    # Classification report
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Naive Bayes',
        'Training Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(val_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4)
    }]

    return results

# Evaluate Naive Bayes Classifier
partition_results_nb = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    results = evaluate_naive_bayes(X_train, X_test, y_train, y_test)
    partition_results_nb[f"Naive Bayes - Test size: {test_size}"] = pd.DataFrame(results)

# Display Naive Bayes results
for partition, df in partition_results_nb.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


Naive Bayes - Test size: 0.2 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|
| Naive Bayes  |              0.8479 |                0.8483 |          0.8439 |      0.8474 |   0.8439 |     0.8456 |


Naive Bayes - Test size: 0.5 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|
| Naive Bayes  |              0.8509 |                0.8503 |          0.8503 |      0.8546 |   0.8503 |     0.8523 |


Naive Bayes - Test size: 0.8 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|--------------------:|-----------

In [31]:
# Hyperparameter grid for KNN
param_grid_knn = {
    'model__n_neighbors': [3, 5, 7, 10],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2]  # Minkowski p=1 (Manhattan), p=2 (Euclidean)
}

# Hyperparameter tuning function for KNN
def hyperparameter_tuning_knn(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier())
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_knn,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate KNN Classifier
partition_results_knn = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_knn(X_train, y_train)

    # Training Accuracy
    train_accuracy = tuned_clf.score(X_train, y_train)

    # Validation Accuracy (Cross-Validation)
    val_accuracy = cross_val_score(tuned_clf, X_train, y_train, cv=3, scoring='accuracy').mean()

    # Test Accuracy
    preds = tuned_clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, preds)

    # Classification report
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'KNN',
        'Training Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(val_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_knn[f"KNN - Test size: {test_size}"] = pd.DataFrame(results)

# Display KNN results
for partition, df in partition_results_knn.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


KNN - Test size: 0.2 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                       |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:----------------------------------------------------------------------|
| KNN          |              0.8988 |                0.8807 |          0.8783 |      0.8644 |   0.8783 |     0.8626 | {'model__n_neighbors': 7, 'model__p': 2, 'model__weights': 'uniform'} |


KNN - Test size: 0.5 Results:
| Classifier   |   Training Accuracy |   Validation Accuracy |   Test Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                         |
|:-------------|--------------------:|----------------------:|----------------:|------------:|---------:|-----------:|:-----------------------------------------------------

In [32]:
#load dataset 3
file_path = "/Users/maissanafisa/Downloads/Maternal Health Risk Data Set.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [33]:
label_encoder = LabelEncoder()
data['RiskLevel'] = label_encoder.fit_transform(data['RiskLevel'])  # "high risk" -> 0, "low risk" -> 1

# Split into features (X) and target (y)
X = data.drop(columns=['RiskLevel'])  
y = data['RiskLevel'] 

missing_values = X.isnull().sum().sum()

print(X.head())  
print(y.head())  
print(f"Missing values in features: {missing_values}")

   Age  SystolicBP  DiastolicBP    BS  BodyTemp  HeartRate
0   25         130           80  15.0      98.0         86
1   35         140           90  13.0      98.0         70
2   29          90           70   8.0     100.0         80
3   30         140           85   7.0      98.0         70
4   35         120           60   6.1      98.0         76
0    0
1    0
2    0
3    0
4    1
Name: RiskLevel, dtype: int64
Missing values in features: 0


In [34]:
# Define hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20]
}

# Hyperparameter tuning function for Random Forest
def hyperparameter_tuning_rf(X_train, y_train):
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid_rf,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate Random Forest Classifier
partition_results_rf = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_rf(X_train, y_train)
    preds = tuned_clf.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Random Forest',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_rf[f"Random Forest - Test size: {test_size}"] = pd.DataFrame(results)

# Display Random Forest results
for partition, df in partition_results_rf.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")

Random Forest - Test size: 0.2 Results:
| Classifier    |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                          |
|:--------------|-----------:|------------:|---------:|-----------:|:-----------------------------------------|
| Random Forest |     0.8621 |      0.8697 |   0.8621 |     0.8634 | {'max_depth': None, 'n_estimators': 200} |


Random Forest - Test size: 0.5 Results:
| Classifier    |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                        |
|:--------------|-----------:|------------:|---------:|-----------:|:---------------------------------------|
| Random Forest |     0.7633 |      0.7668 |   0.7633 |     0.7559 | {'max_depth': 10, 'n_estimators': 300} |


Random Forest - Test size: 0.8 Results:
| Classifier    |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                          |
|:--------------|-----------:|------------:|---------:|-----------:|:-----------------------------

In [35]:
# Define hyperparameter grid for Logistic Regression
param_grid_lr = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__solver': ['lbfgs', 'liblinear']
}

# Hyperparameter tuning function for Logistic Regression
def hyperparameter_tuning_lr(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(max_iter=2000, random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_lr,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate Logistic Regression Classifier
partition_results_lr = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_lr(X_train, y_train)
    preds = tuned_clf.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Logistic Regression',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_lr[f"Logistic Regression - Test size: {test_size}"] = pd.DataFrame(results)

# Display Logistic Regression results
for partition, df in partition_results_lr.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


Logistic Regression - Test size: 0.2 Results:
| Classifier          |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                  |
|:--------------------|-----------:|------------:|---------:|-----------:|:-------------------------------------------------|
| Logistic Regression |      0.665 |      0.6874 |    0.665 |     0.6336 | {'model__C': 0.01, 'model__solver': 'liblinear'} |


Logistic Regression - Test size: 0.5 Results:
| Classifier          |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                 |
|:--------------------|-----------:|------------:|---------:|-----------:|:------------------------------------------------|
| Logistic Regression |      0.641 |      0.6361 |    0.641 |     0.6151 | {'model__C': 0.1, 'model__solver': 'liblinear'} |


Logistic Regression - Test size: 0.8 Results:
| Classifier          |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters               

In [36]:
# Define hyperparameter grid for SVM
param_grid_svm = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

# Hyperparameter tuning function for SVM
def hyperparameter_tuning_svm(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_svm,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate SVM Classifier
partition_results_svm = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_svm(X_train, y_train)
    preds = tuned_clf.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Support Vector Machine',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_svm[f"SVM - Test size: {test_size}"] = pd.DataFrame(results)

# Display SVM results
for partition, df in partition_results_svm.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


SVM - Test size: 0.2 Results:
| Classifier             |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                    |
|:-----------------------|-----------:|------------:|---------:|-----------:|:-------------------------------------------------------------------|
| Support Vector Machine |     0.7389 |       0.747 |   0.7389 |     0.7379 | {'model__C': 100, 'model__gamma': 'scale', 'model__kernel': 'rbf'} |


SVM - Test size: 0.5 Results:
| Classifier             |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                    |
|:-----------------------|-----------:|------------:|---------:|-----------:|:-------------------------------------------------------------------|
| Support Vector Machine |     0.6765 |      0.6692 |   0.6765 |     0.6691 | {'model__C': 100, 'model__gamma': 'scale', 'model__kernel': 'rbf'} |


SVM - Test size: 0.8 Results:
| Classifier            

In [37]:
def evaluate_naive_bayes(X_train, X_test, y_train, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Naive Bayes',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4)
    }]

    return results

# Evaluate Naive Bayes Classifier
partition_results_nb = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    results = evaluate_naive_bayes(X_train, X_test, y_train, y_test)
    partition_results_nb[f"Naive Bayes - Test size: {test_size}"] = pd.DataFrame(results)

# Display Naive Bayes results
for partition, df in partition_results_nb.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")

Naive Bayes - Test size: 0.2 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|-----------:|------------:|---------:|-----------:|
| Naive Bayes  |     0.6158 |       0.601 |   0.6158 |      0.575 |


Naive Bayes - Test size: 0.5 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|-----------:|------------:|---------:|-----------:|
| Naive Bayes  |     0.6095 |      0.5935 |   0.6095 |     0.5598 |


Naive Bayes - Test size: 0.8 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|-----------:|------------:|---------:|-----------:|
| Naive Bayes  |     0.5911 |      0.5778 |   0.5911 |     0.5634 |




In [38]:
param_grid_knn = {
    'model__n_neighbors': [3, 5, 7, 10],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2]  # Minkowski p=1 (Manhattan), p=2 (Euclidean)
}

# Hyperparameter tuning function for KNN
def hyperparameter_tuning_knn(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier())
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_knn,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate KNN Classifier
partition_results_knn = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_knn(X_train, y_train)
    preds = tuned_clf.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'KNN',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_knn[f"KNN - Test size: {test_size}"] = pd.DataFrame(results)

# Display KNN results
for partition, df in partition_results_knn.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")

KNN - Test size: 0.2 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                         |
|:-------------|-----------:|------------:|---------:|-----------:|:------------------------------------------------------------------------|
| KNN          |     0.8079 |      0.8144 |   0.8079 |     0.8095 | {'model__n_neighbors': 10, 'model__p': 2, 'model__weights': 'distance'} |


KNN - Test size: 0.5 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                         |
|:-------------|-----------:|------------:|---------:|-----------:|:------------------------------------------------------------------------|
| KNN          |     0.7712 |      0.7759 |   0.7712 |     0.7708 | {'model__n_neighbors': 10, 'model__p': 1, 'model__weights': 'distance'} |


KNN - Test size: 0.8 Results:
| Classifier   |   Accuracy |   Precision |   Recall |

In [39]:
file_path = "/Users/maissanafisa/Desktop/ai4i2020.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [48]:
def preprocess_data(data):
    # Initialize LabelEncoder
    label_encoder = LabelEncoder()
    
    # Encode 'Type' column
    data['Type'] = label_encoder.fit_transform(data['Type'])
    
    # Encode 'Product ID' column
    data['Product ID'] = label_encoder.fit_transform(data['Product ID'])
    
    return data

In [49]:
data = preprocess_data(data)

# Define features (X) and target (y)
X = data.drop(columns=['Machine failure'])  # Target is 'Machine failure'
y = data['Machine failure']

# Check for missing values
missing_values = X.isnull().sum().sum()
print(f"Missing values in features: {missing_values}")

X.head(), y.head()

Missing values in features: 0


(   UDI  Product ID  Type  Air temperature [K]  Process temperature [K]  \
 0    1        7003     2                298.1                    308.6   
 1    2        1003     1                298.2                    308.7   
 2    3        1004     1                298.1                    308.5   
 3    4        1005     1                298.2                    308.6   
 4    5        1006     1                298.2                    308.7   
 
    Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  TWF  HDF  PWF  OSF  \
 0                    1551         42.8                0    0    0    0    0   
 1                    1408         46.3                3    0    0    0    0   
 2                    1498         49.4                5    0    0    0    0   
 3                    1433         39.5                7    0    0    0    0   
 4                    1408         40.0                9    0    0    0    0   
 
    RNF  
 0    0  
 1    0  
 2    0  
 3    0  
 4    0  ,
 0   

In [50]:

# Random Forest Classifier
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20,50]
}

def hyperparameter_tuning_rf(X_train, y_train):
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid_rf,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.cv_results_

# Evaluate Random Forest Classifier
partition_results_rf = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params, cv_results = hyperparameter_tuning_rf(X_train, y_train)
    preds_train = tuned_clf.predict(X_train)
    preds_test = tuned_clf.predict(X_test)

    train_accuracy = accuracy_score(y_train, preds_train)
    test_accuracy = accuracy_score(y_test, preds_test)
    validation_accuracy = max(cv_results['mean_test_score'])  

    results = [{
        'Classifier': 'Random Forest',
        'Train Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(validation_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Best Parameters': best_params
    }]

    partition_results_rf[f"Random Forest - Test size: {test_size}"] = pd.DataFrame(results)

# Display Random Forest results
for partition, df in partition_results_rf.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


Random Forest - Test size: 0.2 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                          |
|:--------------|-----------------:|----------------------:|----------------:|:-----------------------------------------|
| Random Forest |                1 |                0.9991 |           0.999 | {'max_depth': None, 'n_estimators': 100} |


Random Forest - Test size: 0.5 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                          |
|:--------------|-----------------:|----------------------:|----------------:|:-----------------------------------------|
| Random Forest |                1 |                 0.999 |          0.9992 | {'max_depth': None, 'n_estimators': 100} |


Random Forest - Test size: 0.8 Results:
| Classifier    |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                          |
|:--------------|-----

In [55]:
# Logistic Regression Classifier

# Hyperparameter grid for Logistic Regression
partition_results = {}
test_sizes = [0.2, 0.5, 0.8]
param_grid_lr = {
    'model__C': [0.01, 0.1, 1, 10, 100],
    'model__solver': ['lbfgs', 'liblinear', 'saga']
}

# Hyperparameter tuning function
def hyperparameter_tuning_lr(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(max_iter=10000, random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_lr,
        cv=3,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.cv_results_

# Evaluate Logistic Regression Classifier
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params, cv_results = hyperparameter_tuning_lr(X_train, y_train)
    preds_train = tuned_clf.predict(X_train)
    preds_test = tuned_clf.predict(X_test)

    train_accuracy = accuracy_score(y_train, preds_train)
    validation_accuracy = max(cv_results['mean_test_score'])
    test_accuracy = accuracy_score(y_test, preds_test)

    results = [{
        'Classifier': 'Logistic Regression',
        'Train Accuracy': round(train_accuracy, 4),
        'Validation Accuracy': round(validation_accuracy, 4),
        'Test Accuracy': round(test_accuracy, 4),
        'Best Parameters': best_params
    }]

    partition_results[f"Logistic Regression - Test size: {test_size}"] = pd.DataFrame(results)

# Display Logistic Regression results
for partition, df in partition_results.items():
    if "Logistic Regression" in partition:
        print(f"{partition} Results:")
        print(df.to_markdown(index=False))
        print("\n")


Logistic Regression - Test size: 0.2 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                              |
|:--------------------|-----------------:|----------------------:|----------------:|:---------------------------------------------|
| Logistic Regression |           0.9991 |                0.9991 |           0.999 | {'model__C': 0.01, 'model__solver': 'lbfgs'} |


Logistic Regression - Test size: 0.5 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |   Test Accuracy | Best Parameters                              |
|:--------------------|-----------------:|----------------------:|----------------:|:---------------------------------------------|
| Logistic Regression |            0.999 |                 0.999 |          0.9992 | {'model__C': 0.01, 'model__solver': 'lbfgs'} |


Logistic Regression - Test size: 0.8 Results:
| Classifier          |   Train Accuracy |   Validation Accuracy |

In [52]:
# Define hyperparameter grid for SVM
param_grid_svm = {
    'model__C': [0.1, 1, 10, 100],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

# Hyperparameter tuning function for SVM
def hyperparameter_tuning_svm(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', SVC(random_state=42))
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_svm,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate SVM Classifier
partition_results_svm = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_svm(X_train, y_train)
    preds = tuned_clf.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Support Vector Machine',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_svm[f"SVM - Test size: {test_size}"] = pd.DataFrame(results)

# Display SVM results
for partition, df in partition_results_svm.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")


SVM - Test size: 0.2 Results:
| Classifier             |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                       |
|:-----------------------|-----------:|------------:|---------:|-----------:|:----------------------------------------------------------------------|
| Support Vector Machine |      0.999 |       0.999 |    0.999 |      0.999 | {'model__C': 0.1, 'model__gamma': 'scale', 'model__kernel': 'linear'} |


SVM - Test size: 0.5 Results:
| Classifier             |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                       |
|:-----------------------|-----------:|------------:|---------:|-----------:|:----------------------------------------------------------------------|
| Support Vector Machine |     0.9992 |      0.9992 |   0.9992 |     0.9992 | {'model__C': 0.1, 'model__gamma': 'scale', 'model__kernel': 'linear'} |


SVM - Test size: 0.8 Results:
| Clas

In [53]:
def evaluate_naive_bayes(X_train, X_test, y_train, y_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'Naive Bayes',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4)
    }]

    return results

# Evaluate Naive Bayes Classifier
partition_results_nb = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    results = evaluate_naive_bayes(X_train, X_test, y_train, y_test)
    partition_results_nb[f"Naive Bayes - Test size: {test_size}"] = pd.DataFrame(results)

# Display Naive Bayes results
for partition, df in partition_results_nb.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")

Naive Bayes - Test size: 0.2 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|-----------:|------------:|---------:|-----------:|
| Naive Bayes  |      0.998 |       0.998 |    0.998 |      0.998 |


Naive Bayes - Test size: 0.5 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|-----------:|------------:|---------:|-----------:|
| Naive Bayes  |      0.996 |      0.9962 |    0.996 |     0.9961 |


Naive Bayes - Test size: 0.8 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score |
|:-------------|-----------:|------------:|---------:|-----------:|
| Naive Bayes  |     0.9961 |      0.9963 |   0.9961 |     0.9962 |




In [54]:
param_grid_knn = {
    'model__n_neighbors': [3, 5, 7, 10],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2]  # Minkowski p=1 (Manhattan), p=2 (Euclidean)
}

# Hyperparameter tuning function for KNN
def hyperparameter_tuning_knn(X_train, y_train):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', KNeighborsClassifier())
    ])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid_knn,
        cv=3,  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Evaluate KNN Classifier
partition_results_knn = {}
for test_size in test_sizes:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    tuned_clf, best_params = hyperparameter_tuning_knn(X_train, y_train)
    preds = tuned_clf.predict(X_test)

    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, output_dict=True)

    results = [{
        'Classifier': 'KNN',
        'Accuracy': round(accuracy, 4),
        'Precision': round(report['weighted avg']['precision'], 4),
        'Recall': round(report['weighted avg']['recall'], 4),
        'F1-Score': round(report['weighted avg']['f1-score'], 4),
        'Best Parameters': best_params
    }]

    partition_results_knn[f"KNN - Test size: {test_size}"] = pd.DataFrame(results)

# Display KNN results
for partition, df in partition_results_knn.items():
    print(f"{partition} Results:")
    print(df.to_markdown(index=False))
    print("\n")

KNN - Test size: 0.2 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                       |
|:-------------|-----------:|------------:|---------:|-----------:|:----------------------------------------------------------------------|
| KNN          |      0.999 |       0.999 |    0.999 |      0.999 | {'model__n_neighbors': 3, 'model__p': 1, 'model__weights': 'uniform'} |


KNN - Test size: 0.5 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score | Best Parameters                                                       |
|:-------------|-----------:|------------:|---------:|-----------:|:----------------------------------------------------------------------|
| KNN          |     0.9992 |      0.9992 |   0.9992 |     0.9992 | {'model__n_neighbors': 3, 'model__p': 1, 'model__weights': 'uniform'} |


KNN - Test size: 0.8 Results:
| Classifier   |   Accuracy |   Precision |   Recall |   F1-Score 

In [63]:
import pandas as pd

# Example summary data
summary_data = {
    'Classifier': [
        'Random Forest', 'Random Forest', 'Random Forest',
        'Logistic Regression', 'Logistic Regression', 'Logistic Regression',
        'SVM', 'SVM', 'SVM',
        'Naive Bayes', 'Naive Bayes', 'Naive Bayes',
        'KNN', 'KNN', 'KNN'
    ] * 4,  # Repeat for 4 datasets
    'Dataset': [
        'Graduation Prediction', 'Graduation Prediction', 'Graduation Prediction',
        'Graduation Prediction', 'Graduation Prediction', 'Graduation Prediction',
        'Graduation Prediction', 'Graduation Prediction', 'Graduation Prediction',
        'Graduation Prediction', 'Graduation Prediction', 'Graduation Prediction',
        'Graduation Prediction', 'Graduation Prediction', 'Graduation Prediction',
        'Online Shopper Behavior', 'Online Shopper Behavior', 'Online Shopper Behavior',
        'Online Shopper Behavior', 'Online Shopper Behavior', 'Online Shopper Behavior',
        'Online Shopper Behavior', 'Online Shopper Behavior', 'Online Shopper Behavior',
        'Online Shopper Behavior', 'Online Shopper Behavior', 'Online Shopper Behavior',
        'Online Shopper Behavior', 'Online Shopper Behavior', 'Online Shopper Behavior',
        'Maternal Health Risk', 'Maternal Health Risk', 'Maternal Health Risk',
        'Maternal Health Risk', 'Maternal Health Risk', 'Maternal Health Risk',
        'Maternal Health Risk', 'Maternal Health Risk', 'Maternal Health Risk',
        'Maternal Health Risk', 'Maternal Health Risk', 'Maternal Health Risk',
        'Maternal Health Risk', 'Maternal Health Risk', 'Maternal Health Risk',
        'Machine Failure Prediction', 'Machine Failure Prediction', 'Machine Failure Prediction',
        'Machine Failure Prediction', 'Machine Failure Prediction', 'Machine Failure Prediction',
        'Machine Failure Prediction', 'Machine Failure Prediction', 'Machine Failure Prediction',
        'Machine Failure Prediction', 'Machine Failure Prediction', 'Machine Failure Prediction',
        'Machine Failure Prediction', 'Machine Failure Prediction', 'Machine Failure Prediction'
    ],
    'Train-Test Split': [
        '20/80', '50/50', '80/20',
        '20/80', '50/50', '80/20',
        '20/80', '50/50', '80/20',
        '20/80', '50/50', '80/20',
        '20/80', '50/50', '80/20'
    ] * 4,  # Repeat for 4 datasets
    'Accuracy': [
        0.774, 0.7685, 0.7599,
        0.7627, 0.7654, 0.7523,
        0.7537, 0.7654, 0.7494,
        0.6599, 0.6759, 0.6853,
        0.7175, 0.7129, 0.6912,
        0.8994, 0.9019, 0.9014,
        0.8832, 0.8822, 0.8838,
        0.8844, 0.89, 0.8852,
        0.8439, 0.8503, 0.843,
        0.8783, 0.8775, 0.8722,
        0.8621, 0.7633, 0.7241,
        0.665, 0.641, 0.6121,
        0.7389, 0.6765, 0.6367,
        0.6158, 0.6095, 0.5911,
        0.8079, 0.7712, 0.6601,
        0.999, 0.9992, 0.999,
        0.998, 0.996, 0.9961,
        0.999, 0.9992, 0.999,
        0.999, 0.9992, 0.999,
        0.998, 0.996, 0.9961
    ]
}

# Convert to DataFrame
summary_df = pd.DataFrame(summary_data)

# Pivot the table to organize by classifier and test partitions
pivot_df = summary_df.pivot_table(
    index=['Classifier', 'Dataset'], 
    columns='Train-Test Split', 
    values='Accuracy'
).reset_index()

# Save or display the pivoted DataFrame
output_path = "/Users/maissanafisa/Desktop/Classifier_Performance_Pivoted.csv"
pivot_df.to_csv(output_path, index=False)

print(f"Pivoted table saved to {output_path}")


Pivoted table saved to /Users/maissanafisa/Desktop/Classifier_Performance_Pivoted.csv
