# Comparing the performance criteria for different Classification Machine Learning algorithms
1) Support Vector Machines (Kernels - Linear, poly, rbf, sigmoid)
2) Random Forest classifier
3) Neural Network (Multi-Layer Perceptron classifier)
4) Logistic Regression
5) K-Nearest Neighbours


In [1]:
# Create (or truncate) the .txt file that stores the performance metrics of the
# different machine learning algorithms. The context manager closes the handle
# straight away; the cells that record metrics reopen the file in append mode.
with open("ML_performance_metric.txt", "w"):
    pass

<_io.TextIOWrapper name='ML_performance_metric.txt' mode='w' encoding='utf-8'>

## Classification of Breast Cancer dataset using SVM

### Import dependencies

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


### Import the dataset and pre-processing of the data

In [3]:
# Read the raw CSV and discard the "id" column, which is not useful for prediction
data = pd.read_csv("breast_cancer.csv").drop(columns=["id"])

# Replace the textual 'diagnosis' labels with integer codes ('B' -> 0, 'M' -> 1)
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

# Target (y) is the encoded diagnosis; features (X) are all remaining columns
y = data["diagnosis"]
X = data.drop("diagnosis", axis=1)

In [4]:
# Count the missing entries in every column of the pre-processed frame
missing_per_column = data.isna().sum()
print(missing_per_column)

diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave_points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


### Handle missing values by imputing with the mean

In [5]:
# Mean-impute every feature column.
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test
# split performed in the next cell.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)
X = imputer.transform(X)

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Training the model using different kernels

In [7]:
# Train one SVC per kernel on the same split, print its metrics and append them
# to the shared report file.
# NOTE(review): X_train/X_test are used unscaled here, whereas the Logistic
# Regression and KNN sections standardise their features first — confirm this
# is intended for the comparison.
file_path = "ML_performance_metric.txt"
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for k in kernels:
    print(f"Kernel: {k}")

    # Fit on the training split, then predict the held-out rows
    svm = SVC(kernel=k, random_state=42)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)

    # PERFORMANCE METRICS on the test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1 Score: {f1:.4f}",
        "-" * 30,
        sep="\n",
    )

    algorithm_name = f"SVM: Kernel - {k}"

    # Append (never overwrite) so entries from every kernel/model accumulate
    with open(file_path, "a") as file:
        file.write(
            "".join(
                [
                    "-" * 50 + "\n",
                    f"Performance Metrics for {algorithm_name} \n",
                    f"Accuracy: {accuracy:.4f} \n",
                    f"Precision: {precision:.4f} \n",
                    f"Recall: {recall:.4f} \n",
                    f"F1 Score: {f1:.4f} \n",
                ]
            )
        )

    print(f"Metrics saved to {file_path}")

Kernel: linear


Accuracy: 0.9649
Precision: 0.9672
Recall: 0.9365
F1 Score: 0.9516
------------------------------
Metrics saved to ML_performance_metric.txt
Kernel: poly
Accuracy: 0.9415
Precision: 0.9818
Recall: 0.8571
F1 Score: 0.9153
------------------------------
Metrics saved to ML_performance_metric.txt
Kernel: rbf
Accuracy: 0.9357
Precision: 1.0000
Recall: 0.8254
F1 Score: 0.9043
------------------------------
Metrics saved to ML_performance_metric.txt
Kernel: sigmoid
Accuracy: 0.4678
Precision: 0.1818
Recall: 0.1270
F1 Score: 0.1495
------------------------------
Metrics saved to ML_performance_metric.txt


## Classification of the Breast Cancer dataset using Random Forest (RF) algorithm

### Import dependencies

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


### Import dataset & data pre-processing

In [9]:
# Read the raw CSV and discard the "id" column, which is not useful for prediction
data = pd.read_csv("breast_cancer.csv").drop(columns=["id"])

# Integer-encode the "diagnosis" column ('B' -> 0, 'M' -> 1)
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

# Target (y) is the encoded diagnosis; features (X) are every other column
y = data["diagnosis"]
X = data.drop("diagnosis", axis=1)

# Fill missing feature values with the column mean.
# NOTE(review): the imputer is fitted on all rows, before the train/test split.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)
X = imputer.transform(X)

### Split the dataset for Training and Testing

In [10]:
# 70/30 train/test partition with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Training the Random Forest algorithm

In [11]:
# Fit a seeded 100-tree random forest on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the held-out rows with the trained forest
y_pred = rf_model.predict(X_test)

# PERFORMANCE metrics on the test split, evaluated in a fixed order
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)


Accuracy: 0.9708
Precision: 0.9833
Recall: 0.9365
F1 Score: 0.9593


### Storing the data in a text file

In [12]:
# Append the Random Forest metrics (accuracy, precision, recall, f1 computed by
# the training cell) to the shared report file.
file_path = "ML_performance_metric.txt"
# Plain string: nothing to interpolate, and no trailing newline so the header
# line has the same layout as the SVM entries (no whitespace-only line after it).
algorithm_name = "Random Forest classifier"

# Append mode keeps the entries written by the earlier sections
with open(file_path, "a") as file:
    file.write("-" * 50 + "\n")
    file.write(f"Performance Metrics for {algorithm_name} \n")
    file.write(f"Accuracy: {accuracy:.4f} \n")
    file.write(f"Precision: {precision:.4f} \n")
    file.write(f"Recall: {recall:.4f} \n")
    file.write(f"F1 Score: {f1:.4f} \n")

print(f"Metrics saved to {file_path}")

Metrics saved to ML_performance_metric.txt


## Classification of the Breast Cancer dataset using a Neural Network (Multi-Layer Perceptron [MLP] Classifier)

### Import dependencies

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

### Import dataset and data pre-processing

In [14]:
# Read the raw CSV without the "id" column
data = pd.read_csv("breast_cancer.csv").drop(columns=["id"])

# Integer-encode the 'diagnosis' column ('B' -> 0, 'M' -> 1)
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

# Split the frame into the target (y) and the feature columns (X)
y = data["diagnosis"]
X = data.drop("diagnosis", axis=1)

# Fill missing feature values with the column mean.
# NOTE(review): the imputer is fitted on all rows, before the train/test split.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)
X = imputer.transform(X)

### Split the dataset for Training and Testing

In [15]:
# 70/30 train/test partition with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Training the Neural Network algorithm

In [16]:
# Multi-layer perceptron with one hidden layer of 100 units, seeded for
# repeatability and allowed up to 1000 training iterations.
# NOTE(review): the features fed to the network are not standardised in this
# section — confirm this is intended.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
)
mlp_model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = mlp_model.predict(X_test)

# PERFORMANCE metrics on the test split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the four metrics, one per line
metric_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
]
print("\n".join(metric_lines))


Accuracy: 0.9649
Precision: 0.9672
Recall: 0.9365
F1 Score: 0.9516


### Storing the data in a text file

In [17]:
# Append the neural-network metrics (accuracy, precision, recall, f1 computed by
# the training cell) to the shared report file.
file_path = "ML_performance_metric.txt"
# The model is an MLPClassifier, so the report label says "classifier" rather
# than "regression". Plain string with no trailing newline so the header line
# has the same layout as the SVM entries.
algorithm_name = "Neural Network (MLP Classifier)"

# Append mode keeps the entries written by the earlier sections
with open(file_path, "a") as file:
    file.write("-" * 50 + "\n")
    file.write(f"Performance Metrics for {algorithm_name} \n")
    file.write(f"Accuracy: {accuracy:.4f} \n")
    file.write(f"Precision: {precision:.4f} \n")
    file.write(f"Recall: {recall:.4f} \n")
    file.write(f"F1 Score: {f1:.4f} \n")

print(f"Metrics saved to {file_path}")

Metrics saved to ML_performance_metric.txt


## Classification of the Breast Cancer dataset using Logistic Regression

### Import dependencies

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Import the dataset and data pre-processing

In [19]:
# Read the raw CSV and discard the 'id' column, which is not useful for prediction
data = pd.read_csv("breast_cancer.csv").drop(columns=["id"])

# Integer-encode the 'diagnosis' column ('B' -> 0, 'M' -> 1)
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

# Split the frame into the target (y) and the feature columns (X)
y = data["diagnosis"]
X = data.drop("diagnosis", axis=1)

# Fill missing feature values with the column mean, then standardise the features.
# NOTE(review): both the imputer and the scaler are fitted on all rows, i.e.
# before the train/test split performed in the next cell.
imputer = SimpleImputer(strategy="mean").fit(X)
X = imputer.transform(X)

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

### Splitting the dataset for Training and Testing

In [20]:
# 70/30 train/test partition of the scaled features, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

### Training the Logistic Regression model

In [21]:
# Seeded logistic regression, allowed up to 1000 solver iterations
logistic_regression_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_regression_model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = logistic_regression_model.predict(X_test)

# PERFORMANCE metrics on the test split
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Emit the four metrics, one per line
for metric_line in (
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
):
    print(metric_line)


Accuracy: 0.9825
Precision: 0.9688
Recall: 0.9841
F1 Score: 0.9764


### Storing the data in a text file

In [22]:
# Append the Logistic Regression metrics (accuracy, precision, recall, f1
# computed by the training cell) to the shared report file.
file_path = "ML_performance_metric.txt"
# Plain string: nothing to interpolate, and no trailing newline so the header
# line has the same layout as the SVM entries (no whitespace-only line after it).
algorithm_name = "Logistic Regression"

# Append mode keeps the entries written by the earlier sections
with open(file_path, "a") as file:
    file.write("-" * 50 + "\n")
    file.write(f"Performance Metrics for {algorithm_name} \n")
    file.write(f"Accuracy: {accuracy:.4f} \n")
    file.write(f"Precision: {precision:.4f} \n")
    file.write(f"Recall: {recall:.4f} \n")
    file.write(f"F1 Score: {f1:.4f} \n")

print(f"Metrics saved to {file_path}")

Metrics saved to ML_performance_metric.txt


## Classification of the Breast Cancer dataset using K-Nearest Neighbours (KNN) algorithm

### Import dependencies

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

### Import the dataset and data pre-processing

In [24]:
# Read the raw CSV and discard the 'id' column, which is not useful for prediction
data = pd.read_csv("breast_cancer.csv").drop(columns=["id"])

# Integer-encode the 'diagnosis' column ('B' -> 0, 'M' -> 1)
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

# Split the frame into the target (y) and the feature columns (X)
y = data["diagnosis"]
X = data.drop("diagnosis", axis=1)

# Fill missing feature values with the column mean, then standardise the features.
# NOTE(review): both the imputer and the scaler are fitted on all rows, i.e.
# before the train/test split performed in the next cell.
imputer = SimpleImputer(strategy="mean").fit(X)
X = imputer.transform(X)

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


### Splitting the dataset for Training and Testing

In [25]:
# 70/30 train/test partition of the scaled features, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

### Training the KNN model

In [26]:
# k-nearest-neighbours classifier voting over the 5 closest training samples
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = knn_model.predict(X_test)

# PERFORMANCE metrics on the test split, evaluated in a fixed order
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)


Accuracy: 0.9591
Precision: 0.9516
Recall: 0.9365
F1 Score: 0.9440


### Storing the data in a text file

In [27]:
# Append the K-Nearest Neighbours metrics (accuracy, precision, recall, f1
# computed by the training cell) to the shared report file.
file_path = "ML_performance_metric.txt"
# Plain string: nothing to interpolate, and no trailing newline so the header
# line has the same layout as the SVM entries (no whitespace-only line after it).
algorithm_name = "K-Nearest Neighbours"

# Append mode keeps the entries written by the earlier sections
with open(file_path, "a") as file:
    file.write("-" * 50 + "\n")
    file.write(f"Performance Metrics for {algorithm_name} \n")
    file.write(f"Accuracy: {accuracy:.4f} \n")
    file.write(f"Precision: {precision:.4f} \n")
    file.write(f"Recall: {recall:.4f} \n")
    file.write(f"F1 Score: {f1:.4f} \n")

print(f"Metrics saved to {file_path}")

Metrics saved to ML_performance_metric.txt
