<a href="https://colab.research.google.com/github/samuelAemro12/MachineLeraning_SchoolProject-/blob/main/MlCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only step: attach Google Drive to the runtime's filesystem.
# The dataset is read later from a path underneath this mount point.
from google.colab import drive
# Mounts the Drive at /content/drive.
# NOTE(review): presumably prompts for authorization on first use — confirm.
drive.mount('/content/drive')

In [None]:
# Data manipulation, analysis, and Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Location of the raw CSV inside the mounted Google Drive.
file_path = "/content/drive/My Drive/ml-project/breast-cancer.csv"

# Read the file into a DataFrame; every later cell works on this frame.
breast_cancer = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the load.
breast_cancer.head(n=5)

In [None]:
# Remove exact duplicate rows, rebinding the name instead of mutating in place.
# (Original author's note: the raw file had 286 rows before this step.)
breast_cancer = breast_cancer.drop_duplicates()
remaining_rows = breast_cancer.shape[0]
print("\nNumber of Rows After Dropping Duplicates:", remaining_rows)

In [None]:
# The file marks unknown values with a literal '?'. Convert those markers to
# NaN so that pandas' missing-value tooling (isnull / fillna) recognises them.
breast_cancer = breast_cancer.replace(to_replace='?', value=np.nan)

In [None]:
# Count the NaN entries in every column and report them.
missing_per_column = breast_cancer.isnull().sum()
print("\nMissing Values in Each Column:")
print(missing_per_column)

In [None]:
# Heatmap of the boolean null mask: each cell of the plot is one cell of the
# frame, so missing entries show up as contrasting marks.
fig, ax = plt.subplots(figsize=(8, 4))
null_mask = breast_cancer.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# Impute missing values column by column:
#   * object (categorical) columns -> most frequent value (mode)
#   * every other column           -> median
# A column with no observed values at all cannot be imputed: its mode is an
# empty Series and its median is NaN. Fail early with a clear message instead
# of an opaque lookup error (object columns) or silently leaving NaNs behind
# (numeric columns).
for col in breast_cancer.columns:
    column = breast_cancer[col]

    if not column.isnull().any():
        continue  # nothing to fill in this column

    if not column.notna().any():
        raise ValueError(
            f"Column '{col}' contains only missing values and cannot be "
            "imputed; drop it or supply values before this step."
        )

    if column.dtype == 'object':
        # mode() can return several values on ties; take the first, by position.
        fill_value = column.mode().iloc[0]
    else:
        fill_value = column.median()

    breast_cancer[col] = column.fillna(fill_value)

In [None]:
# Re-draw the null-mask heatmap after imputation; a uniform plot means no
# missing entries remain.
fig, ax = plt.subplots(figsize=(8, 4))
remaining_nulls = breast_cancer.isnull()
sns.heatmap(remaining_nulls, cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# Numeric confirmation of the imputation: per-column NaN counts.
missing_after_imputation = breast_cancer.isnull().sum()
print("\nMissing Values After Handling:")
print(missing_after_imputation)

In [None]:
# Integer-encode every categorical (object-dtype) column.
#
# Each column gets its OWN fitted LabelEncoder, stored in `label_encoders`
# under the column name. Refitting a single shared encoder would discard the
# mapping of every column but the last, making it impossible to decode
# predictions or to check which integer stands for which class label.
#
# Note: this dict is rebuilt each time the cell runs. Once the columns are
# encoded they are no longer object dtype, so a re-run leaves it empty; run the
# notebook top to bottom from a fresh load.
le = LabelEncoder()  # kept so the name `le` stays defined even with no object columns
label_encoders = {}

for col in breast_cancer.columns:
    if breast_cancer[col].dtype == 'object':
        le = LabelEncoder()
        breast_cancer[col] = le.fit_transform(breast_cancer[col])
        label_encoders[col] = le

print("\nEncoded Dataset Preview:")
print(breast_cancer.head())

# Later metrics pass a numeric pos_label for the target, so show which integer
# each original 'Class' label received (code i corresponds to classes_[i]).
if 'Class' in label_encoders:
    class_mapping = {
        str(label): code
        for code, label in enumerate(label_encoders['Class'].classes_)
    }
    print("\nClass label encoding:", class_mapping)

In [None]:
# Columns treated as numeric for distribution plots (the original author notes
# that 'deg-malig' is the only numerical column in this dataset).
numerical_cols = ['deg-malig']

# One histogram with a KDE overlay per numeric feature.
for feature in numerical_cols:
    plt.figure(figsize=(6, 4))
    hist_ax = sns.histplot(breast_cancer[feature], bins=10, kde=True)
    hist_ax.set_title(f'Distribution of {feature}')
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# 'Class' is the prediction target; every other column is a feature.
target_column = 'Class'
y = breast_cancer[target_column]
X = breast_cancer.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The split is stratified on the target so the train and test sets keep the
# same class proportions. With a test set this small, an unstratified split can
# leave a class under- or over-represented and skew the per-class metrics
# (precision / recall / F1) reported by the model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Check the shape of the splits
print("\nTraining and Testing Set Shapes:")
print("Training set:", X_train.shape, y_train.shape)
print("Testing set:", X_test.shape, y_test.shape)

In [None]:
# Random Forest: fit on the training split (seeded for repeatable results).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
rf_pred = rf_model.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
rf_report = classification_report(y_test, rf_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
print("\nRandom Forest Classification Report:")
print(rf_report)
print("Accuracy:", rf_accuracy)

In [None]:
# Gaussian Naive Bayes: fit on the training split.
nb_model = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows.
nb_pred = nb_model.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
nb_report = classification_report(y_test, nb_pred)
nb_accuracy = accuracy_score(y_test, nb_pred)
print("\nNaive Bayes Classification Report:")
print(nb_report)
print("Accuracy:", nb_accuracy)

In [None]:
# Support Vector Machine with a linear kernel, fit on the training split.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
svm_pred = svm_model.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy.
svm_report = classification_report(y_test, svm_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)
print("\nSVM Classification Report:")
print(svm_report)
print("Accuracy:", svm_accuracy)

In [None]:
# Gather test-set accuracy for the three fitted models, in a fixed order that
# matches the display names.
models = ['Random Forest', 'Naive Bayes', 'SVM']
accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in (rf_pred, nb_pred, svm_pred)
]

# Tabulate the results: one row per model.
performance_df = pd.DataFrame({'Model': models}).assign(Accuracy=accuracies)
print("\nModel Comparison:")
print(performance_df)

# Bar chart of the same table.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=performance_df, x='Model', y='Accuracy', ax=ax)
ax.set_title('Model Comparison')
plt.show()

In [None]:
def _recurrence_f1(predictions):
    """Return the F1-score of `predictions` against y_test for class label 1.

    Label 1 is the numeric code the original author associates with
    'recurrence-events' — NOTE(review): confirm against the label encoding.
    """
    return f1_score(y_test, predictions, pos_label=1)


# Refresh the test-set predictions of all three fitted models.
rf_pred, nb_pred, svm_pred = (
    fitted_model.predict(X_test)
    for fitted_model in (rf_model, nb_model, svm_model)
)

# Random Forest
rf_f1 = _recurrence_f1(rf_pred)
print("Random Forest F1-Score (Recurrence-Events):", rf_f1)

# Naive Bayes
nb_f1 = _recurrence_f1(nb_pred)
print("Naive Bayes F1-Score (Recurrence-Events):", nb_f1)

# SVM
svm_f1 = _recurrence_f1(svm_pred)
print("SVM F1-Score (Recurrence-Events):", svm_f1)

In [None]:
# Collect the per-model F1-scores as column-name -> values.
f1_scores = dict([
    ('Model', ['Random Forest', 'Naive Bayes', 'SVM']),
    ('F1-Score', [rf_f1, nb_f1, svm_f1]),
])

# Tabular form for seaborn.
f1_df = pd.DataFrame.from_dict(f1_scores)

# Bar chart: one bar per model, coloured by model, without a legend.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=f1_df,
    x='Model',
    y='F1-Score',
    hue='Model',
    palette='Blues_d',
    legend=False,
    ax=ax,
)
ax.set_title('Comparison of F1-Scores for Recurrence-Events')
ax.set_ylabel('F1-Score')
ax.set_xlabel('Model')
ax.set_ylim(0, 1)  # F1 lies in [0, 1]; fix the axis so bars are comparable
plt.show()