In [1]:
!pip install numpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Fetch the breast cancer dataset and pull out the feature matrix and labels
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both the training and the test features
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Seed the random number generators before building the network so that weight
# initialisation and batch shuffling are repeatable, in line with the fixed
# random_state=42 used for the split, SMOTE and the classifiers in this script.
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)

# Define the autoencoder architecture: a 32-unit ReLU hidden layer (the
# encoding) followed by a sigmoid output layer with one unit per input feature
n_features = X_train_scaled.shape[1]
input_layer = Input(shape=(n_features,))
encoded = Dense(32, activation='relu')(input_layer)
decoded = Dense(n_features, activation='sigmoid')(encoded)

# Train the network to reconstruct its own scaled input
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
# NOTE(review): the test split is passed as validation data. No callbacks are
# configured, so it only yields a per-epoch validation loss; do not use that
# loss for model selection, or the test set stops being a clean hold-out.
autoencoder.fit(
    X_train_scaled,
    X_train_scaled,
    epochs=50,
    batch_size=128,
    validation_data=(X_test_scaled, X_test_scaled),
)

# Use the trained autoencoder to encode the data: the encoder shares the input
# and hidden layer of the trained model and outputs the hidden activations
encoder = Model(input_layer, encoded)
X_train_encoded = encoder.predict(X_train_scaled)
X_test_encoded = encoder.predict(X_test_scaled)

# Wrap the encoded matrices in DataFrames whose columns are named
# feature_0, feature_1, ...
df_train_encoded = pd.DataFrame(X_train_encoded).add_prefix('feature_')
df_test_encoded = pd.DataFrame(X_test_encoded).add_prefix('feature_')

# Write both frames to CSV files, leaving out the row index
for frame, csv_path in (
    (df_train_encoded, 'encoded_train_data.csv'),
    (df_test_encoded, 'encoded_test_data.csv'),
):
    frame.to_csv(csv_path, index=False)

# Define classifiers as (display name, estimator) pairs
classifiers = [
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('XGBoost', XGBClassifier(n_estimators=100, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
]

# Oversample the encoded training set with SMOTE a single time. The resampled
# data depends only on the training data and the fixed seed, not on the
# classifier, so there is no need to recompute it inside the loop.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_encoded, y_train)

# Training variants in evaluation order: label -> (features, targets)
training_sets = {
    'No SMOTE': (X_train_encoded, y_train),
    'SMOTE': (X_resampled, y_resampled),
}

# Each entry is (classifier name, training variant, F1 score on the test split)
results = []

# Fit every classifier on every training variant and score it on the untouched
# test split. Calling fit() again on the same estimator retrains it from
# scratch, so the second variant is not influenced by the first.
for name, clf in classifiers:
    for variant, (X_fit, y_fit) in training_sets.items():
        clf.fit(X_fit, y_fit)
        y_pred = clf.predict(X_test_encoded)
        results.append((name, variant, f1_score(y_test, y_pred)))

# Tabulate the collected (classifier, training variant, score) tuples
result_columns = ['Classifier', 'SMOTE', 'F1 Score']
df_results = pd.DataFrame.from_records(results, columns=result_columns)

# Plotting: one bar-chart panel per classifier plus a final comparison panel.
# The grid is sized from the number of classifiers so that the comparison
# panel never lands on a slot already used by a bar chart; with three
# classifiers this is a 2x2 grid on a 12x8 inch figure.
classifier_names = df_results['Classifier'].unique()
n_panels = len(classifier_names) + 1
n_cols = 2
n_rows = (n_panels + n_cols - 1) // n_cols  # ceiling division

plt.figure(figsize=(6 * n_cols, 4 * n_rows))

# Bar chart for F1 scores of each classifier with and without SMOTE
for i, name in enumerate(classifier_names):
    df_plot = df_results[df_results['Classifier'] == name]
    plt.subplot(n_rows, n_cols, i + 1)
    plt.bar(df_plot['SMOTE'], df_plot['F1 Score'], color=['skyblue', 'lightgreen'])
    plt.xlabel('SMOTE')
    plt.ylabel('F1 Score')
    plt.title(f'F1 Score for {name}')
    plt.ylim(0, 1)

# Line plot to compare F1 scores across classifiers, drawn in the last slot
plt.subplot(n_rows, n_cols, n_panels)
for smote_status in ['No SMOTE', 'SMOTE']:
    df_plot = df_results[df_results['SMOTE'] == smote_status]
    plt.plot(df_plot['Classifier'], df_plot['F1 Score'], marker='o', label=smote_status)

plt.xlabel('Classifier')
plt.ylabel('F1 Score')
plt.title('F1 Score Comparison Across Classifiers')
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()

# Save the plot before showing it
plt.savefig('f1_score_comparison_plots.png')

# Show the plot
plt.show()




ModuleNotFoundError: No module named 'numpy'