In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from tkinter import Tk, filedialog

# Render matplotlib figures inline in the notebook's cell output
%matplotlib inline
# Apply seaborn's "whitegrid" style to the plots drawn by later cells
sns.set(style="whitegrid")


In [None]:
# Read the dataset from a CSV file located in the working directory
dataset_path = "knn_ready_dataset.csv"
df = pd.read_csv(dataset_path)

# Display the first rows as a quick sanity check of the loaded table
df.head()


In [None]:
# Subtype labels retained for the breast cancer analysis
breast_types = ['LuminalA', 'LuminalB', 'HER2', 'Basal', 'NormalLike']

# Keep only the rows whose 'Label' value is one of the subtypes above
is_breast_subtype = df['Label'].isin(breast_types)
df_bc = df[is_breast_subtype]

print("Filtered dataset shape:", df_bc.shape)

# Bar chart of the number of samples per subtype
subtype_counts = df_bc['Label'].value_counts()
counts_ax = subtype_counts.plot.bar(title='Breast Cancer Subtype Distribution')
counts_ax.set_ylabel("Number of Samples")
plt.show()


In [None]:
# Feature columns: one per standard amino acid, named by single-letter code
amino_acids = list("ACDEFGHIKLMNPQRSTVWY")
X = df_bc[amino_acids].values
y = df_bc['Label'].values

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. Fitting the scaler on the full matrix leaks test-set
# statistics into training and makes the evaluation optimistic.
# The partition depends only on the number of rows, `y` and `random_state`,
# so it is the same partition obtained by splitting an already-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Standardize features: fit on the training rows only, then apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Fully scaled matrix, kept for any code that still refers to `X_scaled`
X_scaled = scaler.transform(X)


In [None]:
# Build a 5-nearest-neighbour classifier and train it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

print("KNN model trained on breast cancer types.")


In [None]:
# Predicted subtype for every held-out sample
y_pred = knn.predict(X_test)

# Confusion matrix, with rows/columns ordered like the classifier's classes
class_labels = knn.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# Draw on an explicitly created Axes so the title is attached to the matrix plot
cm_fig, cm_ax = plt.subplots()
disp.plot(ax=cm_ax, cmap="YlGnBu", xticks_rotation=45)
cm_ax.set_title("Confusion Matrix - Breast Cancer Subtypes")
plt.show()

# Text summary of the per-class metrics
print(" Classification Report:\n")
print(classification_report(y_test, y_pred))


In [None]:
# Helper: amino acid composition (AAC) calculator
def compute_aac(sequence, amino_acids):
    """Return the fraction of `sequence` made up by each entry of `amino_acids`.

    The sequence is upper-cased before counting. Each fraction is the number
    of occurrences of the entry divided by the full sequence length, so any
    characters not listed in `amino_acids` still count toward the length.

    Args:
        sequence: Residue string to analyse.
        amino_acids: Iterable of residue codes; the result follows its order.

    Returns:
        A list with one value per entry of `amino_acids`. For an empty
        sequence every value is 0.
    """
    residues = sequence.upper()
    total = len(residues)

    # Guard against division by zero on an empty sequence
    if not total:
        return [0 for _ in amino_acids]

    return [residues.count(code) / total for code in amino_acids]

# Let the user select a FASTA file through a native file dialog.
# A hidden Tk root window hosts the dialog. It is destroyed afterwards so that
# no invisible window / Tcl interpreter is left alive in the notebook kernel.
root = Tk()
try:
    root.withdraw()
    # Keep the dialog above other windows; otherwise it can open behind the browser
    root.attributes("-topmost", True)
    fasta_path = filedialog.askopenfilename(
        parent=root,
        title="Select a FASTA file",
        filetypes=[("FASTA files", "*.fasta *.fa")],
    )
finally:
    root.destroy()

# askopenfilename returns an empty value when the dialog is cancelled
if fasta_path:
    for record in SeqIO.parse(fasta_path, "fasta"):
        # Build the feature vector in the `amino_acids` column order, then
        # apply the already-fitted scaler before predicting
        aac_features = compute_aac(str(record.seq), amino_acids)
        aac_scaled = scaler.transform([aac_features])
        prediction = knn.predict(aac_scaled)
        print(f"\n Sequence ID: {record.id}")
        print(f" Predicted Breast Cancer Subtype: {prediction[0]}")
else:
    print(" No file selected.")
