# In [None]:
# Input FASTA files read by main(): sequences from the first file are labelled 1
# (positive) and sequences from the second are labelled 0 (negative).
positive_fasta = r'D:\IISER\SEM 7\BI 4164\Markov model order 1\positive_sample.fa'
negative_fasta = r'D:\IISER\SEM 7\BI 4164\Markov model order 1\negative_sample.fa'


# In [None]:
from Bio import SeqIO
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve
import numpy as np
import matplotlib.pyplot as plt
import time
import os

def load_fasta(file_path):
    """Read every record of a FASTA file and return the sequences as strings.

    Args:
        file_path: Path of the FASTA file handed to ``SeqIO.parse``.

    Returns:
        list: One ``str`` per record, in the order the parser yields them.
    """
    print(f"Loading sequences from {file_path}...")
    sequences = []
    for record in SeqIO.parse(file_path, "fasta"):
        # Only the sequence text is kept; record ids are discarded here.
        sequences.append(str(record.seq))
    print(f"Loaded {len(sequences)} sequences from {file_path}")
    return sequences

def create_kmer_features(sequences, k=3, vectorizer=None):
    """Turn sequences into character k-mer count features.

    Args:
        sequences: Iterable of sequence strings to vectorize.
        k: Length of the character n-grams. Only used when a new vectorizer
            is built; it is ignored if ``vectorizer`` is supplied.
        vectorizer: An already fitted vectorizer to reuse. When ``None`` a new
            ``CountVectorizer`` is created and fitted on ``sequences``.

    Returns:
        tuple: ``(features, vectorizer)`` where ``vectorizer`` is either the
        one passed in or the newly fitted one.
    """
    print(f"Creating {k}-mer features...")
    began = time.time()
    if vectorizer is not None:
        # Reuse the existing vocabulary so columns line up with training data.
        features = vectorizer.transform(sequences)
    else:
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(k, k))
        features = vectorizer.fit_transform(sequences)
    elapsed = time.time() - began
    print(f"Created features in {elapsed:.2f} seconds")
    return features, vectorizer

def train_svm_classifier(X, y):
    """Fit an RBF-kernel SVM with probability estimates enabled.

    Args:
        X: Feature matrix passed straight to ``SVC.fit``.
        y: Target labels passed straight to ``SVC.fit``.

    Returns:
        The fitted ``SVC`` instance.
    """
    print("Training SVM classifier...")
    began = time.time()
    # probability=True is required for the predict_proba calls made by callers.
    model = SVC(kernel='rbf', probability=True)
    model.fit(X, y)
    elapsed = time.time() - began
    print(f"Trained classifier in {elapsed:.2f} seconds")
    return model

def plot_and_save_roc_curve(y_true, y_score, output_dir):
    """Draw the ROC curve for the given scores and write it to a PNG file.

    Args:
        y_true: True binary labels.
        y_score: Scores for the positive class, one per label.
        output_dir: Directory in which ``roc_curve.png`` is written.

    Returns:
        None. The figure is saved to disk and then closed.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    destination = os.path.join(output_dir, 'roc_curve.png')

    plt.figure()
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=f'ROC curve (AUC = {roc_auc:.2f})',
    )
    # Dashed diagonal drawn as a reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    # Persist the figure, then release it so figures do not accumulate.
    plt.savefig(destination, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"ROC curve saved to {destination}")

def plot_and_save_precision_recall_curve(y_true, y_score, output_dir):
    """Plot the precision-recall curve and save it as a PNG file.

    The plot title reports the average precision (AP), computed as the
    recall-weighted sum of precisions ``sum((R_n - R_{n-1}) * P_n)``. The
    previous implementation reported the unweighted mean of the precision
    array, which is not average precision.

    Args:
        y_true: True binary labels.
        y_score: Scores for the positive class, one per label.
        output_dir: Directory in which ``precision_recall_curve.png`` is
            written.

    Returns:
        None. The figure is saved to disk and then closed.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    # scikit-learn returns recall in decreasing order, so the differences are
    # negative and the sum is negated; the final (P=1, R=0) point is excluded.
    avg_precision = -np.sum(np.diff(recall) * np.asarray(precision)[:-1])
    output_path = os.path.join(output_dir, 'precision_recall_curve.png')

    plt.figure()
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'Precision-Recall curve: AP={avg_precision:.2f}')

    # Save the plot, then close the figure so figures do not accumulate.
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Precision-Recall curve saved to {output_path}")

def main():
    """Train, evaluate, and apply the k-mer SVM classifier end to end.

    Steps:
        1. Create the ``svm_classifier_results`` output directory.
        2. Load the positive and negative FASTA files and label them 1 / 0.
        3. Split 80/20 into train and test sets (``random_state=42``).
        4. Build k-mer count features, fitting the vectorizer on the training
           split only and reusing it for the test split.
        5. Train the SVM, write the classification report, and save the ROC
           and precision-recall plots.
        6. Classify every record of the hard-coded ``test.fa`` file and write
           one line per record to ``output_svm.txt`` in the working directory.

    Returns:
        None.
    """
    overall_start_time = time.time()

    # Create output directory for plots
    output_dir = 'svm_classifier_results'
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}")

    # Load positive and negative sequences
    positive_sequences = load_fasta(positive_fasta)
    negative_sequences = load_fasta(negative_fasta)

    # Combine sequences and create labels
    all_sequences = positive_sequences + negative_sequences
    labels = [1] * len(positive_sequences) + [0] * len(negative_sequences)

    # Split data into training and testing sets
    print("Splitting data into train and test sets...")
    train_sequences, test_sequences, y_train, y_test = train_test_split(
        all_sequences, labels, test_size=0.2, random_state=42
    )

    # Create k-mer features; the vectorizer is fitted on the training split
    # only so the test split does not influence the vocabulary.
    X_train, vectorizer = create_kmer_features(train_sequences)
    X_test, _ = create_kmer_features(test_sequences, vectorizer=vectorizer)

    # Train SVM classifier
    clf = train_svm_classifier(X_train, y_train)

    # Evaluate the classifier
    print("Evaluating the classifier...")
    y_pred = clf.predict(X_test)
    y_score = clf.predict_proba(X_test)[:, 1]

    # Save classification report to file
    report = classification_report(y_test, y_pred)
    with open(os.path.join(output_dir, 'classification_report.txt'), 'w') as f:
        f.write(report)
    print(report)

    # Plot and save ROC curve
    plot_and_save_roc_curve(y_test, y_score, output_dir)

    # Plot and save Precision-Recall curve
    plot_and_save_precision_recall_curve(y_test, y_score, output_dir)

    print(f"Total runtime: {time.time() - overall_start_time:.2f} seconds")

    def classify_sequence(sequence):
        """Return ``(label, positive_probability)`` for a single sequence."""
        X_new, _ = create_kmer_features([sequence], vectorizer=vectorizer)
        prob = clf.predict_proba(X_new)[0]
        label = "Positive (Promoter)" if prob[1] > 0.5 else "Negative (No Promoter)"
        return label, prob[1]

    def extract_sequences(fasta_file):
        """Extract ``(record id, sequence string)`` tuples from a FASTA file.

        Args:
            fasta_file (str): Path to the FASTA file.

        Returns:
            list: A list of tuples, where each tuple contains the sequence ID
            and sequence.
        """
        # SeqIO comes from the module-level import; no local re-import needed.
        return [
            (record.id, str(record.seq))
            for record in SeqIO.parse(fasta_file, "fasta")
        ]

    fasta_file_path = r"D:\IISER\SEM 7\BI 4164\Trying new stuff\test.fa"
    extracted_sequences = extract_sequences(fasta_file_path)

    with open('output_svm.txt', 'w') as f:
        for seq_id, unknown_sequence in extracted_sequences:
            classification, probability = classify_sequence(unknown_sequence)
            # Build one string per record. The original built a tuple of
            # f-strings and then did ``tuple + "\n"``, raising TypeError.
            f.write(
                f"Sequence number: {seq_id}, "
                f"Probability of being a promoter: {probability:.2f}, "
                f"Classification: {classification}\n"
            )

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()