In [1]:
import pandas as pd
from Bio import SeqIO
import os
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [2]:
# Correct path to your directory containing FASTA files
fasta_dir = r"C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences"
# List to store sequence data
sequences_data = []

# Loop through each FASTA file in the directory
for fasta_file in os.listdir(fasta_dir):
    if fasta_file.endswith(".fa") or fasta_file.endswith(".fasta"):
        file_path = os.path.join(fasta_dir, fasta_file)
        print(file_path)
        record_seq=""
        record_id=""
        for n,record in enumerate(SeqIO.parse(file_path, "fasta")):
            if (n==0):
                record_id=record.id
            record_seq=record_seq+str(record.seq)
        print(record_id)
        sequences_data.append({
            "sequence_id": record_id,
            "sequence": str(record_seq)
        })

# Create a DataFrame from the sequence data
df = pd.DataFrame(sequences_data)

# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(df.head())

# inspect the DataFrame's structure
print("\nDataFrame Information:")
print(df.info())


C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence1.fa
ERR012226_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence10.fa
ERR012227_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence11.fa
ERR012228_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence12.fa
ERR012232_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence13.fa
ERR012260_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence14.fa
ERR012269_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence15.fa
ERR012274_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences\consensus_sequence16.fa
ERR012277_1.fastq
C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Deskto

In [20]:
# Paths to my data
fasta_dir_path = r'C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\consensus_sequences'

labels_file_path = r'C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\Drugs_Phenotype\Mefloquine Phenotype Dataset for training without NA.csv'
#'C:\Users\FEDGEN-LAB1-SYS7\OneDrive\Desktop\Mercy Akinwale\Drugs_Phenotype\DHA Phenotype Dataset for training.xlsx'
# Function to read FASTA files from a directory and extract sequences
def read_fasta_from_dir(dir_path):
    sequences = {}
    for fasta_file in os.listdir(dir_path):
        if fasta_file.endswith(".fa") or fasta_file.endswith(".fasta"):
            file_path = os.path.join(dir_path, fasta_file)
            for record in SeqIO.parse(file_path, "fasta"):
                sequences[record.id] = str(record.seq)
    return sequences

# Function to one-hot encode the sequence
def one_hot_encode(seq):
    print("---------------"+str(len(seq)))
    mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
    # return np.array([mapping[nuc] for nuc in seq.upper() if nuc in mapping])
    return np.array([mapping[nuc] for n,nuc in enumerate(seq.upper()) if ((n<1000000))])

# Read sequences from the FASTA directory
sequences = read_fasta_from_dir(fasta_dir_path)


In [4]:
#print (one_hot_encode(seq))


In [21]:
# Debug: Check the sequences read
print(f"Total sequences read: {len(sequences)}")
for seq_id, seq in list(sequences.items())[:5]:  # Print first 5 sequences
    print(f"ID: {seq_id}, Sequence: {seq[:50]}...")

# Read labels from CSV file
labels_df = pd.read_csv(labels_file_path)
labels_dict = labels_df.set_index('sequence_id').to_dict()['label']  # Assuming 'label' column for binary classification

# Debug: Check the labels read
print(f"Total labels read: {len(labels_dict)}")
for seq_id, label in list(labels_dict.items())[:5]:  # Print first 5 labels
    print(f"ID: {seq_id}, Label: {label}")


Total sequences read: 46
ID: ERR012226_1.fastq, Sequence: TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaaccct...
ID: NC_037280.1, Sequence: aaccctaaaccctaaaccctaaaccctaaaccctaaaccctaaacctaaa...
ID: NC_000521.4, Sequence: TAAACCCTAAATCTCTAAACCCTAAAGCTATACCTAAACCCTGAAGGTTA...
ID: NC_004318.2, Sequence: aaccctaaaccctgaaccctaaaccctaaaccctgaaccctgaaccctaa...
ID: NC_004326.2, Sequence: ctaaaccctgaaccctaaaccctgaaccctaaaccctaaaccctgaaccc...
Total labels read: 32
ID: ERR012494_1.fastq, Label: Resistant
ID: ERR012519_1.fastq, Label: Sensitive
ID: ERR012425_1.fastq, Label: Sensitive
ID: ERR012424_1.fastq, Label: Sensitive
ID: ERR029095_1.fastq, Label: Resistant


In [22]:
# Prepare the sequences and labels for model training
encoded_sequences = []
labels = []

df_dict=df.to_dict("records")

#display(labels_dict)

for seq_dict in df_dict:
    #print(seq_dict['sequence_id'])
    if seq_dict['sequence_id'] in labels_dict:
        #print(len(one_hot_encode(seq_dict['sequence'])))
        #jjfjf
        encoded_sequences.append(one_hot_encode(seq_dict['sequence'].upper()).flatten())
        labels.append(labels_dict[seq_dict['sequence_id']].split(','))

# hhhhg

# Convert to numpy arrays
print([f.shape for f in encoded_sequences])

encoded_sequences = np.array(encoded_sequences)


---------------23292372
---------------23292541
---------------23285585
---------------23292393
---------------23292457
---------------23287492
---------------23289777
---------------23286531
---------------23255091
---------------23292399
---------------23285407
---------------23289447
---------------23286572
---------------23252337
---------------23292541
---------------23287178
---------------23292547
---------------23258039
---------------23289229
---------------23286900
---------------23265735
---------------23292287
---------------23270800
---------------23274363
---------------23255999
---------------23251041
---------------23292244
---------------23288862
---------------23292472
---------------23292141
---------------23292461
[(4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (4000000,), (40

In [17]:
# Debug: Check the encoded sequences
print(f"Encoded sequences shape: {encoded_sequences.shape}")
if encoded_sequences.size > 0:
    print(f"First encoded sequence: {encoded_sequences[0]}")


Encoded sequences shape: (31, 4000000)
First encoded sequence: [0 0 0 ... 0 0 1]


In [18]:
np.array(labels)

array([['Sensitive'],
       ['Resistat'],
       ['Sensitive'],
       ['Sensitive'],
       ['Sensitive'],
       ['Resistant'],
       ['Sensitive'],
       ['Sensitive'],
       ['Sensitive'],
       ['Resistant'],
       ['Sensitive'],
       ['Resistant'],
       ['Sensitive'],
       ['Resistant'],
       ['Sensitive'],
       ['Sensitive'],
       ['Resistant'],
       ['Resistant'],
       ['Resistant'],
       ['Sensitive'],
       ['Sensitive'],
       ['Resistant'],
       ['Resistant'],
       ['Sensitive'],
       ['Sensitive'],
       ['Resistant'],
       ['Resistant'],
       ['Resistant'],
       ['Resistant'],
       ['Resistant'],
       ['Resistant']], dtype='<U9')

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import KFold

import statistics

# subset_selection=3500000


# Split the data into training and testing sets
if len(encoded_sequences) > 0 and len(labels) > 0:
    X_train, X_test, y_train, y_test = train_test_split(encoded_sequences[:,:], np.array(labels)[:,:], test_size=0.2, random_state=42)

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [2, 4, 6],
        'max_depth': [1, 3, 6, 9],
        'min_samples_split': [2, 5, 10]

    }
    
    print(X_train.shape)

    # Convert y_train to a NumPy array and ensure it is 1D
    y_train = np.array(y_train).ravel()

    # Train a binary classification model using Random Forest
    
    
    kf = KFold(n_splits=5)
    
    ls_accuracy=[]
    ls_precision=[]
    ls_recall=[]
    ls_f1=[]
    ls_roc_auc=[]
    
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        
        print("#####################################################")
        print("----- CV --- ",(i+1))
        
        clf = RandomForestClassifier(random_state=42)
        
        X_train_cv=X_train[train_index]
        X_test_cv=X_train[test_index]
        
        y_train_cv=y_train[train_index]
        y_test_cv=y_train[test_index]

        # Perform grid search
        grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
        grid_search.fit(X_train_cv, y_train_cv)

        # Print best parameters and best score
        print("Best parameters found: ", grid_search.best_params_)
        print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

        # Use the best estimator from grid search
        best_clf = grid_search.best_estimator_
        
        y_pred = best_clf.predict(X_test_cv)
        y_proba = best_clf.predict_proba(X_test_cv)[:, 1]

        # Calculate and print performance metrics
        accuracy = accuracy_score(y_test_cv, y_pred)
        
        precision = precision_score(y_test_cv, y_pred, pos_label='Sensitive')
        recall = recall_score(y_test_cv, y_pred, pos_label='Sensitive')
        f1 = f1_score(y_test_cv, y_pred, pos_label='Sensitive')
        
        roc_auc=0
        if ((np.unique(y_test_cv).shape[0]>1) and (np.unique(y_proba).shape[0]>1)):
            roc_auc = roc_auc_score(y_test_cv, y_proba)
        
        print("Accuracy :",accuracy)
        print("***************************")
        
        ls_accuracy.append(accuracy)
        ls_precision.append(precision)
        ls_recall.append(recall)
        ls_f1.append(f1)
        ls_roc_auc.append(roc_auc)
    
    print("--------------------------------------------------")
    print("Mean accuracy: ", np.mean(np.array(ls_accuracy)))
    print("Stdev accuracy: ",statistics.stdev(ls_accuracy))
    print("--------------------------------------------------")
    print("Mean precision: ", np.mean(np.array(ls_precision)))
    print("Stdev precision: ",statistics.stdev(ls_precision))
    print("--------------------------------------------------")
    print("Mean recall: ", np.mean(np.array(ls_recall)))
    print("Stdev recall: ",statistics.stdev(ls_recall))
    print("--------------------------------------------------")
    print("Mean f1: ", np.mean(np.array(ls_f1)))
    print("Stdev f1: ",statistics.stdev(ls_f1))
    print("--------------------------------------------------")
    print("Mean roc_auc: ", np.mean(np.array(ls_roc_auc)))
    print("Stdev roc_auc: ",statistics.stdev(ls_roc_auc))
    
    

(24, 4000000)
#####################################################
----- CV ---  1
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'max_depth': 1, 'min_samples_split': 10, 'n_estimators': 4}
Best cross-validation score: 0.58
Accuracy : 0.6
***************************
#####################################################
----- CV ---  2
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'max_depth': 1, 'min_samples_split': 10, 'n_estimators': 2}
Best cross-validation score: 0.63


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy : 0.2
***************************
#####################################################
----- CV ---  3
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 2}
Best cross-validation score: 0.62
Accuracy : 0.4
***************************
#####################################################
----- CV ---  4
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 2}
Best cross-validation score: 0.65
Accuracy : 0.2
***************************
#####################################################
----- CV ---  5
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found:  {'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 2}
Best cross-validation score: 0.45
Accuracy : 0.5
***************************
--------------------------------------------------
Mean accuracy:  0.38
Stdev 

In [None]:
fts_imp=best_clf.feature_importances_

In [None]:
fts_imp.shape

(4000000,)

In [None]:
fts_imp.max()

0.125

In [None]:
[_ for _ in fts_imp if _>0]

[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]

In [None]:
imp_index=np.where(fts_imp>0)

In [None]:
imp_index

(array([1494302, 1527995, 1647964, 1761974, 2428465, 3043709, 3114454,
        3608978], dtype=int64),)

In [None]:
encoded_sequences.shape[1]/4

1000000.0

In [None]:
counter=0
i=0

ls_i=[]

while(counter<encoded_sequences.shape[1]):
    
    counter_old=counter
    counter=counter+4
    
    for g in range(4):
        # print("n")
        _
    
    for k in imp_index[0]:

        if (k>=counter_old and k<=counter):
            ls_i.append(i)
            
    i=i+1
            
    

In [None]:
ls_i

[373575, 381998, 411990, 411991, 440493, 607116, 760927, 778613, 902244]

In [None]:
dict_f={}
for f in ls_i:
    dict_f[f]=[]
    
for seq_dict in df_dict:
    if seq_dict['sequence_id'] in labels_dict:
        for f in ls_i:
            dict_f[f].append(seq_dict['sequence'].upper()[f])

In [None]:
display(dict_f)

{373575: ['A',
  'T',
  'T',
  'T',
  'G',
  'A',
  'C',
  'A',
  'T',
  'T',
  'G',
  'T',
  'A',
  'C',
  'A',
  'T',
  'A',
  'A',
  'T',
  'A',
  'T',
  'A',
  'T',
  'T',
  'A',
  'A',
  'A',
  'A',
  'A',
  'A',
  'A',
  'G',
  'A'],
 381998: ['A',
  'A',
  'T',
  'A',
  'T',
  'C',
  'G',
  'A',
  'T',
  'T',
  'A',
  'G',
  'T',
  'A',
  'T',
  'T',
  'G',
  'G',
  'C',
  'T',
  'A',
  'A',
  'C',
  'A',
  'A',
  'T',
  'A',
  'A',
  'C',
  'G',
  'A',
  'T',
  'A'],
 411990: ['A',
  'A',
  'T',
  'T',
  'C',
  'T',
  'T',
  'A',
  'C',
  'C',
  'T',
  'G',
  'T',
  'A',
  'C',
  'C',
  'C',
  'G',
  'T',
  'T',
  'A',
  'T',
  'T',
  'T',
  'T',
  'T',
  'A',
  'T',
  'T',
  'A',
  'A',
  'A',
  'C'],
 411991: ['T',
  'T',
  'T',
  'T',
  'A',
  'T',
  'A',
  'T',
  'T',
  'A',
  'T',
  'T',
  'T',
  'T',
  'A',
  'A',
  'T',
  'C',
  'T',
  'G',
  'C',
  'T',
  'T',
  'C',
  'T',
  'A',
  'T',
  'T',
  'T',
  'T',
  'T',
  'T',
  'G'],
 440493: ['T',
  'T',
  'C',
  'T',
  'G

In [None]:
pd.DataFrame.from_dict(dict_f)

Unnamed: 0,373575,381998,411990,411991,440493,607116,760927,778613,902244
0,A,A,A,T,T,T,A,A,A
1,T,A,A,T,T,T,A,A,T
2,T,T,T,T,C,A,T,T,T
3,T,A,T,T,T,A,T,T,T
4,G,T,C,A,G,C,C,T,G
5,A,C,T,T,T,T,A,C,A
6,C,G,T,A,A,A,G,C,C
7,A,A,A,T,T,T,A,T,T
8,T,T,C,T,T,A,T,A,G
9,T,T,C,A,G,A,G,T,T


In [None]:
#To print the file out as a csv
# Specify the file name
file_name = 'pd.DataFrame.from_dict(dict_f)'

# Write the DataFrame to a CSV file
df.to_csv(file_name, index=False)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
import seaborn as sns
import matplotlib.pyplot as plt


# Load dataset
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, labels, test_size=0.3, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_

# Create a DataFrame for visualization
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Plotting the feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('Feature Importance from Random Forest')
plt.show()


In [None]:
print (df)

          sequence_id                                           sequence
0   ERR012226_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
1   ERR012227_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
2   ERR012228_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
3   ERR012232_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
4   ERR012260_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
5   ERR012269_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
6   ERR012274_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
7   ERR012277_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
8   ERR012287_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
9   ERR012311_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
10  ERR012317_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
11  ERR012328_1.fastq  TGAACCCTaaaacctaaaccctaaaccctaaaccctgaaccctaaa...
12  ERR012330_1.fastq  TGAACCCTaaaacctaaaccctaaaccc

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode labels to binary format (0 and 1)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train the model using encoded labels
clf.fit(X_train, y_train_encoded)

# Predict using encoded labels
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Calculate performance metrics
accuracy = accuracy_score(y_test_encoded, y_pred)
precision = precision_score(y_test_encoded, y_pred)
recall = recall_score(y_test_encoded, y_pred)
f1 = f1_score(y_test_encoded, y_pred)
roc_auc = roc_auc_score(y_test_encoded, y_proba)

# Print performance metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC: {roc_auc}')


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Accuracy: 0.8571428571428571
Precision: 0.8571428571428571
Recall: 1.0
F1 Score: 0.9230769230769231
ROC AUC: 0.6666666666666667
