# Predicting Disease from DNA Sequence

Importing all libraries

In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Reading the data from CSV files hosted on GitHub

In [4]:
# Raw GitHub locations of the training and test CSV files
train_url = "https://raw.githubusercontent.com/shahriar374/futurelight-ml-data/refs/heads/main/disease_detection_dataset_train.csv"
test_url = "https://raw.githubusercontent.com/shahriar374/futurelight-ml-data/refs/heads/main/disease_detection_dataset_test.csv"

# Download both CSVs into DataFrames (training file first, then test file)
train_data, test_data = [pd.read_csv(csv_url) for csv_url in (train_url, test_url)]

# Preview the first rows of the training set as plain text
print(train_data.head())


   DNA Sequence    Associated Disease
0  ACGTACGTAGCA       Cystic Fibrosis
1  TGCATGCATGCA    Sickle Cell Anemia
2  GATCGATCGTAG  Huntington's Disease
3  CGTACGTACGTA    Fragile X Syndrome
4  ATCGATCGTAGC      Beta-Thalassemia


## Function to One-Hot Encode DNA Sequences

In [10]:
# Function to One-Hot Encode DNA Sequences and pad/truncate them to fixed length
def one_hot_encode_dna(sequence, max_len):
    """One-hot encode a DNA sequence into a fixed-size ``(max_len, 4)`` array.

    Each position becomes a 4-element row whose columns correspond to the
    bases A, C, G and T, in that order.

    Parameters
    ----------
    sequence : str or sequence of str
        The DNA bases to encode. Matching is case-insensitive, so ``"acgt"``
        and ``"ACGT"`` produce the same result.
    max_len : int
        Number of rows in the output. Longer sequences are truncated to the
        first ``max_len`` bases; shorter ones are padded with all-zero rows.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(max_len, 4)``.

    Notes
    -----
    Characters other than A/C/G/T (for example the ambiguity code ``N``) are
    encoded as an all-zero row, the same representation used for padding,
    instead of raising ``KeyError``.
    """
    column_of_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    # Start from an all-zero matrix: padding rows and unknown bases then need
    # no special handling, and the shape is (max_len, 4) even for empty input.
    encoded = np.zeros((max_len, 4), dtype=int)

    # Slicing performs the truncation; only the first max_len bases are visited.
    for position, base in enumerate(sequence[:max_len]):
        column = column_of_base.get(base.upper())
        if column is not None:
            encoded[position, column] = 1

    return encoded

# The shared encoding length is the longest sequence found in either split,
# so no sequence from the training or test set gets truncated.
longest_train_sequence = train_data['DNA Sequence'].map(len).max()
longest_test_sequence = test_data['DNA Sequence'].map(len).max()
max_len = max(longest_train_sequence, longest_test_sequence)


## Apply One-Hot Encoding to all DNA sequences

In [12]:
# Encode every training sequence at the shared length, then flatten each
# per-sequence matrix into a single feature row (one row per sample).
train_encoded_sequences = np.array(
    [one_hot_encode_dna(seq, max_len) for seq in train_data['DNA Sequence']]
).reshape(len(train_data), -1)

# Same treatment for the test sequences, so both splits share one feature layout.
test_encoded_sequences = np.array(
    [one_hot_encode_dna(seq, max_len) for seq in test_data['DNA Sequence']]
).reshape(len(test_data), -1)

## Label Encode the diseases

In [13]:
# Learn the disease-name -> integer mapping from the training labels only,
# then convert those same labels into their integer codes.
le = LabelEncoder().fit(train_data['Associated Disease'])
train_encoded_diseases = le.transform(train_data['Associated Disease'])

## Model training

In [14]:
# Random forests are stochastic (bootstrap sampling and feature sub-sampling).
# Pinning the seed makes a fresh "Restart & Run All" rebuild the same forest,
# so the reported accuracy and the saved model are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(train_encoded_sequences, train_encoded_diseases)

## Save the model

In [23]:
# Persist the fitted classifier and the label encoder side by side; both are
# needed later to turn a raw prediction back into a disease name.
# NOTE(review): max_len is not saved here; inference code that reloads these
# files presumably needs the same value. Confirm how it is provided.
joblib.dump(model, filename='dna_disease_model.pkl')
joblib.dump(le, filename='label_encoder.pkl')

['label_encoder.pkl']

## Testing the model

In [16]:
# Predict an integer class code for every encoded test sequence
y_pred = model.predict(test_encoded_sequences)
# Encode the true test labels with the encoder fitted on the training labels,
# so they are comparable with the predicted codes
test_encoded_diseases = le.transform(test_data['Associated Disease'])

## Evaluate the model

In [22]:
# Fraction of test sequences whose predicted code equals the true code
accuracy = accuracy_score(y_true=test_encoded_diseases, y_pred=y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.7708333333333334


## Predicting from user input

In [19]:
# Example: Predict on a new DNA sequence (from user input)
def predict_disease(dna_sequence):
    """Predict the disease name associated with a single DNA sequence.

    Relies on the notebook-level ``model``, ``le`` and ``max_len`` objects
    created in earlier cells.

    Parameters
    ----------
    dna_sequence : str
        DNA bases, e.g. ``"ACGTACGTAGCA"``. Whitespace anywhere in the string
        is removed and letters are upper-cased before encoding, so pasted or
        lower-case user input is accepted.

    Returns
    -------
    The disease label decoded by ``le`` for the class predicted by ``model``.

    Notes
    -----
    The sequence is padded or truncated to ``max_len`` bases by
    ``one_hot_encode_dna``; bases beyond ``max_len`` do not influence the
    prediction.
    """
    # Normalise free-form user input: drop spaces/newlines and force upper case
    # so it matches the upper-case bases the encoder looks up.
    cleaned_sequence = "".join(dna_sequence.split()).upper()

    # The encoder already returns an ndarray; flatten it into one feature row,
    # which is the layout the model was trained on.
    encoded_sequence = one_hot_encode_dna(cleaned_sequence, max_len).reshape(1, -1)

    # Predict the integer class code, then map it back to the disease name.
    predicted_label = model.predict(encoded_sequence)
    predicted_disease = le.inverse_transform(predicted_label)
    return predicted_disease[0]


## Example Prediction

In [21]:
# Sample sequence; it also appears as row 0 of the training-data preview,
# labelled "Cystic Fibrosis" there.
dna_input = "ACGTACGTAGCA"
# Run the full encode -> predict -> decode path and report the disease name
print(f"Predicted disease for {dna_input}: {predict_disease(dna_input)}")

Predicted disease for ACGTACGTAGCA: Cystic Fibrosis
