# DNA Reconstruction
## Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import joblib

## Load and Prepare the Data

In [2]:
# Location of the DNA sequence CSV (raw file hosted on GitHub)
data_url = 'https://raw.githubusercontent.com/shahriar374/futurelight-ml-data/refs/heads/main/dna_seq_full.csv'

# Read the CSV and give its columns descriptive labels in a single chained step
data = pd.read_csv(data_url).set_axis(
    ['Sequence 1', 'Sequence 2', 'New Sequence'],
    axis=1,
)

# Encode the sequences using one-hot encoding
def one_hot_encode_dna(sequence, max_len):
    """One-hot encode a DNA string into a flat, fixed-length vector.

    Each base becomes four values in A, C, G, T order. Sequences shorter than
    ``max_len`` are right-padded with all-zero rows; longer sequences are cut
    down to their first ``max_len`` bases.

    Bases are matched case-insensitively. Any symbol that is not A, C, G or T
    (for example the ambiguity code ``N``) is encoded as an all-zero row
    instead of raising ``KeyError``.

    Args:
        sequence: String of nucleotide symbols.
        max_len: Number of bases to encode.

    Returns:
        1-D ``numpy`` array holding ``max_len * 4`` values.
    """
    mapping = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]}
    blank = [0, 0, 0, 0]  # used both for padding and for unrecognised symbols

    # Only the first max_len bases can survive truncation, so slice before
    # encoding instead of encoding the whole sequence and discarding the tail.
    # upper() is applied per base so the encoded length always equals the
    # number of input symbols.
    encoded = [mapping.get(base.upper(), blank) for base in sequence[:max_len]]

    # Right-pad short sequences up to max_len rows
    if len(encoded) < max_len:
        encoded += [blank] * (max_len - len(encoded))

    return np.array(encoded).reshape(-1)

# Fixed number of bases every sequence is padded or truncated to.
# NOTE(review): one_hot_encode_dna drops everything past max_len, so confirm
# that 10 really covers the longest sequence in the data.
max_len = 10  # Adjust based on the data

# One-hot encode the two input columns and the target column the same way
X1_encoded, X2_encoded, y = (
    np.array([one_hot_encode_dna(seq, max_len) for seq in data[column]])
    for column in ('Sequence 1', 'Sequence 2', 'New Sequence')
)

# Model input: the two encoded input sequences placed side by side
X = np.hstack((X1_encoded, X2_encoded))

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Train the Model

In [3]:
# Build a 100-tree random forest (seeded) and fit it on the training split
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Evaluate on the held-out split.
# NOTE(review): for a regressor, score() is documented as the R^2 coefficient
# of determination, not a classification accuracy, despite the label printed
# below -- confirm and consider relabelling.
test_score = model.score(X_test, y_test)
print(f"Test Accuracy: {test_score}")

# Persist the fitted model to disk; kept as the cell's last expression so the
# return value of joblib.dump is still displayed as the cell output.
joblib.dump(model, 'dna_reconstruction_model.pkl')

Test Accuracy: 0.723861402505873


['dna_reconstruction_model.pkl']

## Verify the model

In [6]:
# Make a test prediction: run the fitted model on every row of the held-out X_test
test_prediction = model.predict(X_test)
# Print the raw predicted array. The recorded output contains fractional
# values (e.g. 0.23, 0.45), so these are not yet hard 0/1 one-hot entries.
print(test_prediction)

[[0.   0.   1.   ... 0.23 0.45 0.19]
 [0.   0.   1.   ... 0.   0.   0.  ]
 [0.   0.   1.   ... 0.   1.   0.  ]
 ...
 [0.   0.   1.   ... 0.   1.   0.  ]
 [0.   0.   1.   ... 0.   1.   0.  ]
 [0.   0.   1.   ... 0.   0.   1.  ]]
