In [41]:
import os
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [42]:
def merge_dna_files(positive_file, negative_file):
    with open(positive_file, 'r') as pos_file, open(negative_file, 'r') as neg_file:
        pos_lines = pos_file.readlines()
        neg_lines = neg_file.readlines()

    # Determine the file with the lowest number of lines
    min_lines = min(len(pos_lines), len(neg_lines))

    # Merge the files alternately
    merged_lines = []
    for i in range(min_lines):
        if(i!=0):
            merged_lines.append(f'1 {pos_lines[i].strip()}')  # Label positive lines with 1
            merged_lines.append(f'0 {neg_lines[i].strip()}')  # Label negative lines with 0

    # Write merged lines to a new file
    
    output_file = os.path.join('Merge',"merged_file.txt")
    with open(output_file, 'w') as merged_file:
        merged_file.write('\n'.join(merged_lines))
    return merged_lines

In [43]:
#Figuring out all the file situation and merging it
directory = "Output/Access"
neg_directory = "Output/NonAccess"
files = os.listdir(directory)
trim_line = 350 
final_lines = []
for file in files:
    if (file.endswith(".csv")):
        positive_file = file
        negative_file = file.replace("_output.csv","_in_output.csv")
        positive_directory = os.path.join(directory,positive_file)
        negative_directory = os.path.join(neg_directory,negative_file)
        merged_lines = merge_dna_files(positive_directory,negative_directory)
        print (len(merged_lines[1].split()[1]))
        final_lines = final_lines + merged_lines

194
204
196
190
225
182
178
196
191
187


In [44]:
i = 0
cleaned_lines = []
for line in final_lines:
    try:
        atgc = line.split()[1]
        new_atgc = atgc[:178]
        line = final_lines[i].replace(atgc,new_atgc)
        cleaned_lines.append(line)
    except IndexError:
        pass
    i = i + 1


In [45]:
print(len(cleaned_lines))

179382


In [46]:
#One-hot-encoding stuff
import numpy as np

bases = 'ATGC'
one_hot_encode = []
max_seq_length = max(len(line.split(' ', 1)[1]) for line in cleaned_lines)
for line in cleaned_lines:
        label, dna_sequence = line.split(' ', 1)
        label = int(label)
        dna_one_hot = np.zeros((4, max_seq_length),dtype=np.float64)
        for i, base in enumerate(dna_sequence):
            if base.upper() in bases:
                idx = bases.index(base.upper())
                dna_one_hot[idx, i] = 1.0
        one_hot_encode.append((label, dna_one_hot))


In [47]:
print(one_hot_encode[2])

(1, array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0.,
        1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0.,

In [77]:
#Splitting the data into train and test
split_index = int(len(one_hot_encode)*0.8)

train_data_raw = one_hot_encode[:split_index]
test_data_raw = one_hot_encode[split_index:]

#Making the data less to figure stuff out
train_data_raw_main = one_hot_encode[:800]
test_data_raw_main = one_hot_encode[800:1000]

print(len(train_data_raw),len(test_data_raw))


train_data = [sublist[1] for sublist in train_data_raw]
train_data = np.array(train_data)
test_data = [sublist[1] for sublist in test_data_raw]
test_data = np.array(test_data)
train_labels = [sublist[0] for sublist in train_data_raw]
test_labels = [sublist[0] for sublist in test_data_raw]


143505 35877


In [78]:
#The most important class. Creates a custom dataset
class data_class(Dataset):
    def __init__(self,data,label):
        data= torch.tensor(data)
        self.data = data
        labels=torch.tensor(label,dtype=torch.double)
        self.labels = labels
        

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,id):
        data_set=self.data[id]
        labels=self.labels[id]

        return data_set,labels

In [79]:
train_data_main = data_class(train_data,train_labels)
test_data_main = data_class(test_data,test_labels)

first_data, first_label = train_data_main.__getitem__(0)

In [80]:
print(first_data,first_label)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
         0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 1.

In [81]:
# Creating the data loader which is going to load the data to the AI model

train_dataloader=DataLoader(train_data_main,batch_size=2,shuffle=True)
test_dataloader=DataLoader(test_data_main,batch_size=2,shuffle=True)
for data,label in train_dataloader:
    print(label[0])
    break

tensor(0., dtype=torch.float64)


In [82]:
for data,label in train_data_main:
    print(label)
    break

tensor(1., dtype=torch.float64)


In [83]:
class MiniCNN(torch.nn.Module):
    def __init__(self):
        super(MiniCNN, self).__init__()
        self.conv1 = torch.nn.Conv1d(4, 6, kernel_size=5)
        self.pool = torch.nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = torch.nn.Conv1d(6, 16, kernel_size=3)
        self.fc1 = torch.nn.Linear(16 * 42, 120)  # Input size calculated manually
        self.fc2 = torch.nn.Linear(120, 2)

    def forward(self, x):
        x = x.float()
        x = self.pool(torch.nn.functional.relu(self.conv1(x)))
        x = self.pool(torch.nn.functional.relu(self.conv2(x)))
        x = x.view(-1, 16 * 42)  # Reshape x to match the input size of fc1
        x = torch.nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [84]:
class AlexNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=4, out_channels=96, kernel_size=11, stride=4)
        self.local_response1 = nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2)
        self.pool1 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv1d(in_channels=96, out_channels=256, kernel_size=5, padding=2)
        self.local_response2 = nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2)
        self.pool2 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv1d(in_channels=256, out_channels=384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(in_channels=384, out_channels=384, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(in_channels=384, out_channels=256, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.fc1 = nn.Linear(in_features= 1024, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=4096)
        self.fc3 = nn.Linear(in_features=4096, out_features=4)

    def forward(self, x):
        x = x.float()
        x = self.pool1(F.relu(self.local_response1(self.conv1(x))))
        x = self.pool2(F.relu(self.local_response2(self.conv2(x))))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool5(F.relu(self.conv5(x)))
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5)
        x = F.relu(self.fc2(x))
        x = F.dropout(x, 0.5)
        x = self.fc3(x)
        return x


In [85]:
# Instantiating the model and assigning an optimizer to the model and creating a loss function
device=torch.device('cuda')
model=AlexNet().to(device)
optimizer=optim.Adam(params=model.parameters(),lr=0.0001)
loss_fn = nn.CrossEntropyLoss()
print(device)

cuda


In [86]:
def train(model,device,train_dataloader,optimizer,epochs):
    print("inside train")
    model.train()
    for batch_ids, (img, classes) in enumerate(train_dataloader):
        classes=classes.type(torch.LongTensor)
        img,classes=img.to(device),classes.to(device)
        torch.autograd.set_detect_anomaly(True)     
        optimizer.zero_grad()
        output=model(img)
        loss = loss_fn(output,classes)                
        
        loss.backward()
        optimizer.step()
    if(batch_ids +1) % 2 == 0:
        print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
            epochs, batch_ids* len(img), len(train_dataloader.dataset),
            100.*batch_ids / len(train_dataloader),loss.item()))

In [87]:
def test(model, device, test_dataloader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for img, classes in test_dataloader:
            img, classes = img.to(device), classes.to(device)
            y_hat = model(img)
            # Move classes to the same device as y_hat and convert its data type if necessary
            if y_hat.dtype != classes.dtype:
                classes = classes.to(y_hat.dtype)
            test_loss += F.nll_loss(y_hat, classes.long(), reduction='sum').item()
            _, y_pred = torch.max(y_hat, 1)
            correct += (y_pred == classes).sum().item()

    test_loss /= len(test_dataloader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_dataloader.dataset),
        100. * correct / len(test_dataloader.dataset)))


In [88]:
if __name__=='__main__':
    seed=42
    EPOCHS=1
    for epoch in range(1,EPOCHS+1):
        train(model,device,train_dataloader,optimizer,epoch)
        test(model,device,test_dataloader)

inside train

Test set: Average loss: -0.4258, Accuracy: 22700/35877 (63%)

