In [65]:
import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
#import matplotlib.pyplot as plt
import numpy as np

In [66]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device
CLASSES = 2

In [67]:
class AlexNetCNN(nn.Module):
    def __init__(self):
        super().__init__()
        #Layer1
        self.conv1 = nn.Conv1d(in_channels=4,out_channels=96,kernel_size=11,stride=4)
        
        #Layer2
        self.conv2 = nn.Conv1d(in_channels=96,out_channels=96,kernel_size=1)
        
        #Layer3
        self.conv3 = nn.Conv1d(in_channels=96,out_channels=96,kernel_size=1)
        self.pool1 = nn.MaxPool1d(kernel_size=3,stride=2)
        
        #Layer4
        self.conv4 = nn.Conv1d(in_channels=96,out_channels=256,kernel_size=11,stride=4,padding=2)
        
        #Layer5
        self.conv5 = nn.Conv1d(in_channels=256,out_channels=256,kernel_size=1)
        
        #Layer6
        self.conv6 = nn.Conv1d(in_channels=256,out_channels=256,kernel_size=1)
        self.pool2 = nn.MaxPool1d(kernel_size=3,stride=2)
        
        #Layer7
        self.conv7 = nn.Conv1d(in_channels=256,out_channels=384,kernel_size=3,stride=1,padding=1)
        
        #Layer8
        self.conv8 = nn.Conv1d(in_channels=384,out_channels=384,kernel_size=1)
        
        #Layer9
        self.conv9 = nn.Conv1d(in_channels=384,out_channels=384,kernel_size=1)
        
        #Layer10
        self.fc1 = nn.Linear(in_features=(384 * 15),out_features=4096)
        
        #Layer11
        self.fc2 = nn.Linear(in_features=(4096),out_features=4096)
        
        #Layer12
        self.fc3 = nn.Linear(in_features=(4096),out_features=CLASSES)
        
    def forward(self, x):
        #Layer1
        x = F.relu(self.conv1(x))
        
        #Layer2
        x = F.relu(self.conv2(x))
        
        #Layer3
        x = F.relu(self.conv3(x))
        x = self.pool1(x)
        x = F.dropout(x, 0.5)
        
        #Layer4
        x = F.relu(self.conv4(x))
        
        #Layer5
        x = F.relu(self.conv5(x))
        
        #Layer6
        x = F.relu(self.conv6(x))
        x = self.pool2(x)
        x = F.dropout(x, 0.5)
        
        #Layer7
        x = F.relu(self.conv7(x))
        
        #Layer8
        x = F.relu(self.conv8(x))
        
        #Layer9
        x = F.relu(self.conv9(x))
        x = F.dropout(x, 0.5)

        x = torch.flatten(x, 1)
        
        #Layer10
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5)
        
        #Layer11
        x = F.relu(self.fc2(x))
        x = F.dropout(x, 0.5)
        
        #Layer12
        x = F.relu(self.fc3(x))
        
        return x

In [68]:
def analyze_data(input_file):
    sequence_length = None
    num_sequences = 0

    with open(input_file, 'r') as file:
        for line in file:
            line = line.strip().split()
            if sequence_length is None:
                sequence_length = len(line[0])
            elif len(line[0]) != sequence_length:
                print("Error: Sequences have different lengths.")
                return

            num_sequences += 1

    return num_sequences, sequence_length

In [69]:
# Reading in and encoding data
def one_hot(input_file, seq_len, num_seq):
    one_hot_data = np.zeros((num_seq, 4, seq_len), dtype=int)
    seq_index = 0

    with open(input_file, 'r') as inf:
        sequences = list(i.split('\t')[0] for i in inf)

    for sequence in sequences:
            
        sequence = sequence.strip()
        char_index = 0
        
        for char in sequence:
            if char == 'a' or char == 'A':
                one_hot_data[seq_index, 0, char_index] = 1
            elif char == 'c' or char == 'C':
                one_hot_data[seq_index, 1, char_index] = 1
            elif char == 'g' or char == 'G':
                one_hot_data[seq_index, 2, char_index] = 1
            elif char == 't' or char == 'T':
                one_hot_data[seq_index, 3, char_index] = 1
            else:
                print("encountered non-acgt")
            
            char_index += 1
        seq_index += 1

    return one_hot_data

In [70]:
! pwd
! ls CS_590_HW/p2/enc/

/users/sdelozi
ENCFF027BPY_pos_enc.txt  ex_enc.txt  FinalFileSmall.txt  T-cell-test.tsv


In [71]:
filein = "/users/sdelozi/CS_590_HW/p2/enc/T-cell-test.tsv"
num_seq, seq_len = analyze_data(filein)
enc_data = one_hot(filein, seq_len, num_seq)
with open(filein, 'r') as inf:
    labels = np.array(list(float(i.split('\t')[1][:1]) for i in inf))

# 60/40 split
enc_train_data, enc_test_data = np.split(enc_data, [int(0.6 * len(enc_data))])
label_train, label_test = np.split(labels, [int(0.6 * len(labels))])
print(enc_data, enc_train_data, enc_test_data)

[[[0 0 0 ... 0 1 0]
  [1 0 0 ... 1 0 1]
  [0 1 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]]

 [[0 0 0 ... 1 0 0]
  [0 1 1 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [1 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 1 0]
  [1 0 0 ... 1 0 1]
  [0 1 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]]

 ...

 [[0 0 0 ... 1 0 0]
  [0 1 1 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [1 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 1 0]
  [1 0 0 ... 1 0 1]
  [0 1 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]]

 [[0 0 0 ... 1 0 0]
  [0 1 1 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [1 0 0 ... 0 0 1]]] [[[0 0 0 ... 0 1 0]
  [1 0 0 ... 1 0 1]
  [0 1 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]]

 [[0 0 0 ... 1 0 0]
  [0 1 1 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [1 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 1 0]
  [1 0 0 ... 1 0 1]
  [0 1 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]]

 ...

 [[0 0 0 ... 1 0 0]
  [0 1 1 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [1 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 1 0]
  [1 0 0 ... 1 0 1]
  [0 1 0 ... 0 0 0]
  [0 0 1 ... 0 0 0]]

 [[0 0 0 ... 1 0 0]
  [0 1 1 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [1 0 0 ... 0 0 1]]] [[[0

In [72]:
# Converting the data to tensor type and floating point type

tensor_train_data = torch.from_numpy(enc_train_data).float()
tensor_test_data = torch.from_numpy(enc_test_data).float()

In [73]:
print(tensor_train_data.shape)
print(tensor_test_data.dtype)

torch.Size([60, 4, 1066])
torch.float32


In [74]:

# The most important class, a custom data loader

class data_class(Dataset):
    def __init__(self,data,label):
        self.data=data
        self.labels=torch.tensor(label)

    def __len__(self):
        return len(self.data) 
    
    def __getitem__(self,id):
        data_set=self.data[id]
        labels=self.labels[id]

        return data_set,labels

In [75]:
# calling the data_class for the raw enc data

train_data=data_class(tensor_train_data,label_train)
test_data=data_class(tensor_test_data,label_test)

In [76]:
# Creating the data loader which is going to load the data to the AI model

train_dataloader=DataLoader(train_data,batch_size=10,shuffle=True)
test_dataloader=DataLoader(test_data,batch_size=10,shuffle=True)

In [77]:
# Instantiating the model and assigning an optimizer to the model and creating a loss function

model=AlexNetCNN().to(device)
optimizer=optim.Adam(params=model.parameters(),lr=0.0001)
loss_fn = nn.CrossEntropyLoss()

In [78]:
def train(model,device,train_dataloader,optimizer,epochs):
    print("inside train")
    model.train()
    for batch_ids, (seq, classes) in enumerate(train_dataloader):
        classes=classes.type(torch.LongTensor)
        seq,classes=seq.to(device),classes.to(device)
        torch.autograd.set_detect_anomaly(True)     
        optimizer.zero_grad()
        output=model(seq)
        loss = loss_fn(output,classes)                
        
        loss.backward()
        optimizer.step()
    if(batch_ids +1) % 2 == 0:
        print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
            epochs, batch_ids* len(seq), len(train_dataloader.dataset),
            100.*batch_ids / len(train_dataloader),loss.item()))

In [79]:
def test(model, device, test_dataloader):
    model.eval()
    test_loss=0
    correct=0
    with torch.no_grad():
        for seq,classes in test_dataloader:
            seq,classes=seq.to(device), classes.to(device)
            y_hat=model(seq)
            test_loss+=F.nll_loss(y_hat,classes.long(),reduction='sum').item()
            _,y_pred=torch.max(y_hat,1)
            correct+=(y_pred==classes).sum().item()
        test_loss/=len(test_dataloader)
        print("\n Test set: Avarage loss: {:.0f},Accuracy:{}/{} ({:.0f}%)\n".format(
            test_loss,correct,len(test_data),100.*correct/len(test_data)))
        print('='*30)

In [80]:
if __name__=='__main__':
    seed=42
    EPOCHS=10
    
    for epoch in range(1,EPOCHS+1):
        train(model,device,train_dataloader,optimizer,epoch)
        test(model,device,test_dataloader)

inside train

 Test set: Avarage loss: -0,Accuracy:19/40 (48%)

inside train

 Test set: Avarage loss: 0,Accuracy:20/40 (50%)

inside train

 Test set: Avarage loss: 0,Accuracy:20/40 (50%)

inside train

 Test set: Avarage loss: 0,Accuracy:20/40 (50%)

inside train

 Test set: Avarage loss: 0,Accuracy:20/40 (50%)

inside train

 Test set: Avarage loss: 0,Accuracy:20/40 (50%)

inside train

 Test set: Avarage loss: 0,Accuracy:20/40 (50%)

inside train


KeyboardInterrupt: 