# 1D CNN Homework

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np

## Data Preprocessing

In [2]:
train_data ='train_features.csv'
test_data ='test_features.csv'
labels = 'train_labels.csv'

In [3]:
def load_and_preprocess_data(filename):
    df = pd.read_csv(filename)
    df['data'] = df['data'].str.replace('[^ACGT]', 'N')
    char_to_int = {'A': 1, 'C': 2, 'G': 3, 'T': 4, 'N': 0}

    data_array = []

    for data_str in df['data']:
        sequence = []
        for char in data_str:
            try:
                sequence.append(char_to_int[char])
            except KeyError: # There are some unexpected characters in the data
                continue

        data_array.append(torch.tensor(sequence))
    padded_sequences = pad_sequence(data_array, batch_first=True)

    return padded_sequences

In [4]:
train_tensor = load_and_preprocess_data(train_data)
test_tensor = load_and_preprocess_data(test_data)

  df['data'] = df['data'].str.replace('[^ACGT]', 'N')
  df['data'] = df['data'].str.replace('[^ACGT]', 'N')


In [5]:
train_tensor.shape

torch.Size([16969, 1058])

In [6]:
test_tensor = F.pad(test_tensor, (0, 196), 'constant', 0)

In [7]:
test_tensor.shape

torch.Size([4243, 1058])

In [8]:
def one_hot_encode(sequence, num_classes=5):
    # One-hot encode
    return F.one_hot(sequence, num_classes=num_classes).float()

In [9]:
train_one_hot_encoded = one_hot_encode(train_tensor)
test_one_hot_encoded = one_hot_encode(test_tensor)

In [10]:
torch.flatten(train_one_hot_encoded, start_dim=1).shape

torch.Size([16969, 5290])

In [11]:
print(train_one_hot_encoded.shape)
print(test_one_hot_encoded.shape)

torch.Size([16969, 1058, 5])
torch.Size([4243, 1058, 5])


## DataLoader

In [12]:
class SequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        label = self.labels[idx]
        return torch.tensor(sequence, dtype=torch.float), torch.tensor(label, dtype=torch.long)

In [13]:
labels_df = pd.read_csv('train_labels.csv')

In [14]:
num_classes = len(labels_df['labels'])

In [15]:
num_classes

16969

## Model

In [16]:
class CNN1D(nn.Module):
    def __init__(self, num_classes):
        super(CNN1D, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=5, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=5, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(2,2)
        self.fc1 = nn.Linear(1320, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))

        x = torch.flatten(x, start_dim=1)
        x = self.dropout(x)
        # print(x.shape)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x # a softmax for the output is not necessary since I am using nn.CrossEntropyLoss()

In [25]:
model = CNN1D(num_classes=num_classes)
train_set = SequenceDataset(train_one_hot_encoded, labels_df['labels'].values)
trainloader = DataLoader(train_set, batch_size=32, shuffle=True)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

## Training Loop

In [26]:
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for sequences, labels in trainloader:
        # Forward pass
        outputs = model(sequences.permute(0, 2, 1))
        # print(labels.shape)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if loss.item() < 0.05:
            break

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')



  return torch.tensor(sequence, dtype=torch.float), torch.tensor(label, dtype=torch.long)


Epoch [1/100], Loss: 5.7931060791015625
Epoch [2/100], Loss: 4.692102432250977
Epoch [3/100], Loss: 1.4106229543685913
Epoch [4/100], Loss: 2.107818126678467
Epoch [5/100], Loss: 1.018846869468689
Epoch [6/100], Loss: 0.3859853148460388
Epoch [7/100], Loss: 0.15759047865867615
Epoch [8/100], Loss: 0.9121363162994385
Epoch [9/100], Loss: 0.7778366208076477
Epoch [10/100], Loss: 0.03972068056464195


KeyboardInterrupt: 

In [39]:
torch.save(model.state_dict(), 'model_friday_mar_8.pth')

## Prediction

In [40]:
prediction_model = CNN1D(num_classes=num_classes)
prediction_model.load_state_dict(torch.load('model_friday_mar_8.pth'))

prediction_model.eval()

outputs = prediction_model(test_one_hot_encoded.permute(0, 2, 1))

_, preds_tensor = torch.max(outputs, 1)

preds = np.squeeze(preds_tensor.cpu().numpy())




In [41]:
predictions = pd.DataFrame(preds)

In [42]:
predictions['ids'] = predictions.index

In [43]:
cols = predictions.columns.tolist()
cols[0], cols[1] = cols[1], cols[0]
predictions = predictions[cols]

In [45]:
predictions.to_csv("predicted_test_labels.csv")

In [44]:
predictions

Unnamed: 0,ids,0
0,0,715
1,1,1147
2,2,141
3,3,255
4,4,799
...,...,...
4238,4238,1162
4239,4239,558
4240,4240,777
4241,4241,951
