In [1]:
import sys

# Directory added to the import search path; presumably it contains
# GANs_model_1.py, which is star-imported right after this.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# location so the notebook runs on other machines.
model_dir = '/Users/anastasiiashcherbakova/git_projects/masters_project/plasmids/2_models'
sys.path.append(model_dir)

from GANs_model_1 import *
import torch
import numpy as np
from torch.utils.data import DataLoader

In [2]:
# Hyper-parameters used to rebuild the networks before loading saved weights.
max_length = 6000   # passed to Generator; generated samples have this many positions
vocab_size = 4      # channels per position (the decoder below maps them to A/T/C/G)
num_epochs = 100    # not used by any cell visible in this notebook
noise_dim = 100     # passed to Generator as its first argument
# Flattened size of one sample (positions * channels); the critic's first
# Linear layer reports in_features=24000, which matches this product.
input_size = max_length * vocab_size
# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate both networks and move them to the selected device.
generator = Generator(noise_dim, max_length, vocab_size).to(device)
critic = Critic(input_size).to(device)  

In [3]:
# Sanity check: show which class `critic` is an instance of before weights are loaded.
print(f"Type of critic before loading state dict: {type(critic)}")

Type of critic before loading state dict: <class 'GANs_model_1.Critic'>


In [4]:
# Restore the saved weights into both networks. `map_location` forces the
# checkpoint tensors onto the CPU while reading the files.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; the absolute Desktop paths are also machine-specific.
generator.load_state_dict(torch.load('/Users/anastasiiashcherbakova/Desktop/GANs_model_1_output/saved_base_generator_mod.pt', map_location=torch.device('cpu')))
critic.load_state_dict(torch.load('/Users/anastasiiashcherbakova/Desktop/GANs_model_1_output/saved_base_critic_mod.pt', map_location=torch.device('cpu')))

<All keys matched successfully>

In [5]:
# Put both networks in evaluation mode for inference (this turns off the
# Dropout layers that appear in the critic's printed structure).
generator.eval()
critic.eval()

Critic(
  (model): Sequential(
    (0): Linear(in_features=24000, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=256, bias=True)
    (4): LeakyReLU(negative_slope=0.2)
    (5): Linear(in_features=256, out_features=128, bias=True)
    (6): LeakyReLU(negative_slope=0.2)
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [6]:
# Read the noise width straight from the generator's first layer: the second
# dimension of its weight matrix is the number of input features it expects.
first_linear_layer = generator.model[0]

latent_dim = first_linear_layer.weight.shape[1]
num_samples = 1000

# Sample the noise on the same device as the generator. The generator was
# moved to `device` when it was built, so CPU-only noise would raise a
# device-mismatch error whenever CUDA is available.
random_latent_vectors = torch.randn(num_samples, latent_dim, device=device)

# Inference only: no gradients are needed for sampling.
with torch.no_grad():
    generated_data = generator(random_latent_vectors)

# Bring the result back to host memory as a NumPy array.
generated_data = generated_data.cpu().numpy()

# for i, sequence in enumerate(generated_data):
#     print(f"Sequence {i+1}: {sequence}")

In [7]:
# Inspect the array shape; the recorded output (1000, 6000, 4) matches
# (num_samples, max_length, vocab_size).
generated_data.shape

(1000, 6000, 4)

In [8]:
# Wrap the NumPy samples in a float32 tensor created directly on `device`.
generated_data_tensor = torch.tensor(
    generated_data,
    dtype=torch.float32,
    device=device,
)


In [9]:
# Confirm the tensor kept the same shape as the NumPy array it was built from.
generated_data_tensor.shape

torch.Size([1000, 6000, 4])

In [10]:
# Display the generated values.
# NOTE(review): this prints a 1000 x 6000 x 4 tensor repr; a small slice
# would keep the saved notebook output short.
generated_data_tensor

tensor([[[2.0834e-01, 2.1000e-01, 1.8734e-01, 3.9432e-01],
         [2.8949e-01, 1.9481e-01, 2.2155e-01, 2.9415e-01],
         [2.2642e-01, 2.9528e-01, 3.0901e-01, 1.6929e-01],
         ...,
         [4.5547e-01, 1.5534e-01, 1.9127e-01, 1.9791e-01],
         [4.1147e-01, 1.7227e-01, 1.9741e-01, 2.1885e-01],
         [4.1242e-01, 1.8735e-01, 1.8702e-01, 2.1322e-01]],

        [[2.1423e-01, 4.3143e-01, 1.6064e-02, 3.3827e-01],
         [8.7005e-02, 5.2664e-02, 5.1646e-01, 3.4387e-01],
         [7.5532e-01, 1.9749e-01, 3.9784e-02, 7.4133e-03],
         ...,
         [1.0000e+00, 1.5127e-07, 1.3712e-07, 3.2028e-09],
         [1.0000e+00, 7.7541e-09, 2.0147e-08, 1.4526e-07],
         [1.0000e+00, 1.6709e-08, 7.5519e-08, 1.2144e-07]],

        [[1.0257e-12, 2.7929e-05, 1.8980e-09, 9.9997e-01],
         [9.8974e-02, 1.3738e-07, 9.0099e-01, 4.0276e-05],
         [9.9356e-01, 1.3866e-06, 2.2711e-04, 6.2113e-03],
         ...,
         [1.0000e+00, 2.4374e-32, 1.7761e-33, 2.2182e-31],
         [

In [11]:
def tensor_to_sequences(tensor):
    """Decode a batch of (near) one-hot rows into nucleotide strings.

    Every value is rounded to the nearest integer, and each rounded row of
    four values is looked up in a fixed table: (1,0,0,0) -> 'A',
    (0,1,0,0) -> 'T', (0,0,1,0) -> 'C', (0,0,0,1) -> 'G'. A row that rounds
    to all zeros contributes no character, so a decoded string can be
    shorter than the number of rows in its sequence.

    Parameters
    ----------
    tensor : torch.Tensor
        Batch of sequences, iterated as sequence -> row, where each row holds
        four values (the call site in this notebook passes a
        (1000, 6000, 4) tensor). It may live on any device and may require
        grad; it is detached and copied to the CPU before conversion.

    Returns
    -------
    list of str
        One decoded string per sequence, in input order.

    Raises
    ------
    KeyError
        If a rounded row is not one of the five patterns in the table.
    """
    nucleotide_map = {
        (1, 0, 0, 0): 'A',
        (0, 1, 0, 0): 'T',
        (0, 0, 1, 0): 'C',
        (0, 0, 0, 1): 'G',
        (0, 0, 0, 0): '',
    }

    # Tensor.numpy() only works for CPU tensors that do not require grad, so
    # detach and move to host memory first (both are no-ops when not needed).
    rounded_rows = torch.round(tensor.detach()).cpu().numpy()

    sequences = []
    for seq in rounded_rows:
        # Float rows compare and hash equal to the integer tuple keys above.
        sequences.append(''.join(nucleotide_map[tuple(row)] for row in seq))

    return sequences

# Decode every generated sample into a nucleotide string.
sequences = tensor_to_sequences(generated_data_tensor)

In [12]:
# Print every decoded sequence, one per line.
# NOTE(review): this emits all decoded sequences into the saved output;
# printing only the first few would keep the notebook readable.
for seq in sequences:
    print(seq)


CAATATGGGTTATGCCCCACATGCTGTGCGAATGCTGATTCACCAATTTGAAGCAGGACATGAGTAGTAGCGGTAGCGGGAATAAGCAAGTACAGACATCTAGCTTGAACCGCGTCGACCATCGGCAAATCAATTCCCCAATCATGAAACTACACAGCAGATTTTATCGCTTCAATAGCGAAGACTCAAACATTGAACACAGCCCACACCTGGCCACGCGAATCCTACGTTTACCTACATCACTCCACCACACGATTCCCCTCCCGAAAGCGTAAGGATTAAAACTCATTCTATTTAGATACATCACGTCGCCCACGGCCGCTGCTGGTCTAAGGAACCATCCGGCCCTTCCGACCTGATCGAATACTCCACGCCTGCAGACACCAGTTTTACCACCCACAAGCTTCTCAATATGCCAACTTTCTCTGTTTGGCGCGCCACATATCTGCAGAGAGGAACCTCGAACGCTTATTAAAGATCACTAATCGCGGCTACTGCCCGGGTTACCTCATAAAGTACCCGATCCAGGCTGCATTTCACTTCCCGAAGCTGCGTTCCCACAAGTTTGCACATAAGTACTCCTCATCAAACCGCCCTGTGCCCAAGTGAACTTCCCTGAGCGTTCGGACCAAGGTGCTCCCTAGGGGCATGATTATGACACCGTCCGGCGTTTAGTACTGAAGCGGTAGTTAGAACCGCACCATTCGTCTTAATACACCGGCTCAACACTCGTGGACATGCTGCCCCCTTTGAAAGAGAAGCCTACCACCACGGTACGGAACATGTCCGCCCGTCCTGATCACATGCGTGAGCAAATGTAGTATTATATCTTGAGGCGTGAATTACGTGTTACTTTGAGCTTTTATCACACCAACGTCTAAGGTAGAACCTAAATCCCCCACTGAAAAGGCTGTCACTACGGTACCCTGTTATACAGGTCACGACCAGCACAAATAGATACATACTAATAGCGAGGAAATGTACAACATTAAAGCTTCG

In [None]:
# Collapse each sample to a single vector (batch dimension kept, everything
# else flattened); this is the form later passed to the critic.
generated_data_tensor_flattened = generated_data_tensor.view(generated_data_tensor.size(0), -1)


In [None]:
# Check the flattened shape before feeding the critic.
generated_data_tensor_flattened.shape

In [None]:
# Load the real sequences from disk.
# NOTE(review): this rebinds `sequences`, which held the decoded generated
# strings until now; a distinct name would avoid confusion on re-runs.
# NOTE(review): allow_pickle=True unpickles the file -- only load trusted data.
sequences = np.load('/Users/anastasiiashcherbakova/Desktop/cleaned_sequences.npy', allow_pickle=True)

In [None]:
# Build the project's tokenizer and fit it on the real sequences.
# `base_vocab` is not read by any cell visible in this notebook.
base_tokenizer = Base_Level_Tokenizer()
base_vocab = base_tokenizer.fit_on_texts(sequences)

In [None]:
# Batch size for the DataLoader created in the next cell.
batch_size = 4
# Convert the real sequences with the fitted tokenizer (presumably to integer
# indices, going by the method name -- confirm in GANs_model_1).
base_sequence_encoded = base_tokenizer.sequence_to_indices(sequences)

In [None]:
# Wrap the encoded sequences in the project's dataset class (given max_length)
# and serve them in shuffled batches of `batch_size`.
base_dataset = Dataset_Prep(base_sequence_encoded, max_length)
base_dataloader = DataLoader(base_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Take a single batch of real data; with shuffle=True and no seed set in this
# notebook, the batch differs from run to run.
real_data_batch = next(iter(base_dataloader)) 
# Move the batch to the device the critic lives on.
real_data_batch = real_data_batch.to(device) 

In [None]:
# Display the sampled real batch.
real_data_batch

In [None]:
# Use the batch's real size rather than `batch_size`: the last batch of a
# DataLoader can be smaller.
batch_size_actual = real_data_batch.size(0)
# One-hot encode with the project helper, then flatten each sample to one
# vector so it has the same layout as the flattened generated data.
real_data_one_hot = one_hot_encode_sequences(real_data_batch, vocab_size)
real_data_one_hot = real_data_one_hot.view(batch_size_actual, -1)

In [None]:
# Display the flattened one-hot real batch.
real_data_one_hot

In [None]:
# Check that the real batch's flattened width matches what the critic expects.
real_data_one_hot.shape

In [None]:
# Score the real batch and the generated samples with the critic; gradients
# are disabled because this is evaluation only.
with torch.no_grad():
    real_scores = critic(real_data_one_hot)
    generated_scores = critic(generated_data_tensor_flattened)

# Compare the scores
# NOTE(review): the real mean comes from a single batch (at most batch_size=4
# samples) while the generated mean covers num_samples=1000, so the two
# averages are not equally reliable.
real_scores_mean = real_scores.mean().item()
generated_scores_mean = generated_scores.mean().item()

print(f"Mean Critic Score for Real Data: {real_scores_mean}")
print(f"Mean Critic Score for Generated Data: {generated_scores_mean}")

In [5]:
import numpy as np
from scipy.interpolate import interp1d

In [6]:
# Load the real sequences from disk (same file as the earlier np.load cell).
# NOTE(review): allow_pickle=True unpickles the file -- only load trusted data.
sequences = np.load('/Users/anastasiiashcherbakova/Desktop/cleaned_sequences.npy', allow_pickle=True)

In [17]:
### Encoding the sequences
# Integer code for each base; the A, T, C, G order matches the channel order
# used by tensor_to_sequences above.
char_to_int = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
# Split every sequence into an array of single characters (one array per sequence).
split_sequences = [np.array(list(seq)) for seq in sequences]

In [21]:
# Confirm the container type: a plain Python list of per-sequence arrays.
type(split_sequences)

list

In [None]:
# Vectorize the mapping function: element-wise lookup of each character in
# `char_to_int`. `.get` returns None for any character outside the mapping,
# so the input is assumed to contain only A/T/C/G -- TODO confirm.
vectorized_map = np.vectorize(char_to_int.get)

# Apply the mapping to each sequence separately. `split_sequences` is a list
# of per-sequence arrays; if their lengths differ, passing the whole list to
# the vectorized function fails because NumPy cannot build a rectangular
# array from it. A 1-D object array holds one encoded array per sequence
# regardless of length.
int_sequences = np.empty(len(split_sequences), dtype=object)
for seq_index, seq_chars in enumerate(split_sequences):
    int_sequences[seq_index] = vectorized_map(seq_chars)

print(f"int_sequences shape: {int_sequences.shape}")
print(int_sequences)

### Interpolate each sequence to the target length
target_length = 2560

# The resampling grid is identical for every sequence, so build it once.
x_new = np.linspace(0, 1, target_length)

# NOTE(review): linear interpolation between integer base codes produces
# fractional values wherever neighbouring bases differ; confirm that a
# continuous signal (rather than nearest-base resampling) is intended.
interpolated_data = []
for seq in int_sequences:
    # Place the original positions evenly on [0, 1] and resample onto x_new.
    x = np.linspace(0, 1, len(seq))
    f = interp1d(x, seq, kind='linear')
    interpolated_seq = f(x_new)
    interpolated_data.append(interpolated_seq)

# Every row now has target_length entries, so this stacks into a 2-D array.
interpolated_data = np.array(interpolated_data)