In [1]:
from tdc.single_pred import ADME
import pandas as pd
import numpy as np
import re
from rdkit import Chem
from rdkit.Chem import Draw
from IPython.display import display, Image

In [2]:
data = ADME(name = 'Caco2_Wang')
df = data.get_data()
splits = data.get_split()

Downloading...
100%|█████████████████████████████████████| 82.5k/82.5k [00:00<00:00, 7.08MiB/s]
Loading...
Done!


In [3]:
train = splits['train']
valid = splits['valid']
test = splits['test']

In [4]:
df = pd.concat((train, test, valid), axis=0).reset_index(drop=True)

In [5]:
# from tdc.benchmark_group import admet_group
# group = admet_group(path = 'data/')
# predictions_list = []

# for seed in [1, 2, 3, 4, 5]:
#     benchmark = group.get('Caco2_Wang') 
#     # all benchmark names in a benchmark group are stored in group.dataset_names
#     predictions = {}
#     name = benchmark['name']
#     train_val, test = benchmark['train_val'], benchmark['test']
#     train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)
    
#         # --------------------------------------------- # 
#         #  Train your model using train, valid, test    #
#         #  Save test prediction in y_pred_test variable #
#         # --------------------------------------------- #
        
#     predictions[name] = y_pred_test
#     predictions_list.append(predictions)

# results = group.evaluate_many(predictions_list)
# # {'caco2_wang': [6.328, 0.101]}


In [6]:
column_name = 'Drug'

# Define the file path for the text file
file_path = 'Drug.txt'

# Extract the column data
column_data = train[column_name]

# Write column data to text file
column_data.to_csv(file_path, header=False, index=False)

In [7]:
import deepsmiles
print("DeepSMILES version: %s" % deepsmiles.__version__)
converter = deepsmiles.Converter(rings=True, branches=True)
print(converter) # record the options used

encoded = converter.encode(train['Drug'][0])
print("Encoded: %s" % encoded)

try:
    decoded = converter.decode(encoded)
except deepsmiles.DecodeError as e:
    decoded = None
    print("DecodeError! Error message was '%s'" % e.message)

if decoded:
    print("Decoded: %s" % decoded)

DeepSMILES version: 1.0.1
Converter(rings=True, branches=True)
Encoded: OcccO)ccc6)OCccccO)cO)c6))))))CO)C6
Decoded: Oc1cc(O)c3c(c1)OC(c2ccc(O)c(O)c2)C(O)C3


In [8]:
import pandas as pd
from rdkit import Chem

def tokenize_smiles(df, smiles_col='Drug'):
    df['Tokens'] = df[smiles_col].apply(lambda x: _tokenize_smiles_legacy(Chem.MolFromSmiles(str(x))))
    return df

def _tokenize_smiles_legacy(mol):

    tokens = []
    for atom in mol.GetAtoms():
        tokens.append(f"A:{atom.GetSymbol()}")
    
    for bond in mol.GetBonds():
        begin, end, bond_type = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx(), bond.GetBondType()
        tokens.append(f"B:{begin}-{end}-{bond_type}")
    return tokens

In [9]:
train

Unnamed: 0,Drug_ID,Drug,Y
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.22
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.86
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.09
3,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.84
4,dexamethasone b D glucuronide,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,-6.12
...,...,...,...
632,13h,CN1C(=O)CC(N2CCCN(CCCN3c4ccccc4CCc4ccc(C(=O)O)...,-5.36
633,(Z)-19f,CN1C(=O)CC(N2CCCN(CC/C=C3/c4ccccc4CCc4ccc(CC(=...,-5.32
634,Ac-C8-Enk-NH (3),CCCCCC[C@H](NC(C)=O)C(=O)N[C@@H](Cc1ccc(O)cc1)...,-5.97
635,Apometzgerin (2),COc1cc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc(O)c1OC,-4.95


In [10]:
# Example usage
df = tokenize_smiles(df)

train = tokenize_smiles(train)
test = tokenize_smiles(test)
valid = tokenize_smiles(valid)

In [11]:
df

Unnamed: 0,Drug_ID,Drug,Y,Tokens
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.220000,"[A:O, A:C, A:C, A:C, A:O, A:C, A:C, A:C, A:O, ..."
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.860000,"[A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, ..."
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.090000,"[A:C, A:O, A:C, A:C, A:C, A:C, A:C, A:C, A:O, ..."
3,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.840000,"[A:C, A:C, A:C, A:O, A:C, A:C, A:C, A:C, A:C, ..."
4,dexamethasone b D glucuronide,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,-6.120000,"[A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, ..."
...,...,...,...,...
905,atropine,CN1[C@H]2CC[C@@H]1CC(OC(=O)C(CO)c1ccccc1)C2,-4.700000,"[A:C, A:N, A:C, A:C, A:C, A:C, A:C, A:C, A:O, ..."
906,Guanabenz,NC(N)=NN=Cc1c(Cl)cccc1Cl,-4.330000,"[A:N, A:C, A:N, A:N, A:N, A:C, A:C, A:C, A:Cl,..."
907,4,CN(C(=O)[C@H](Cc1ccc(CN)cc1)NS(=O)(=O)c1ccc2cc...,-4.958607,"[A:C, A:N, A:C, A:O, A:C, A:C, A:C, A:C, A:C, ..."
908,20(S)-camptothecin (CPT),CC[C@]1(O)C(=O)OCc2c1cc1n(c2=O)-c2cc3ccccc3nc2C1,-4.331849,"[A:C, A:C, A:C, A:O, A:C, A:O, A:O, A:C, A:C, ..."


In [12]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
def convert_to_string(lst):
    return ' '.join(lst)

# Apply the function to the column
train['Tokens2'] = train['Tokens'].apply(convert_to_string)
test['Tokens2'] = test['Tokens'].apply(convert_to_string)
valid['Tokens2'] = valid['Tokens'].apply(convert_to_string)
df['Tokens2'] = df['Tokens'].apply(convert_to_string)


In [14]:
# from collections import Counter


# def custom_vectorizer(sequence, elements):
#     counts = Counter(sequence)
#     vector = [counts[element] for element in elements]
#     return np.array(vector)


# def apply_custom_vectorizer(df, column_name):
#     # Extract unique elements from the specified column
#     unique_elements = sorted(set([element for sublist in df[column_name] for element in sublist]))
    
#     # Vectorize each sequence in the specified column
#     vectorized_sequences = [custom_vectorizer(seq, unique_elements) for seq in df[column_name]]
    
#     # Create a DataFrame with the vectorized sequences
#     vectorized_df = pd.DataFrame(vectorized_sequences, columns=unique_elements)
    
#     # Concatenate the vectorized DataFrame with the original DataFrame
#     df = pd.concat([df, vectorized_df], axis=1)
    
#     return df

# # Apply the custom vectorizer to the 'sequences' column
# train = apply_custom_vectorizer(train, 'Tokens')
# test = apply_custom_vectorizer(test, 'Tokens')
# valid = apply_custom_vectorizer(valid, 'Tokens')

In [15]:
# X = train.drop(['Drug_ID', 'Drug', 'Y', 'Tokens'], axis=1)
# y = np.array(train['Y'])

In [16]:
# Tokenization and Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer()
# vectorizer = TfidfVectorizer()


X = vectorizer.fit_transform(df['Tokens2'])

y = np.array(df['Y'])

# X_test = vectorizer.fit_transform(test['Tokens'])
# y_test = np.array(test['Y'])

# X_valid = vectorizer.fit_transform(valid['Tokens'])
# y_valid = np.array(valid['Y'])

In [17]:
df

Unnamed: 0,Drug_ID,Drug,Y,Tokens,Tokens2
0,(-)-epicatechin,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2,-6.220000,"[A:O, A:C, A:C, A:C, A:O, A:C, A:C, A:C, A:O, ...",A:O A:C A:C A:C A:O A:C A:C A:C A:O A:C A:C A:...
1,"(2E,4Z,8Z)-N-isobutyldodeca-2,4,10-triene-8 -y...",C/C=C\C#CCC/C=C\C=C\C(=O)NCC(C)C,-3.860000,"[A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, ...",A:C A:C A:C A:C A:C A:C A:C A:C A:C A:C A:C A:...
2,codeine,COc1ccc2c3c1O[C@H]1[C@@H](O)C=C[C@H]4[C@@H](C2...,-4.090000,"[A:C, A:O, A:C, A:C, A:C, A:C, A:C, A:C, A:O, ...",A:C A:O A:C A:C A:C A:C A:C A:C A:O A:C A:C A:...
3,danazol,C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=Cc5oncc5C[C@]4(...,-4.840000,"[A:C, A:C, A:C, A:O, A:C, A:C, A:C, A:C, A:C, ...",A:C A:C A:C A:O A:C A:C A:C A:C A:C A:C A:C A:...
4,dexamethasone b D glucuronide,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,-6.120000,"[A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, A:C, ...",A:C A:C A:C A:C A:C A:C A:C A:C A:C A:C A:O A:...
...,...,...,...,...,...
905,atropine,CN1[C@H]2CC[C@@H]1CC(OC(=O)C(CO)c1ccccc1)C2,-4.700000,"[A:C, A:N, A:C, A:C, A:C, A:C, A:C, A:C, A:O, ...",A:C A:N A:C A:C A:C A:C A:C A:C A:O A:C A:O A:...
906,Guanabenz,NC(N)=NN=Cc1c(Cl)cccc1Cl,-4.330000,"[A:N, A:C, A:N, A:N, A:N, A:C, A:C, A:C, A:Cl,...",A:N A:C A:N A:N A:N A:C A:C A:C A:Cl A:C A:C A...
907,4,CN(C(=O)[C@H](Cc1ccc(CN)cc1)NS(=O)(=O)c1ccc2cc...,-4.958607,"[A:C, A:N, A:C, A:O, A:C, A:C, A:C, A:C, A:C, ...",A:C A:N A:C A:O A:C A:C A:C A:C A:C A:C A:C A:...
908,20(S)-camptothecin (CPT),CC[C@]1(O)C(=O)OCc2c1cc1n(c2=O)-c2cc3ccccc3nc2C1,-4.331849,"[A:C, A:C, A:C, A:O, A:C, A:O, A:O, A:C, A:C, ...",A:C A:C A:C A:O A:C A:O A:O A:C A:C A:C A:C A:...


In [18]:
df.iloc[0,1]

'Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2'

In [19]:
from IPython.display import display, Image

# Assuming df is your pandas DataFrame
for i in range(len(df)):
    smiles = df.iloc[i, 1]
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        img = Draw.MolToImage(mol)
        img_path = f'molecule_{i}.png'
        img.save(img_path)
        print(f'Saved {img_path}')
        display(Image(filename=img_path))
    else:
        print(f'Invalid SMILES at row {i}')

NameError: name 'Draw' is not defined

In [20]:
from PIL import Image

# Load a specific image
image_path = 'molecule_0.png'  # Change this to the path of the image you want to load
image = Image.open(image_path)
image.show()  # This will open the image using the default image viewer on your system

FileNotFoundError: [Errno 2] No such file or directory: 'molecule_0.png'

In [332]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os

class CustomDataset(Dataset):
    def __init__(self, data_dir, labels, transform=None):
        self.data_dir = data_dir
        self.labels = labels
        self.transform = transform

        # Load image paths
        self.image_paths = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.png')]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path)

        if self.transform:
            image = self.transform(image)

        label = self.labels[idx]
        
        return image, label

# Example labels array (replace this with your actual labels)
labels = df['Y']

# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to a fixed size
    transforms.ToTensor(),          # Convert images to tensors
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize images
])

# Get the current working directory
current_directory = os.getcwd()

# Define your dataset
dataset = CustomDataset(data_dir=current_directory, labels=labels, transform=transform)

# Define data loader
batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Iterate over the data loader
for images, labels in data_loader:
    # Process batch in your CNN model
    # 'images' will contain a tensor of shape (batch_size, channels, height, width)
    # 'labels' will contain the corresponding labels
    pass  # Replace this with your CNN model processing code


In [331]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x327ab35e0>

In [334]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 28 * 28, 512)
        self.fc2 = nn.Linear(512, 1)  # Output is a single value for prediction

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(-1, 64 * 28 * 28)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize the model
model = CNN()

# Define loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error loss for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, y in data_loader:
        optimizer.zero_grad()
        outputs = model(images)
        outputs = outputs.float()  # Convert to float if the model output is double
        loss = criterion(outputs, y.view(-1, 1).float())  # Reshape y to match output shape
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')

Epoch [1/100], Loss: 15.6310
Epoch [2/100], Loss: 0.7391
Epoch [3/100], Loss: 0.6077
Epoch [4/100], Loss: 0.6030
Epoch [5/100], Loss: 0.5955
Epoch [6/100], Loss: 0.5791
Epoch [7/100], Loss: 0.5738
Epoch [8/100], Loss: 0.6087
Epoch [9/100], Loss: 0.5105
Epoch [10/100], Loss: 0.4544
Epoch [11/100], Loss: 0.3880
Epoch [12/100], Loss: 0.3438
Epoch [13/100], Loss: 0.3290
Epoch [14/100], Loss: 0.2579
Epoch [15/100], Loss: 0.2212
Epoch [16/100], Loss: 0.1760
Epoch [17/100], Loss: 0.1594
Epoch [18/100], Loss: 0.1430
Epoch [19/100], Loss: 0.1255
Epoch [20/100], Loss: 0.1405
Epoch [21/100], Loss: 0.1212
Epoch [22/100], Loss: 0.0965
Epoch [23/100], Loss: 0.0907
Epoch [24/100], Loss: 0.0876
Epoch [25/100], Loss: 0.0794
Epoch [26/100], Loss: 0.0738
Epoch [27/100], Loss: 0.0715
Epoch [28/100], Loss: 0.0698
Epoch [29/100], Loss: 0.0734
Epoch [30/100], Loss: 0.0703
Epoch [31/100], Loss: 0.0633
Epoch [32/100], Loss: 0.0652
Epoch [33/100], Loss: 0.0860
Epoch [34/100], Loss: 0.0686
Epoch [35/100], Loss: 

KeyboardInterrupt: 

In [335]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 28 * 28, 512)
        self.fc2 = nn.Linear(512, 1)  # Output is a single value for prediction

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(-1, 64 * 28 * 28)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize the model
model = CNN()

# Define loss function and optimizer
criterion = nn.L1Loss()  # Mean Absolute Error loss for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, y in data_loader:
        optimizer.zero_grad()
        outputs = model(images)
        outputs = outputs.float()  # Convert to float if the model output is double
        loss = criterion(outputs, y.view(-1, 1).float())  # Reshape y to match output shape
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    train_loss = running_loss / len(data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')

Epoch [1/100], Loss: 2.2754
Epoch [2/100], Loss: 0.6699
Epoch [3/100], Loss: 0.6436
Epoch [4/100], Loss: 0.6555
Epoch [5/100], Loss: 0.6410
Epoch [6/100], Loss: 0.5698
Epoch [7/100], Loss: 0.5523
Epoch [8/100], Loss: 0.5147
Epoch [9/100], Loss: 0.5446
Epoch [10/100], Loss: 0.4831
Epoch [11/100], Loss: 0.4302
Epoch [12/100], Loss: 0.4382
Epoch [13/100], Loss: 0.4079
Epoch [14/100], Loss: 0.4410
Epoch [15/100], Loss: 0.4245
Epoch [16/100], Loss: 0.3909
Epoch [17/100], Loss: 0.3677
Epoch [18/100], Loss: 0.3626
Epoch [19/100], Loss: 0.3077
Epoch [20/100], Loss: 0.2849
Epoch [21/100], Loss: 0.3425
Epoch [22/100], Loss: 0.3253
Epoch [23/100], Loss: 0.2674
Epoch [24/100], Loss: 0.3120
Epoch [25/100], Loss: 0.2791
Epoch [26/100], Loss: 0.2458
Epoch [27/100], Loss: 0.2411
Epoch [28/100], Loss: 0.2335
Epoch [29/100], Loss: 0.2193
Epoch [30/100], Loss: 0.2331
Epoch [31/100], Loss: 0.2386
Epoch [32/100], Loss: 0.2199
Epoch [33/100], Loss: 0.2404
Epoch [34/100], Loss: 0.2448
Epoch [35/100], Loss: 0

KeyboardInterrupt: 

In [298]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.toarray()
X_test = X_test.toarray()


# Convert numpy arrays to PyTorch tensors
# X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
# X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Define the LSTM model
class LSTMPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMPredictor, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
#         self.dropout = nn.Dropout(dropout)
#         self.batch_norm = nn.BatchNorm1d(hidden_size)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  
#         out = self.dropout(out)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

input_size = X_train_tensor.shape[1]
hidden_size = 256
num_layers = 2
output_size = 1  # Output a single value for prediction

model = LSTMPredictor(input_size, hidden_size, num_layers, output_size)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Train the model
num_epochs = 100
batch_size = 32

for epoch in range(num_epochs):
    for i in range(0, len(X_train_tensor), batch_size):
        inputs = X_train_tensor[i:i+batch_size].unsqueeze(1)  # Adjusting input dimensions
        labels = y_train_tensor[i:i+batch_size]
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 2 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(X_train_tensor)}], Loss: {loss.item():.4f}')

# Test the model
with torch.no_grad():
    outputs = model(X_test_tensor.unsqueeze(1))  # Adjusting input dimensions
    mse = criterion(outputs.squeeze(), y_test_tensor)
    print(f'Mean Squared Error on Test Set: {mse.item():.4f}')

Mean Squared Error on Test Set: 0.5690


In [252]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# Assuming X_train, X_test, y_train, and y_test are defined

# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.long)  # Assuming input data is categorical and needs embedding
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.long)    # Assuming input data is categorical and needs embedding
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Define the LSTM model
class LSTMPredictor(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, num_layers, output_size):
        super(LSTMPredictor, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        # Forward propagate LSTM
        out, _ = self.lstm(embedded, (h0, c0))
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

input_size = 150  # Assuming X_train contains categorical data
embedding_dim = 10  # Choose an appropriate embedding dimension
hidden_size = 256
num_layers = 3
output_size = 1

model = LSTMPredictor(input_size, embedding_dim, hidden_size, num_layers, output_size)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 50
batch_size = 32

for epoch in range(num_epochs):
    running_loss = 0.0
    for i in range(0, len(X_train_tensor), batch_size):
        inputs = X_train_tensor[i:i+batch_size]  # No need to unsqueeze as embedding layer expects input in 2D
        labels = y_train_tensor[i:i+batch_size]
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if (i+1) % 2 == 0:  # Print every 2nd step
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(X_train_tensor)}], Loss: {running_loss/2:.4f}')
            running_loss = 0.0  # Reset running loss
# Test the model
with torch.no_grad():
    outputs = model(X_test_tensor)
    mse = criterion(outputs.squeeze(), y_test_tensor)
    print(f'Mean Squared Error on Test Set: {mse.item():.4f}')


Mean Squared Error on Test Set: 0.6355
