In [1]:
# Conversion of code from TensorFlow to PyTorch

# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import random as rn
import joblib
from torch.utils.data import Dataset, DataLoader

In [2]:
# Preset parameters
RANDOM_SEED = 42
VALIDATE_SIZE = 0.2

In [3]:
# Setting random seeds to ensure reproducibility
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
rn.seed(RANDOM_SEED)

In [None]:
# # Define patients and their corresponding files
# patients = {
#     1: ['/kaggle/input/control-set-all/organized_fcs_data1a.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data1b.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data1c.csv'],
#     2: ['/kaggle/input/control-set-all/organized_fcs_data2a.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data2b.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data2c.csv'],
#     3: ['/kaggle/input/control-set-all/organized_fcs_data3a.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data3b.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data3c.csv'],
#     4: ['/kaggle/input/control-set-all/organized_fcs_data4a.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data4b.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data4c.csv'],
#     5: ['/kaggle/input/control-set-all/organized_fcs_data5a.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data5b.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data5c.csv'],
#     6: ['/kaggle/input/control-set-all/organized_fcs_data6a.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data6b.csv',
#         '/kaggle/input/control-set-all/organized_fcs_data6c.csv']
# }

In [4]:
# Define patients and their corresponding files
patients = {
    1: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data1a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data1b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data1c.csv'],
    2: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data2a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data2b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data2c.csv'],
    3: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data3a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data3b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data3c.csv'],
    4: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data4a.csv', 
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data4b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data4c.csv'],    
    5: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data5a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data5b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data5c.csv'],
    6: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data6a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data6b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data6c.csv'],
    7: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data7.csv'],    
    8: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data8a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data8b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data8c.csv'],
    9: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data9a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data9b.csv'],
    10: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data10a.csv',
         '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data10b.csv',
         '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data10c.csv'],
    11: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data11.csv'],  
    12: ['/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data12a.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data12b.csv',
        '/Users/subhadyutirath/Desktop/FlowCytometryPytorch/data/organized_fcs_data12c.csv']   
}

In [5]:
# Load, merge, and sample data for each patient
training_dfs = []
testing_dfs = []
i=0
for patient_id, file_paths in patients.items():
    # Load all three CSV files for the patient and drop the "Time" column
    if i < 5:
        patient_data = pd.concat([pd.read_csv(file).drop(columns=['Time'], errors='ignore') for file in file_paths])
        training_dfs.append(patient_data)
        i+=1
    else:
        patient_data = pd.concat([pd.read_csv(file).drop(columns=['Time'], errors='ignore') for file in file_paths])
        testing_dfs.append(patient_data)
    
train_full=pd.concat(training_dfs, ignore_index=True)
test_full=pd.concat(testing_dfs, ignore_index=True)

In [6]:
print(f"Training data samples: {len(training_dfs)}")
print(f"Testing data samples: {len(testing_dfs)}")

Training data samples: 5
Testing data samples: 7


In [None]:
# full_dataset = pd.concat(sampled_dfs)# Concatenate all sampled dataframes into a single dataframe


In [None]:
# # Shuffle the data
# shuffled_dataset = full_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# # Split into training (80%) and validation (20%)
# train_df, val_df = train_test_split(shuffled_dataset, test_size=0.2, random_state=42)

In [8]:
# Save the splits
train_full.to_csv('training_data.csv', index=False)
test_full.to_csv('validation_data.csv', index=False)

# # Print summary
# print(f"Total dataset size: {len(train_full)} cells")
print(f"Training set size: {len(train_full)} cells")
print(f"Validation set size: {len(test_full)} cells")

Training set size: 22406732 cells
Validation set size: 25469340 cells


In [9]:
# configure our pipeline
pipeline = Pipeline([('scaler', MinMaxScaler())])

pipeline.fit(train_full)

In [None]:
# Assuming you have already fit your pipeline on data
# pipeline.fit(X_train, y_train)

In [14]:
# Save the pipeline to a file
joblib.dump(pipeline, 'April_pipeline.pkl')

['April_pipeline.pkl']

In [13]:
# Now you can use this pipeline to transform new data or make predictions
# Load the pipeline from file
#pipeline = joblib.load('pipeline.pkl')

In [10]:
# transform the training and validation data with these parameters
X_train_transformed = pipeline.transform(train_full)
X_validate_transformed = pipeline.transform(test_full)

In [11]:
# data dimensions // hyperparameters 
input_dim = X_train_transformed.shape[1]
BATCH_SIZE = 256
EPOCHS = 100

In [12]:
class CustomDataset(Dataset):
    def __init__(self,dataset):
        self.dataset = torch.tensor(dataset, dtype=torch.float32)
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        return self.dataset[idx]

In [14]:
train_dataset = CustomDataset(X_train_transformed)
val_dataset = CustomDataset(X_validate_transformed)

In [15]:
train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, BATCH_SIZE, shuffle=False)

In [16]:
class AutoEncoder(nn.Module):
    def __init__(self,input_dim):
        super().__init__()
        # Defining the Encoder
        self.encoder=nn.Sequential(
            # Input_dim -> 16 -> 8 -> 4 -> 2
            nn.Linear(input_dim, 16),
            nn.ELU(),
            nn.Linear(16, 8),
            nn.ELU(),
            nn.Linear(8, 4),
            nn.ELU(),
            nn.Linear(4, 2),
            nn.ELU()
        )
        
        # Defining the Decoder
        self.decoder = nn.Sequential(
            nn.Linear(4, 8),
            nn.ELU(),
            nn.Linear(8, 16),
            nn.ELU(),
            nn.Linear(16, input_dim),
            nn.ELU(),
        )
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [17]:
autoencoder= AutoEncoder(input_dim=input_dim)

In [18]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters())

In [19]:
# Print model summary
print(autoencoder)
print(f"Total parameters: {sum(p.numel() for p in autoencoder.parameters())}")

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=14, out_features=16, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=16, out_features=8, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=8, out_features=4, bias=True)
    (5): ELU(alpha=1.0)
    (6): Linear(in_features=4, out_features=2, bias=True)
    (7): ELU(alpha=1.0)
  )
  (decoder): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=8, out_features=16, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=16, out_features=14, bias=True)
    (5): ELU(alpha=1.0)
  )
)
Total parameters: 844


In [None]:
# Training function
# def train_epoch(model, train_loader, criterion, optimizer):
#     model.train()
#     total_loss = 0
#     correct = 0
#     total = 0
    
#     for batch_idx, (data, target) in enumerate(train_loader):
        
#         optimizer.zero_grad()
#         output = model(data)
#         loss = criterion(output, target)
#         loss.backward()
#         optimizer.step()
        
#         total_loss += loss.item()
        
#         # Calculate accuracy (for reconstruction tasks, this is approximate)
#         pred = output.round()
#         correct += pred.eq(target.round()).sum().item()
#         total += target.numel()
    
#     avg_loss = total_loss / len(train_loader)
#     accuracy = 100. * correct / total
#     return avg_loss, accuracy


In [None]:
# # Validation function
# def validate_epoch(model, val_loader, criterion):
#     model.eval()
#     total_loss = 0
#     correct = 0
#     total = 0
    
#     with torch.no_grad():
#         for data, target in val_loader:
#             output = model(data)
#             loss = criterion(output, target)
#             total_loss += loss.item()
            
#             # Calculate accuracy
#             pred = output.round()
#             correct += pred.eq(target.round()).sum().item()
#             total += target.numel()
    
#     avg_loss = total_loss / len(val_loader)
#     accuracy = 100. * correct / total
#     return avg_loss, accuracy

In [None]:
# # Early stopping class
# class EarlyStopping:
#     def __init__(self, patience=10, min_delta=0.0001, restore_best_weights=True):
#         self.patience = patience
#         self.min_delta = min_delta
#         self.restore_best_weights = restore_best_weights
#         self.best_loss = None
#         self.counter = 0
#         self.best_weights = None
        
#     def __call__(self, val_loss, model):
#         if self.best_loss is None:
#             self.best_loss = val_loss
#             if self.restore_best_weights:
#                 self.best_weights = model.state_dict().copy()
#         elif val_loss < self.best_loss - self.min_delta:
#             self.best_loss = val_loss
#             self.counter = 0
#             if self.restore_best_weights:
#                 self.best_weights = model.state_dict().copy()
#         else:
#             self.counter += 1
            
#         if self.counter >= self.patience:
#             if self.restore_best_weights and self.best_weights is not None:
#                 model.load_state_dict(self.best_weights)
#             return True
#         return False


In [None]:
# Initialize early stopping
# early_stopping = EarlyStopping(patience=10, min_delta=0.0001, restore_best_weights=True)

In [None]:
# # Create logs directory
# import random as rn
# from datetime import datetime
# import os
# yyyymmddHHMM = datetime.now().strftime('%Y%m%d%H%M')
# log_subdir = f'{yyyymmddHHMM}_batch{BATCH_SIZE}_layers{len(list(autoencoder.modules()))}'
# os.makedirs('logs', exist_ok=True)