In [1]:
import os
import pandas as pd
from nptdms import TdmsFile
from sklearn.preprocessing import MinMaxScaler
from src.utils import preprocess_logger

In [2]:
def _add_time_features(frame, time_column='Time Channel CNC'):
    """Replace a timestamp column with numeric calendar/clock component columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame that contains ``time_column``.
    time_column : str
        Name of the column parsed with ``pd.to_datetime``.

    Returns
    -------
    pd.DataFrame
        A new frame with ``year``, ``month``, ``day``, ``hour``, ``minute``,
        ``second`` and ``millisecond`` appended (in that order) and
        ``time_column`` removed. The input frame is not modified.
    """
    timestamps = pd.to_datetime(frame[time_column])
    return frame.assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
        second=timestamps.dt.second,
        # `.dt.microsecond` is the sub-second part in microseconds; convert it
        # so the column actually holds what its name says.
        millisecond=timestamps.dt.microsecond // 1000,
    ).drop(columns=[time_column])


# Walk every sub-folder of `root_dir`, load each .tdms file, clean it, scale it
# to [0, 1] and store the result in `all_data` keyed by a running integer label.
root_dir = '../data'
all_data = {}   # label (int) -> scaled DataFrame
labels = []     # labels in the order the files were processed
file_id = 0     # 1-based counter over all .tdms files encountered

for folder in sorted(os.listdir(root_dir)):
    folder_path = os.path.join(root_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        if not file.endswith('.tdms'):
            continue

        file_id += 1
        file_path = os.path.join(folder_path, file)
        preprocess_logger.info(f"Reading file: {file_path}")
        tdms_file = TdmsFile.read(file_path)

        # Only the channels of the first group are used.
        raw_frame = pd.DataFrame(
            data={channel.name: channel[:] for channel in tdms_file.groups()[0].channels()}
        )
        preprocess_logger.info(f"{file}: data to df.")

        label = file_id

        # Discard string columns first: they are thrown away anyway, so nulls
        # inside them must not cause otherwise valid rows to be dropped.
        string_columns = raw_frame.select_dtypes(include=['object']).columns
        preprocess_logger.info(f"Dropping string columns: {list(string_columns)}")
        numeric_frame = raw_frame.drop(columns=string_columns)

        num_missing = numeric_frame.isnull().sum().sum()
        clean_frame = numeric_frame.dropna()
        rows_dropped = len(numeric_frame) - len(clean_frame)
        preprocess_logger.info(f"{num_missing} Null found, {rows_dropped} rows dropped.")

        if 'Time Channel CNC' in clean_frame.columns:
            preprocess_logger.info("Creating time-related features from 'Time Channel CNC'")
            clean_frame = _add_time_features(clean_frame, 'Time Channel CNC')

        preprocess_logger.info("Normalizing data")
        # Named `feature_scaler` so it cannot be confused with the AMP
        # GradScaler that a later cell binds to `scaler`.
        # NOTE(review): the scaler is fitted per file on all of its rows, so any
        # later train/validation split sees statistics from the validation rows.
        feature_scaler = MinMaxScaler()
        scaled_data = pd.DataFrame(
            data=feature_scaler.fit_transform(clean_frame),
            columns=clean_frame.columns,
        )
        preprocess_logger.info("Data normalization completed.")

        all_data[label] = scaled_data
        labels.append(label)
        preprocess_logger.info(f"df stored with label {label}")

preprocess_logger.info("All files processed.")

In [3]:
# Summary statistics of the scaled frame stored under label 1 (the first file processed).
first_file_stats = all_data[1].describe()
first_file_stats

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import TimeSeriesSplit
from models.model import LSTMModel
from models.dataset import CNCDataset
import torch.backends.cudnn as cudnn
import matplotlib.pyplot as plt

In [5]:
# --- Data / batching settings (consumed by CNCDataset and DataLoader in the training cell) ---
seq_length = 60
batch_size = 1024
target_columns = [3, 5]   # presumably positional column indices of the targets — TODO confirm against CNCDataset

# --- Network and training hyperparameters ---
first_frame = next(iter(all_data.values()))
input_size = first_frame.shape[1]   # one input feature per column of a preprocessed frame
hidden_size = 64
num_layers = 3
dropout = 0.7
output_size = 2
num_epochs = 20

# --- Device selection and model construction ---
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
cudnn.benchmark = True
model = nn.DataParallel(
    LSTMModel(input_size, hidden_size, output_size, num_layers, dropout).to(device)
)

In [6]:
# Loss and optimiser for the regression targets.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-5, weight_decay=1e-5)

# Mixed-precision gradient scaler, used together with autocast() in the training cell.
scaler = GradScaler()

# Multiply the learning rate by 0.2 once the monitored value has not improved for 3 steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=3)

# Per-epoch loss history, appended to by the training cell.
train_losses, val_losses = [], []

# Ordered (non-shuffled) 5-way splitter applied to each file's frame.
tscv = TimeSeriesSplit(n_splits=5)

In [7]:
# Train the single shared `model` on every file in turn. For each file the rows
# are split with `tscv`, and every fold runs `num_epochs` epochs of mixed-precision
# training followed by a validation pass. Model, optimizer and scheduler state
# carry over between folds and files (nothing is re-initialised here).
for label, data in all_data.items():
    for fold, (train_index, val_index) in enumerate(tscv.split(data)):
        train_data = data.iloc[train_index]
        val_data = data.iloc[val_index]

        train_dataset = CNCDataset(train_data.values, seq_length, target_columns, augment=True)
        val_dataset = CNCDataset(val_data.values, seq_length, target_columns, augment=False)

        # An empty dataset would yield zero batches and a division by zero when
        # averaging the losses below, so skip such folds explicitly.
        if len(train_dataset) == 0 or len(val_dataset) == 0:
            print(f"Label {label}, Fold {fold+1}: skipped (empty train or validation dataset)")
            continue

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, prefetch_factor=2, persistent_workers=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True, prefetch_factor=2, persistent_workers=True)

        for epoch in range(num_epochs):
            # ---- training pass ----
            model.train()
            running_loss = 0.0
            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)
                optimizer.zero_grad()

                with autocast():
                    outputs = model(inputs)
                    loss = criterion(outputs, targets)

                # `scaler` is the AMP GradScaler: scale the loss for backward,
                # then let it step the optimizer and update its scale factor.
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()

                running_loss += loss.item()

            mean_train_loss = running_loss / len(train_loader)
            train_losses.append(mean_train_loss)

            # ---- validation pass ----
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device, non_blocking=True), targets.to(device, non_blocking=True)
                    with autocast():
                        outputs = model(inputs)
                        loss = criterion(outputs, targets)
                    val_loss += loss.item()

            mean_val_loss = val_loss / len(val_loader)
            val_losses.append(mean_val_loss)

            # Step the scheduler on the per-batch mean, not the raw sum: the sum
            # grows with the number of validation batches, which differs between
            # files, and would make the scheduler's best-value comparison
            # inconsistent across files.
            scheduler.step(mean_val_loss)

            print(f"Label {label}, Fold {fold+1}, Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

In [None]:
# Plot the recorded training and validation losses on one axis.
# NOTE(review): the lists hold one entry per epoch of every file/fold combination,
# so the x-axis is a running epoch count over the whole training run.
fig, ax = plt.subplots(figsize=(10, 5))
for history, series_name in ((train_losses, 'Train Loss'), (val_losses, 'Val Loss')):
    ax.plot(range(1, len(history) + 1), history, label=series_name)
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training and Validation Loss')
plt.show()