In [2]:
import os,gc
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import glob
import torch
import warnings

# disable warnings
warnings.filterwarnings('ignore')

# Load Train data

In [3]:
PATH = '/kaggle/input/hms-harmful-brain-activity-classification/'
df = pd.read_csv(PATH + 'train.csv')
TARGETS = df.columns[-6:]
print('Train shape:', df.shape )
print('Targets', list(TARGETS))
df.head()

Train shape: (106800, 15)
Targets ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


# Create Non-Overlaping EEG Id Train Data

In [4]:
train = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
    {'spectrogram_id':'first','spectrogram_label_offset_seconds':'min'})
train.columns = ['spectrogram_id','min']

tmp = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
    {'spectrogram_label_offset_seconds':'max'})
train['max'] = tmp

tmp = df.groupby('eeg_id')[['patient_id']].agg('first')
train['patient_id'] = tmp

tmp = df.groupby('eeg_id')[TARGETS].agg('sum')
for t in TARGETS:
    train[t] = tmp[t].values
    
y_data = train[TARGETS].values
y_data = y_data / y_data.sum(axis=1,keepdims=True)
train[TARGETS] = y_data

tmp = df.groupby('eeg_id')[['expert_consensus']].agg('first')
train['target'] = tmp

train = train.reset_index()
print('Train non-overlap eeg_id shape:', train.shape )

train.head(20)

Train non-overlap eeg_id shape: (17089, 12)


Unnamed: 0,eeg_id,spectrogram_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,14960202,1008.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,618728447,908.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,52296320,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other
5,1629671,2036345030,0.0,160.0,37481,1.0,0.0,0.0,0.0,0.0,0.0,Seizure
6,1895581,128369999,1138.0,1138.0,47999,0.076923,0.0,0.0,0.0,0.076923,0.846154,Other
7,2061593,320962633,1450.0,1450.0,23828,0.0,0.0,0.0,0.0,0.0,1.0,Other
8,2078097,2074135650,3342.0,3342.0,61174,0.0,0.0,0.0,0.0,0.0,1.0,Other
9,2366870,1232582129,0.0,30.0,23633,0.0,0.333333,0.0,0.0,0.0,0.666667,Other


In [5]:
import numpy as np

ycol = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
cd = {'Seizure': 'seizure_vote', 'GPD': 'gpd_vote', 'LRDA': 'lrda_vote', 'Other': 'other_vote', 'GRDA': 'grda_vote', 'LPD': 'lpd_vote'}

# Extract probability column and label column
eeg_id_col = train.iloc[:, 0]     # The first column is the eeg_id column
prob_cols = train.iloc[:, -7:-1]  # The seventh to last column is the probability column
label_col = train.iloc[:, -1]     # The last column is the label column

# Convert probability column to float32 
prob_cols = prob_cols.astype("float32")

# Normalized probability column
prob_cols_normalized = prob_cols.div(prob_cols.sum(axis=1), axis=0)

# Reassemble into DataFrame
normalized_data = pd.concat([eeg_id_col, prob_cols_normalized, label_col], axis=1)

normalized_data.head(20)

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,0.0,0.0,0.0,0.0,0.0,1.0,Other
5,1629671,1.0,0.0,0.0,0.0,0.0,0.0,Seizure
6,1895581,0.076923,0.0,0.0,0.0,0.076923,0.846154,Other
7,2061593,0.0,0.0,0.0,0.0,0.0,1.0,Other
8,2078097,0.0,0.0,0.0,0.0,0.0,1.0,Other
9,2366870,0.0,0.333333,0.0,0.0,0.0,0.666667,Other


In [6]:
normalized_data.to_csv("/kaggle/working/normalized_data.csv", index=False)

In [7]:
EEG_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/'
train_path = '/kaggle/working/normalized_data.csv'

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import glob
from scipy.signal import butter, sosfilt
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

import pandas as pd

class EEGDataset(Dataset):
    def __init__(self, csv_file, eeg_path):
        self.csv = pd.read_csv(csv_file)  # Load CSV file
        self.eeg_path = eeg_path
        self.sos = self.butter_bandpass_filter_init()
        self.FEATS = ['Fp1','T3','C3','O1','Fp2','C4','T4','O2', 'F7','F3', 'Fz', 'F4', 'F8','Cz','T5','P3','Pz','P4','T6']  # Specify the feature columns to be used

    def butter_bandpass_filter_init(self):
        lowcut = 0.5    # Set the low-frequency cutoff frequency of the bandpass filter
        highcut = 40.0  # Set the high-frequency cutoff frequency of the bandpass filter
        freq = 200.0    # Sampling frequency
        order = 5       # Filter order

        nyq = 0.5 * freq
        low = lowcut / nyq
        high = highcut / nyq
        sos = butter(order, [low, high], analog=False, btype='band', output='sos')
        return sos

    def butter_bandpass_filter(self, data):
        y = sosfilt(self.sos, data)
        return y
    
    def __len__(self):
        return len(self.csv)
    
    def __getitem__(self, idx):
        eeg_id = self.csv.loc[idx, 'eeg_id']
        eeg_file_path = f"{self.eeg_path}{eeg_id}.parquet" # Path to build EEG data files
        # Load specified feature columns from Parquet file
        eeg_data = pd.read_parquet(eeg_file_path)[self.FEATS].values 
        # Check if has NaN values
        if np.isnan(eeg_data).any():
            # If a NaN value is present, you can choose to fill it with a specific value or interpolate it
            # Here we use SimpleImputer for simple filling processing, replacing NaN values with mean values
            imputer = SimpleImputer(strategy='mean')
            eeg_data = imputer.fit_transform(eeg_data)
        eeg_data = self.butter_bandpass_filter(eeg_data)        # Filter the data
        eeg_data = torch.tensor(eeg_data, dtype=torch.float32)  # Convert to PyTorch tensor
                    
        # Select data from the middle 10,000 time points
        mid_index = eeg_data.shape[0] // 2
        start_index = mid_index - 5000  # Shift 5000 time points to the left from the center
        end_index = mid_index + 5000  # Shift 5000 time points to the right from the center
        eeg_data = eeg_data[start_index:end_index]
        # Swap dimension positions
        eeg_data = torch.transpose(eeg_data, 0, 1)
        
        # Load the corresponding tag
        labels = torch.tensor(self.csv.loc[idx, ycol].values.astype(np.float32), dtype=torch.float32)  
        #labels = labels.unsqueeze(0).expand(eeg_data.size(0), -1)  # 调整标签的尺寸与输出相匹配
        
        return eeg_data, labels

# Create an EEG dataset instance
dataset = EEGDataset(train_path, EEG_PATH)


# Create data loader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

In [13]:
X, y = dataset[0]
print(f"Sample {0 + 1}: X shape {X.shape}, y shape {y.shape}")

Sample 1: X shape torch.Size([19, 10000]), y shape torch.Size([6])


# CNN+LSTM Model

In [14]:
import torch.nn.functional as F

class CNNLSTM(nn.Module):
    def __init__(self, in_channels=19, num_classes=6):
        
        super(CNNLSTM, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.dropout1 = nn.Dropout(p=0.5)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.dropout2 = nn.Dropout(p=0.75)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.lstm1 = nn.LSTM(input_size=64, hidden_size=128, num_layers=1, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(input_size=256, hidden_size=128, num_layers=1, batch_first=True, bidirectional=True)

        self.attention = nn.Sequential(
            nn.Linear(128 * 2, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

        self.fc = nn.Linear(128 * 2, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.pool2(x)

        # reshape for LSTM
        batch_size, channels, seq_length = x.size()
        x = x.permute(0, 2, 1)

        # First LSTM layer
        x, _ = self.lstm1(x)

        # Second LSTM layer
        x, _ = self.lstm2(x)
        
        # Attention layer
        att_weights = F.softmax(self.attention(x), dim=1)
        x = torch.sum(att_weights * x, dim=1)

        # Fully connected layer
        x = self.fc(x)

        return x


In [15]:
# Define some hyperparameters
input_channels = 19
num_classes = 6  

# Create an EEGNet instance
model = CNNLSTM(in_channels=input_channels, num_classes=num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(dataloader):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(dataloader)}")

Epoch 1, Loss: 1.5641149536471501
Epoch 2, Loss: 1.5230671381281915
Epoch 3, Loss: 1.440500719970632
Epoch 4, Loss: 1.3911638005871638
Epoch 5, Loss: 1.3563048359389618


# Predict test data

In [3]:
test_path = '/kaggle/input/hms-harmful-brain-activity-classification/test.csv'
TEST_EEG_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/'

class TestEEGDataset(Dataset):
    def __init__(self, csv_file, eeg_path):
        self.csv = pd.read_csv(csv_file)  # Load the CSV file
        self.eeg_path = eeg_path
        self.sos = self.butter_bandpass_filter_init()  # Initialize the Butterworth filter parameters
        self.FEATS = ['Fp1','T3','C3','O1','Fp2','C4','T4','O2', 'F7','F3', 'Fz', 'F4', 'F8','Cz','T5','P3','Pz','P4','T6'] 
        
    def __len__(self):
        return len(self.csv)
    
    def butter_bandpass_filter_init(self):
        lowcut = 0.5  # Set the low-frequency cutoff for bandpass filtering
        highcut = 45.0  # Set the high-frequency cutoff for bandpass filtering
        freq = 200.0  # Sampling frequency
        order = 5  # Filter order

        nyq = 0.5 * freq
        low = lowcut / nyq
        high = highcut / nyq
        sos = butter(order, [low, high], analog=False, btype='band', output='sos')  # Create second-order sections for the Butterworth filter
        return sos

    def butter_bandpass_filter(self, data):
        y = sosfilt(self.sos, data)
        return y

    def __getitem__(self, idx):
        eeg_id = self.csv.loc[idx, 'eeg_id']
        eeg_file_path = f"{self.eeg_path}{eeg_id}.parquet"  # Build the EEG data file path
        eeg_data = pd.read_parquet(eeg_file_path)[self.FEATS].values  # Load EEG data from Parquet file

        eeg_data = self.butter_bandpass_filter(eeg_data)  # Apply filtering to the data
        eeg_data = torch.tensor(eeg_data, dtype=torch.float32)  # Convert to PyTorch tensor
         
        # Select 10,000 data points from the middle
        mid_index = eeg_data.shape[0] // 2
        start_index = mid_index - 5000  # Offset 5000 data points to the left from the middle
        end_index = mid_index + 5000  # Offset 5000 data points to the right from the middle
        eeg_data = eeg_data[start_index:end_index]
        # Swap dimensions
        eeg_data = torch.transpose(eeg_data, 0, 1)
        
        return eeg_data


testdataset = TestEEGDataset(test_path, TEST_EEG_PATH)

test_dataloader = DataLoader(testdataset, batch_size=32, shuffle=True, num_workers=4)


model.eval()

predictions = []

with torch.no_grad():
    for inputs in test_dataloader:  #  labels
        inputs = inputs.to(device)
        outputs = model(inputs)
        
        probabilities = torch.softmax(outputs, dim=1)
        predictions.append(probabilities.cpu().numpy())

# the prediction results, converting them into probabilities
predictions = np.concatenate(predictions, axis=0)

# Output prediction results
print(predictions)

NameError: name 'Dataset' is not defined

In [None]:
# Create DataFrame
columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
results_df = pd.DataFrame(predictions, columns=columns)

# Print DataFrame
print(results_df)

In [None]:
test_data = pd.read_csv(test_path)
sub = pd.DataFrame({'eeg_id':test_data.eeg_id.values})
sub[TARGETS] = results_df
sub.to_csv('/kaggle/working/submission.csv',index=False)
print('Submissionn shape',sub.shape)
sub.head()

In [None]:
### Validate dimensions
sub.iloc[:,-6:].sum(axis=1)