In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [15]:
train_file = "../data/emvic/train.csv"  # Path to train CSV
test_file = "../data/emvic/test.csv"  # Path to test CSV
label_column = "class"

df = pd.read_csv(train_file)
df

Unnamed: 0,class,lx0,lx1,lx2,lx3,lx4,lx5,lx6,lx7,lx8,...,ry2038,ry2039,ry2040,ry2041,ry2042,ry2043,ry2044,ry2045,ry2046,ry2047
0,25,74,122,127,122,119,115,112,106,99,...,646,634,630,622,617,615,611,607,605,602
1,31,-14,-4,14,-3,11,0,6,6,1,...,383,392,383,386,389,381,388,384,385,387
2,12,-10,11,8,10,11,10,8,12,11,...,198,198,198,200,201,199,199,199,197,199
3,12,114,168,167,171,166,172,166,172,172,...,201,202,198,200,198,197,197,194,197,192
4,12,-13,11,11,10,12,17,16,20,21,...,-365,-368,-369,-369,-366,-372,-371,-371,-372,-374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,12,30,65,64,64,61,68,62,63,66,...,118,122,119,119,117,117,118,114,119,117
648,21,80,119,120,118,110,106,106,105,104,...,-94,-89,-88,-85,-83,-84,-86,-88,-90,-88
649,33,28,76,70,68,76,63,78,64,75,...,79,77,79,77,76,82,78,82,77,86
650,12,-62,-58,-59,-57,-57,-57,-59,-55,-59,...,193,194,191,192,189,187,188,186,186,185


In [25]:
unique = list(df["class"].unique())
unique.sort()
unique

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37]

In [31]:
na_counts = df.isna().sum()
na_counts, na_counts.sum()

(class     0
 lx0       0
 lx1       0
 lx2       0
 lx3       0
          ..
 ry2043    0
 ry2044    0
 ry2045    0
 ry2046    0
 ry2047    0
 Length: 8193, dtype: int64,
 0)

In [32]:
empty_counts = (df == '').sum()
empty_counts, empty_counts.sum()

(class     0
 lx0       0
 lx1       0
 lx2       0
 lx3       0
          ..
 ry2043    0
 ry2044    0
 ry2045    0
 ry2046    0
 ry2047    0
 Length: 8193, dtype: int64,
 0)

In [73]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from torch.utils.data import Dataset, DataLoader

# Custom dataset class
class TabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)

# Preprocess data with feature reduction (SelectKBest)
# Preprocess data with feature reduction (SelectKBest)
def preprocess_data(train_file, test_file, label_column, missing_threshold=0.2, top_k_features=15):
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Handle column names with unexpected characters
    train_df.columns = train_df.columns.str.strip().str.replace(r'[^\w\s]', '', regex=True)
    test_df.columns = test_df.columns.str.strip().str.replace(r'[^\w\s]', '', regex=True)

    # Replace missing values
    train_df.replace('?', np.nan, inplace=True)
    test_df.replace('?', np.nan, inplace=True)

    # Handle columns with too many missing values
    valid_columns = train_df.columns[train_df.isnull().mean() < missing_threshold]
    if label_column not in valid_columns:
        valid_columns = valid_columns.insert(0, label_column)  # Ensure label column is retained

    train_df = train_df[valid_columns]
    test_df = test_df[valid_columns]

    # Fill missing values
    train_df.fillna(train_df.mean(), inplace=True)
    test_df.fillna(train_df.mean(), inplace=True)

    # Extract labels and features
    train_labels = train_df[label_column].values
    test_labels = test_df[label_column].values

    train_features = train_df.drop(columns=[label_column]).values
    test_features = test_df.drop(columns=[label_column]).values

    # Combine unique labels from train and test
    unique_labels = sorted(np.unique(np.concatenate([train_labels, test_labels])))
    label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

    # Map labels to integers
    train_labels = np.array([label_mapping[label] for label in train_labels])
    test_labels = np.array([label_mapping[label] for label in test_labels])

    # Ensure labels are within the correct range
    validate_labels(train_labels, len(unique_labels))
    validate_labels(test_labels, len(unique_labels))

    # Standardize features
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    test_features = scaler.transform(test_features)

    # Feature selection using SelectKBest
    feature_selector = SelectKBest(f_classif, k=top_k_features)
    train_features = feature_selector.fit_transform(train_features, train_labels)
    test_features = feature_selector.transform(test_features)

    return train_features, train_labels, test_features, test_labels

# Define the model
class TabularModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(TabularModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)  # Output dimension should be the number of unique labels
        )

    def forward(self, x):
        return self.fc(x)

# Train the model
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluate the model
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    accuracy = correct / len(test_loader.dataset)
    return total_loss / len(test_loader), accuracy

# Main script
if __name__ == "__main__":
    train_file = "../data/emvic/train.csv"  # Path to train CSV
    test_file = "../data/emvic/test.csv"  # Path to test CSV
    label_column = "class"

    # Set device to CPU temporarily for debugging
    device = torch.device("cpu")
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Preprocess data
    train_features, train_labels, test_features, test_labels = preprocess_data(train_file, test_file, label_column)

    # Check if the labels are within the valid range
    print(f"Train labels unique values: {np.unique(train_labels)}")
    print(f"Test labels unique values: {np.unique(test_labels)}")

    # Create datasets and dataloaders
    train_dataset = TabularDataset(train_features, train_labels)
    test_dataset = TabularDataset(test_features, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Define model, loss, and optimizer
    input_dim = train_features.shape[1]  # Number of selected features
    output_dim = len(np.unique(train_labels)) + 1 # Number of unique labels

    model = TabularModel(input_dim, output_dim).to(device)  # Ensure the model is transferred to the correct device
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)




Using device: cpu


  test_df.replace('?', np.nan, inplace=True)


Train labels unique values: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37]
Test labels unique values: [18]


In [74]:
input_dim, output_dim, device

(15, 38, device(type='cpu'))

In [75]:
!export CUDA_LAUNCH_BLOCKING=1

In [70]:
print(f"Unique train labels: {np.unique(train_labels)}")
print(f"Unique test labels: {np.unique(test_labels)}")

Unique train labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37]
Unique test labels: [18]


In [77]:
# Train and evaluate the model
for epoch in range(10):  # Number of epochs
    #try:
    if True:
        train_loss = train_model(model, train_loader, criterion, optimizer, device)
        test_loss, test_accuracy = evaluate_model(model, test_loader, criterion, device)

        print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Test Loss = {test_loss:.4f}, Test Accuracy = {test_accuracy:.4f}")
    # except RuntimeError as e:
    #     print(f"Error during training: {e}")
    #     if "CUDA error" in str(e):
    #         print("Attempting to run on CPU due to CUDA error.")
    #         device = torch.device("cpu")
    #         model = model.to(device)
    #         continue

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [2]:
class TabularDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [None]:
# Load and preprocess data
def preprocess_data(file_path, label_column):
    df = pd.read_csv(file_path)
    
    # Separate features and labels
    labels = df[label_column].values
    features = df.drop(columns=[label_column])
    
    # Handle missing values (drop columns with too many missing values or impute)
    valid_columns = features.columns[features.isnull().mean() < 0.2]  # Keep columns with <20% missing
    features = features[valid_columns]
    features = features.fillna(features.mean())  # Fill missing values with column mean
    
    # Standardize features
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    
    return features, labels

In [None]:
# Define a simple feedforward neural network
class TabularModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(TabularModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )
    
    def forward(self, x):
        return self.fc(x)


In [None]:
# Training function
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(train_loader)

In [None]:
# Evaluation function
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    return total_loss / len(test_loader), accuracy

In [None]:
# Main script
def main():
    # File paths and label column name
    train_file = "train.csv"
    test_file = "test.csv"
    label_column = "class"  # Replace with the actual label column name
    
    # Load and preprocess data
    train_features, train_labels = preprocess_data(train_file, label_column)
    test_features, test_labels = preprocess_data(test_file, label_column)
    
    # Create Datasets and DataLoaders
    train_dataset = TabularDataset(train_features, train_labels)
    test_dataset = TabularDataset(test_features, test_labels)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
    # Model, loss, optimizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TabularModel(input_dim=train_features.shape[1], output_dim=len(np.unique(train_labels))).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Training loop
    epochs = 20
    for epoch in range(epochs):
        train_loss = train_model(model, train_loader, criterion, optimizer, device)
        test_loss, test_accuracy = evaluate_model(model, test_loader, criterion, device)
        
        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
    
    print("Training complete!")



In [None]:
main()