In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # For evaluating accuracy
from sklearn.model_selection import ParameterGrid  # For hyperparameter grid search
import numpy as np
from tqdm import tqdm

# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Show first few rows
df.head()

# Check the distribution of target variable
y = df['Cluster']
y.value_counts()

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply standardization (scaling) to the features
scaler = StandardScaler()  # Create the StandardScaler object
X = scaler.fit_transform(X)  # Apply scaling

# Convert to numpy arrays for PyTorch
X = np.array(X)
y = np.array(y)

# Split the data into train and test (using sklearn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)  # For classification, use long type labels
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the MLP model class
class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLPModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)  # For multi-class classification (use Softmax)

    def forward(self, x):
        x = self.relu(self.fc1(x))  # First hidden layer
        x = self.fc2(x)  # Output layer
        return self.softmax(x)  # Apply softmax for classification

# Hyperparameter grid
param_grid = {
    'learning_rate': [0.001, 0.01],
    'batch_size': [256, 512, 1024],
    'hidden_size': [50, 100, 200],
    'epochs': [10, 20],
    'optimizer': ['adam', 'sgd']
}

# Function to train and evaluate model with given hyperparameters
def train_and_evaluate_model(lr, batch_size, hidden_size, epochs, optimizer_type):
    # Initialize the model
    input_size = X_train.shape[1]
    output_size = len(np.unique(y))  # Number of classes (Cluster values)
    model = MLPModel(input_size, hidden_size, output_size).to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss()  # For multi-class classification
    
    if optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif optimizer_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=lr)

    # Training loop
    model.train()
    for epoch in range(epochs):
        # Shuffle training data for each epoch
        permutation = torch.randperm(X_train_tensor.size()[0])

        for i in range(0, X_train_tensor.size()[0], batch_size):
            # Get batch indices
            indices = permutation[i:i+batch_size]
            X_batch = X_train_tensor[indices].to(device)
            y_batch = y_train_tensor[indices].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(X_batch)

            # Compute loss
            loss = criterion(outputs, y_batch)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

    # Evaluate the model
    model.eval()
    with torch.no_grad():
        y_pred_tensor = model(X_test_tensor.to(device))
        y_pred = torch.argmax(y_pred_tensor, dim=1).cpu().numpy()  # Convert predictions to numpy array

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, model

# Hyperparameter tuning
best_accuracy = 0
best_params = {}
best_model = None  # Variable to store the best model

# Grid search loop
for params in tqdm(ParameterGrid(param_grid), desc="Hyperparameter Tuning"):
    lr = params['learning_rate']
    batch_size = params['batch_size']
    hidden_size = params['hidden_size']
    epochs = params['epochs']
    optimizer_type = params['optimizer']
    
    accuracy, model = train_and_evaluate_model(lr, batch_size, hidden_size, epochs, optimizer_type)
    
    # Track the best hyperparameters and accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params
        best_model = model  # Save the best model

# Output the best hyperparameters and the corresponding accuracy
print(f"Best Accuracy: {best_accuracy:.4f}")
print("Best Hyperparameters:")
print(best_params)

# Save the best model using torch.save
model_save_path = 'best_mlp_model.pth'  # You can change this path if needed
torch.save(best_model.state_dict(), model_save_path)
print(f"Best model saved to {model_save_path}")


Hyperparameter Tuning: 100%|██████████| 72/72 [23:55<00:00, 19.94s/it]

Best Accuracy: 0.9519
Best Hyperparameters:
{'batch_size': 1024, 'epochs': 20, 'hidden_size': 200, 'learning_rate': 0.01, 'optimizer': 'adam'}
Best model saved to best_mlp_model.pth



