In [10]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import itertools

# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply standardization (scaling) to the features
scaler = StandardScaler()  # Create the StandardScaler object
X = scaler.fit_transform(X)  # Apply scaling

# Convert to Torch tensors (necessary for GPU-based operations)
X = torch.tensor(X, dtype=torch.float32).cuda()  # Move the features to GPU
y = torch.tensor(y.values, dtype=torch.long).cuda()  # Move target to GPU

# Split the data into train and test (using sklearn)
X_train, X_test, y_train, y_test = train_test_split(X.cpu().numpy(), y.cpu().numpy(), test_size=0.25, random_state=42)

# Convert back to Torch tensors after train-test split
X_train = torch.tensor(X_train, dtype=torch.float32).cuda()
X_test = torch.tensor(X_test, dtype=torch.float32).cuda()
y_train = torch.tensor(y_train, dtype=torch.long).cuda()
y_test = torch.tensor(y_test, dtype=torch.long).cuda()

# Define a simple linear model similar to SGDClassifier
class SGDModel(nn.Module):
    def __init__(self, input_dim):
        super(SGDModel, self).__init__()
        self.fc = nn.Linear(input_dim, 2)  # Binary classification

    def forward(self, x):
        return self.fc(x)

# Function for training the model and evaluating accuracy
def train_and_evaluate_model_accuracy(learning_rate, num_epochs):
    # Initialize the model
    model = SGDModel(X_train.shape[1]).cuda()

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()  # CrossEntropy for classification
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in tqdm(range(num_epochs), desc="Training Epochs", unit="epoch"):
        model.train()
        optimizer.zero_grad()  # Clear gradients
        outputs = model(X_train)  # Forward pass
        loss = criterion(outputs, y_train)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

    # Evaluate the model
    model.eval()  # Switch to evaluation mode
    with torch.no_grad():
        y_pred = model(X_test)  # Make predictions
        _, predicted = torch.max(y_pred, 1)  # Get predicted class labels

    # Evaluate accuracy
    accuracy = accuracy_score(y_test.cpu().numpy(), predicted.cpu().numpy())  # Convert back to NumPy for sklearn
    return accuracy, model, predicted  # Return accuracy, model, and predictions for further evaluation

# Hyperparameter grid for grid search
learning_rates = [0.001, 0.01, 0.1]
num_epochs_list = [50, 100, 150]

# Create a cartesian product of all hyperparameter combinations
param_combinations = list(itertools.product(learning_rates, num_epochs_list))

# Initialize variables to track the best hyperparameters
best_accuracy = 0
best_params = None
best_model = None
best_predictions = None

# Perform grid search
for lr, epochs in param_combinations:
    print(f"Training with learning_rate={lr} and num_epochs={epochs}")
    accuracy, model, predicted = train_and_evaluate_model_accuracy(lr, epochs)
    print(f"Accuracy: {accuracy:.4f}")

    # Track the best model based on accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = (lr, epochs)
        best_model = model
        best_predictions = predicted

# Output the best hyperparameters
print(f"Best Hyperparameters: Learning Rate={best_params[0]}, Epochs={best_params[1]}")
print(f"Best Model Accuracy: {best_accuracy:.4f}")


Training with learning_rate=0.001 and num_epochs=50


Training Epochs: 100%|██████████| 50/50 [00:00<00:00, 807.50epoch/s]


Accuracy: 0.4604
Training with learning_rate=0.001 and num_epochs=100


Training Epochs: 100%|██████████| 100/100 [00:00<00:00, 558.25epoch/s]


Accuracy: 0.7966
Training with learning_rate=0.001 and num_epochs=150


Training Epochs: 100%|██████████| 150/150 [00:00<00:00, 200.24epoch/s]


Accuracy: 0.5335
Training with learning_rate=0.01 and num_epochs=50


Training Epochs: 100%|██████████| 50/50 [00:00<00:00, 1082.10epoch/s]


Accuracy: 0.5924
Training with learning_rate=0.01 and num_epochs=100


Training Epochs: 100%|██████████| 100/100 [00:00<00:00, 581.56epoch/s]


Accuracy: 0.7511
Training with learning_rate=0.01 and num_epochs=150


Training Epochs: 100%|██████████| 150/150 [00:00<00:00, 198.36epoch/s]


Accuracy: 0.8608
Training with learning_rate=0.1 and num_epochs=50


Training Epochs: 100%|██████████| 50/50 [00:00<00:00, 1054.96epoch/s]


Accuracy: 0.9053
Training with learning_rate=0.1 and num_epochs=100


Training Epochs: 100%|██████████| 100/100 [00:00<00:00, 569.22epoch/s]


Accuracy: 0.9068
Training with learning_rate=0.1 and num_epochs=150


Training Epochs: 100%|██████████| 150/150 [00:00<00:00, 200.65epoch/s]


Accuracy: 0.9061
Best Hyperparameters: Learning Rate=0.1, Epochs=100
Best Model Accuracy: 0.9068


In [None]:
# Function to evaluate the model on all metrics
def evaluate_all_metrics(model, best_predictions):
    model.eval()  # Switch to evaluation mode

    # Convert predictions and targets back to CPU and numpy for metric calculations
    y_test_cpu = y_test.cpu().numpy()
    predicted_cpu = best_predictions.cpu().numpy()

    # Compute evaluation metrics
    accuracy = accuracy_score(y_test_cpu, predicted_cpu)
    precision = precision_score(y_test_cpu, predicted_cpu, average='weighted')
    recall = recall_score(y_test_cpu, predicted_cpu, average='weighted')
    f1 = f1_score(y_test_cpu, predicted_cpu, average='weighted')
    cm = confusion_matrix(y_test_cpu, predicted_cpu)

    # Displaying  metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision (weighted): {precision:.4f}")
    print(f"Recall (weighted): {recall:.4f}")
    print(f"F1 Score (weighted): {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)

evaluate_all_metrics(best_model, best_predictions)


Accuracy: 0.9068
Precision (weighted): 0.9073
Recall (weighted): 0.9068
F1 Score (weighted): 0.9068
Confusion Matrix:
[[54204  6049]
 [ 4468 48097]]
