In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score  # For evaluating accuracy
from sklearn.semi_supervised import LabelSpreading
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Show first few rows
df.head()

# Check the distribution of target variable
y = df['Cluster']
y.value_counts()

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply standardization (scaling) to the features
scaler = StandardScaler()  # Create the StandardScaler object
X = scaler.fit_transform(X)  # Apply scaling

# Split the data into train and test (using sklearn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# For Label Spreading, we need to handle unlabeled data.
# We'll simulate unlabeled data by marking a portion of the training labels as -1 (unlabeled)
# You can adjust this ratio as necessary.
n_unlabeled = int(len(y_train) * 0.5)  # Example: 50% unlabeled data
y_train_unlabeled = y_train.copy()

# Use positional indices, not the actual index values in the DataFrame
unlabeled_indices = np.random.choice(len(y_train), size=n_unlabeled, replace=False)

# Now, set the selected indices to -1 (unlabeled)
y_train_unlabeled.iloc[unlabeled_indices] = -1

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'kernel': ['knn', 'rbf'],  # Kernel types: 'knn' (k-nearest neighbors) or 'rbf' (Radial Basis Function)
    'gamma': [0.1, 0.5, 1.0],  # For 'rbf' kernel: regularization parameter
    'n_neighbors': [3, 5, 10],  # Number of neighbors for 'knn' kernel
    'max_iter': [50, 100, 200]  # Maximum number of iterations for optimization
}

# Initialize the LabelSpreading model
model = LabelSpreading()

# Set up GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Perform grid search to find the best parameters
grid_search.fit(X_train, y_train_unlabeled)

# Display the best hyperparameters
print(f"Best hyperparameters: {grid_search.best_params_}")

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Training loop with batch processing
batch_size = 3000 
n_batches = len(X_train) // batch_size + (1 if len(X_train) % batch_size != 0 else 0)

for batch_idx in tqdm(range(n_batches), desc="Training Batches", unit="batch"):
    # Get the start and end indices for this batch
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(X_train))

    # Slice the training data for this batch
    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train_unlabeled[start_idx:end_idx]

    # Train on this batch (for label spreading, we train on the entire data)
    best_model.fit(X_batch, y_batch)

# Evaluate the model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best hyperparameters: {'gamma': 0.1, 'kernel': 'knn', 'max_iter': 50, 'n_neighbors': 5}


Training Batches: 100%|██████████| 113/113 [00:02<00:00, 48.84batch/s]


Final Model Accuracy: 0.9120


In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Calculate various evaluation metrics
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred)
# Print the evaluation results
print(f"Final Model Accuracy: {accuracy:.4f}")
print(f"Final Model Accuracy: {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted): {recall:.4f}")
print(f"F1 Score (weighted): {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Final Model Accuracy: 0.9120
Final Model Accuracy: 0.9120
Precision (weighted): 0.9120
Recall (weighted): 0.9120
F1 Score (weighted): 0.9120
Confusion Matrix:
[[55483  4770]
 [ 5161 47404]]
