In [8]:
# Before Hyper-parameter tuning 

import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # For evaluating accuracy
from tqdm import tqdm

# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Show first few rows
df.head()

# Check the distribution of target variable
y = df['Cluster']
y.value_counts()

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply standardization (scaling) to the features
scaler = StandardScaler()  # Create the StandardScaler object
X = scaler.fit_transform(X)  # Apply scaling

# Split the data into train and test (using sklearn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Initialize the AdaBoost classifier
model = AdaBoostClassifier(
    n_estimators=100,  # Number of boosting stages
    learning_rate=1.0,  # Learning rate for the model
    random_state=42
)

# Training loop with batch processing
batch_size = 500
n_batches = len(X_train) // batch_size + (1 if len(X_train) % batch_size != 0 else 0)

for batch_idx in tqdm(range(n_batches), desc="Training Batches", unit="batch"):
    # Get the start and end indices for this batch
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(X_train))

    # Slice the training data for this batch
    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train[start_idx:end_idx]

    # Train on this batch
    model.fit(X_batch, y_batch)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")


Training Batches: 100%|██████████| 677/677 [01:44<00:00,  6.45batch/s]


Final Model Accuracy: 0.9078


In [5]:
# After Hyper-parameter tuning 

import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score  # For evaluating accuracy
from tqdm import tqdm

# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Show first few rows
df.head()

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply standardization (scaling) to the features
scaler = StandardScaler()  # Create the StandardScaler object
X = scaler.fit_transform(X)  # Apply scaling

# Split the data into train and test (using sklearn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Initialize the AdaBoost classifier (initial parameters)
model = AdaBoostClassifier(random_state=42)

# Hyperparameter grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

# Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Training loop with batch processing using the best model
batch_size = 500
n_batches = len(X_train) // batch_size + (1 if len(X_train) % batch_size != 0 else 0)

for batch_idx in tqdm(range(n_batches), desc="Training Batches", unit="batch"):
    # Get the start and end indices for this batch
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(X_train))

    # Slice the training data for this batch
    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train[start_idx:end_idx]

    # Train on this batch using the best model found by GridSearchCV
    best_model.fit(X_batch, y_batch)

# Evaluate the final model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")


Fitting 3 folds for each of 18 candidates, totalling 54 fits




Best Hyperparameters: {'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 200}


Training Batches: 100%|██████████| 677/677 [03:32<00:00,  3.19batch/s]


Final Model Accuracy: 0.9072


In [7]:
#  Confusion Matrix (For Code with hyperparameter tuning)

from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[54665  5588]
 [ 4879 47686]]


In [1]:
from sklearn.metrics import roc_auc_score

# Confusion matrix values
TN = 54665
FP = 5588
FN = 4879
TP = 47686

# Calculate TPR and FPR
TPR = TP / (TP + FN)  # True Positive Rate (Recall)
FPR = FP / (FP + TN)  # False Positive Rate

# If you have the model predictions, you would typically calculate the AUC like this:
# y_pred_proba = model.predict_proba(X_test)[:, 1]
# auc = roc_auc_score(y_test, y_pred_proba)

# For simplicity, if we use a simplified approach with just TPR and FPR:
# We can calculate AUC using these values, assuming a binary classification problem.
# (Note: This approximation assumes you have the right TPR and FPR curve from your model.)

# AUC calculation (based on TPR and FPR, approximated from confusion matrix):
# For simplicity, use this general approach:
# Note that this is only an approximation if you don't have the whole ROC curve.

# We can approximate AUC using TPR and FPR values, but this isn't as accurate as having the full ROC curve.
auc = 0.5 * (1 + TPR - FPR)
print(f"AUC (approximation): {auc:.4f}")


AUC (approximation): 0.9072
