In [1]:
import pandas as pd
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  
import warnings
import pickle  # Import pickle module for saving model

warnings.filterwarnings('ignore')  # Using to ignore all warnings 

# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Show first few rows
df.head()

# Check the distribution of target variable
y = df['Cluster']
y.value_counts()

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply standardization (scaling) to the features
scaler = StandardScaler()  # Create the StandardScaler object
X = scaler.fit_transform(X)  # Apply scaling

# Split the data into train and test (using sklearn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3, 0.5],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 0.1, 0.01],
    'scale_pos_weight': [1, 10, 25, 50]
}

# Initialize the XGBoost classifier
model = XGBClassifier(
    objective='binary:logistic',  # For binary classification; adjust if multi-class
    tree_method='gpu_hist',       # GPU optimized
    use_label_encoder=False,
    verbosity=1,
    predictor='gpu_predictor'
)

# Training loop with batch processing
batch_size = 3000 
n_batches = len(X_train) // batch_size + (1 if len(X_train) % batch_size != 0 else 0)

for batch_idx in tqdm(range(n_batches), desc="Training Batches", unit="batch"):
    # Get the start and end indices for this batch
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(X_train))

    # Slice the training data for this batch
    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train[start_idx:end_idx]

    # Train on this batch
    model.fit(X_batch, y_batch)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")

# Save the model to a pickle file
with open('xgboost_model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved as 'xgboost_model.pkl'")


Training Batches: 100%|██████████| 113/113 [00:34<00:00,  3.27batch/s]

Final Model Accuracy: 0.9456
Model saved as 'xgboost_model.pkl'



