<a href="https://colab.research.google.com/github/samiha-mahin/XGBoost-model-deploy-with-flask/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# Load the dataset. The path is relative to the notebook's working directory.
# NOTE(review): the file name suggests the data was already resampled with
# SMOTEENN before this split; if so, the test split also contains resampled
# rows and the reported test accuracy may be optimistic — confirm.
df = pd.read_csv('resampled_SMOTEENN.csv')

# The last column is assumed to be the target; every other column is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

print("Dataset shape:", df.shape)
print("First 5 rows:\n", df.head())
print("Class distribution:\n", y.value_counts())

# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance. Fitting the scaler on the full matrix leaks test-set
# statistics into preprocessing and into the scaler that is saved later.
# stratify=y keeps the class proportions the same in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit on the training rows only, then apply the same fitted transform to the
# test rows. Both results are NumPy arrays, as they were before this change.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept only so any later cell that still references
# X_scaled keeps working. It uses the train-fitted scaler; do not train on it.
X_scaled = scaler.transform(X)


# Hyperparameter search space: 3 values for each of 5 parameters gives
# 3**5 = 243 combinations. With cv=3 that is 729 fits, plus the final refit
# of the best combination on all of X_train, so this cell is slow.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Base estimator; random_state fixes the seed used for row/column subsampling.
xgb_classifier = XGBClassifier(random_state=42)

# Exhaustive grid search scored by accuracy.
grid_search = GridSearchCV(estimator=xgb_classifier,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,  # You can adjust the number of cross-validation folds
                           verbose=1,
                           n_jobs=-1)  # Use all available cores

grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy once.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy (Cross-Validation): {best_score:.4f}")

# The two lines below look like output pasted from an earlier run; values may
# differ when the notebook is re-executed.
#Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9}
#Best Accuracy (Cross-Validation): 0.7915

# Evaluate the best estimator found by the grid search on the held-out test
# split. best_estimator_ is the model refit on all of X_train with the best
# hyperparameters, so no extra training step is needed here.
best_xgb_classifier = grid_search.best_estimator_
y_pred = best_xgb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy on the test set (Best Model): {accuracy:.4f}")

In [None]:
import joblib

# Persist both fitted objects so they can be reloaded together later.
# The model was trained on scaled features, so the saved scaler has to be
# applied to any new input before it is passed to the saved model.
joblib.dump(value=best_xgb_classifier, filename='xgb_model.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')
