In [None]:
!pip install xgboost 

In [None]:
!pip install xgboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Read the dataset from disk (the path is relative to the working directory)
dataset_path = 'Ensemble_Techniques_XGBM_Data.csv'
df = pd.read_csv(dataset_path)

# Preview the first five rows
print(df.head())

# Report how many values are missing in each column
print("Missing values:", df.isnull().sum(), sep="\n")

# Impute gaps in numeric columns with that column's median;
# non-numeric columns are not touched by this step
df = df.fillna(df.median(numeric_only=True))

# Descriptive statistics of the imputed data
print(df.describe())

# Correlation heatmap
# Restrict the correlation to numeric columns: since pandas 2.0 `DataFrame.corr`
# no longer drops non-numeric columns silently and raises on them instead.
# This also matches the `numeric_only=True` used for the median imputation.
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

# Separate the predictors from the label column; fail fast if the label is absent
if 'target' not in df.columns:
    raise ValueError("Target column not found in dataset. Please check column names.")
X = df.drop(columns=['target'])
y = df['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a Decision Tree as baseline model
# Seed the tree like every other model in this script so the baseline accuracy
# is reproducible between runs (tie-breaking between equally good splits is
# otherwise random).
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

# Bagging: an ensemble of 50 decision trees
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42,
)
# `fit` returns the fitted estimator, so training and prediction can be chained
y_pred_bagging = bagging.fit(X_train, y_train).predict(X_test)
print("Bagging Accuracy:", accuracy_score(y_test, y_pred_bagging))

# Random Forest with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

# Rank the features by the importance the fitted forest assigned to them
feature_names = X.columns
importances = rf.feature_importances_
feat_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
feat_importance_df = feat_importance_df.sort_values(by='Importance', ascending=False)
# Show the five most important features
print(feat_importance_df.head())

# Boosting Algorithms

# AdaBoost with 50 boosting rounds
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
y_pred_adaboost = adaboost.fit(X_train, y_train).predict(X_test)
print("AdaBoost Accuracy:", accuracy_score(y_test, y_pred_adaboost))

# Gradient Boosting with 100 boosting stages
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
y_pred_gb = gb.fit(X_train, y_train).predict(X_test)
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))

# XGBoost with Hyperparameter Tuning
# NOTE: `use_label_encoder` is deliberately not passed. The option is deprecated
# in XGBoost and current releases ignore it, emitting an "unused parameter"
# warning on every fit, which the grid search below would repeat for each
# candidate and fold.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2]
}
# Exhaustive search over the 3 x 3 x 3 x 3 = 81 combinations, scored by
# 3-fold cross-validated accuracy on the training split, using all CPU cores.
grid_search = GridSearchCV(xgb, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
# Report which combination won so the tuning result can be reused
print("Best XGBoost Params:", grid_search.best_params_)

# Best XGBoost model, evaluated on the held-out test split
y_pred_xgb = grid_search.best_estimator_.predict(X_test)
print("Best XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))

# Model Comparison
# Labels and the matching test-set accuracies, in the order the models were trained
models = ['Decision Tree', 'Bagging', 'Random Forest', 'AdaBoost', 'Gradient Boosting', 'XGBoost']
accuracies = [
    accuracy_score(y_test, predictions)
    for predictions in (
        y_pred_dt,
        y_pred_bagging,
        y_pred_rf,
        y_pred_adaboost,
        y_pred_gb,
        y_pred_xgb,
    )
]

# Bar chart with one bar per model
plt.figure(figsize=(10,6))
sns.barplot(x=models, y=accuracies, palette='viridis')
plt.title('Model Performance Comparison')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.xticks(rotation=30)  # tilt the labels so the longer model names do not overlap
plt.show()