In [3]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
import pandas as pd

# Load and preprocess the data
graduation_data = pd.read_csv('../../outputs/Assignment_3/final_selected_features_data.csv')
X = graduation_data.drop(columns=['Target'])
y = LabelEncoder().fit_transform(graduation_data['Target'])  # Encode the target variable

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Initialize results dictionary
results = {}

# Bagging with Random Forest
random_forest = RandomForestClassifier(
    n_estimators=100,  # Increase for better performance if needed
    max_depth=10,  # Adjust based on data complexity
    random_state=42
)
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
results['Random Forest'] = metrics.accuracy_score(y_test, y_pred_rf)
f1_rf = metrics.f1_score(y_test, y_pred_rf, average='weighted')

# Boosting with XGBoost
xg_boost = XGBClassifier(
    n_estimators=100,  # Increase for better performance if needed
    learning_rate=0.1,  # Adjust learning rate for optimization
    max_depth=6,  # Adjust based on data complexity
    random_state=42,
    use_label_encoder=False,  # Suppress warning for new XGBoost versions
    eval_metric='logloss'  # Set eval metric explicitly
)
xg_boost.fit(X_train, y_train)
y_pred_xgb = xg_boost.predict(X_test)
results['XGBoost'] = metrics.accuracy_score(y_test, y_pred_xgb)
f1_xgb = metrics.f1_score(y_test, y_pred_xgb, average='weighted')

# Display results
print(f"Random Forest Accuracy: {results['Random Forest'] * 100:.2f}%")
print(f"Random Forest F1 Score: {f1_rf * 100:.2f}%\n")
print(f"XGBoost Accuracy: {results['XGBoost'] * 100:.2f}%")
print(f"XGBoost F1 Score: {f1_xgb * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.



Random Forest Accuracy: 73.92%
Random Forest F1 Score: 70.92%

XGBoost Accuracy: 73.54%
XGBoost F1 Score: 71.76%
