In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler  
# ---------------------------------------------------------------------------
# Train a 3-class XGBoost classifier on the processed dataset, report its
# accuracy on a held-out split, and pickle the fitted model to disk.
# ---------------------------------------------------------------------------

# Load the dataset and replace every ESI value with its absolute value.
dt = pd.read_csv('processed_dataset.csv')
dt['ESI'] = dt['ESI'].abs()

# Linearly rescale ESI onto the [0, 100] interval.
# NOTE(review): the scaler is fitted on the whole dataset *before* the
# train/test split below, so held-out rows influence the scaling, and the
# fitted scaler is not saved next to the model -- confirm both are intended.
scaler = MinMaxScaler(feature_range=(0, 100))
esi_column = dt['ESI'].to_numpy().reshape(-1, 1)  # scaler expects a 2-D array
dt['ESI'] = scaler.fit_transform(esi_column)

# Feature matrix and target labels.
feature_columns = ['ESI', 'Atmospheric_Retention', 'Long_Term_Stability']
x = dt[feature_columns]
y = dt['P_HABITABLE']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Balance the classes: each class is weighted by (largest class size) /
# (its own size), so the most frequent class gets weight 1.0 and rarer
# classes get proportionally more.
class_counts = y_train.value_counts().to_dict()
max_count = max(class_counts.values())
weight_dict = {
    label: max_count / n_rows for label, n_rows in class_counts.items()
}

# One weight per training row, looked up from the row's class label.
sample_weights = y_train.map(weight_dict)

# Hyperparameters for the gradient-boosted classifier.
xgb_params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'n_estimators': 500,       # number of boosted trees
    'max_depth': 7,            # maximum depth of each tree
    'learning_rate': 0.03,     # small step size per boosting round
    'min_child_weight': 5,     # minimum summed instance weight in a child
    'subsample': 0.85,         # fraction of rows sampled per tree
    'colsample_bytree': 0.85,  # fraction of features sampled per tree
    'gamma': 2,                # minimum loss reduction required to split
    'reg_lambda': 2,           # L2 regularization strength
    'reg_alpha': 1,            # L1 regularization strength
    'random_state': 42,
    'tree_method': 'hist',     # histogram-based tree construction
    'verbosity': 1,
}
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(x_train, y_train, sample_weight=sample_weights)

# Evaluate on the held-out test split.
y_pred = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Testing accuracy Accuracy:", test_accuracy)

# Serialize the fitted classifier to disk.
import pickle

filename = 'model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(clf, file)

Testing accuracy Accuracy: 0.9830357142857142
