In [None]:
import pandas as pd 
import os
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import shap 
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


from sklearn.preprocessing import LabelEncoder

# Pretreat data - each class was measured separately and is loaded from its own CSV file

# Give file location here: the folder that contains the Measurements_*.csv files.
# '.' means "the folder the notebook is started from".
DATA_DIR = '.'

# The working directory is switched (rather than only prefixing the CSV paths) because
# later cells also write their output files (model JSON, PNG figures) with relative paths.
os.chdir(DATA_DIR)

# Load every class and tag each of its rows with the class name in a 'Label' column
df_gf = pd.read_csv('Measurements_Greenflag.csv').assign(Label='Greenflag')
df_gr = pd.read_csv('Measurements_Greenround.csv').assign(Label='Greenround')
df_rf = pd.read_csv('Measurements_Redflag.csv').assign(Label='Redflag')
df_rr = pd.read_csv('Measurements_Redround.csv').assign(Label='Redround')
df_ot = pd.read_csv('Measurements_Others.csv').assign(Label='Others')





In [2]:
# Stack the five per-class tables into one data set.
# The rows are still grouped by class at this point; they are shuffled in a later cell.
class_frames = [df_gf, df_gr, df_rf, df_rr, df_ot]
df = pd.concat(class_frames)

df.head()

In [4]:

# Shuffle the rows so the class-by-class order left by the concatenation cannot bias training.
# The fixed random_state makes the shuffle - and everything derived from it - reproducible
# under Restart & Run All.
rand_df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Format data: X holds the columns used to predict (everything except the target 'Label')
X = rand_df.drop(columns='Label').copy()


In [None]:
# Target: the class name of every row, converted to integer codes for the classifier
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(rand_df['Label'].copy())

# Print which integer code stands for which class name
known_classes = label_encoder.classes_
label_mapping = dict(zip(known_classes, label_encoder.transform(known_classes)))
print(label_mapping)



In [6]:
# Split the data set into a training part (80 %) and a held-out part (20 %).
# stratify=y keeps the class proportions equal in both parts; the fixed random_state
# makes the split identical on every run so results can be compared between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Run XGBoost

In [None]:
# Baseline model: default tree hyperparameters, multi-class objective, early stopping
# driven by the multi-class log loss on the evaluation set.
# NOTE(review): the evaluation set used for early stopping is (X_test, y_test), the same
# data the confusion matrix is computed on later - confirm that this is intended.
baseline_params = {
    'objective': 'multi:softmax',
    'early_stopping_rounds': 20,
    'eval_metric': 'mlogloss',
}
clf_xgb = xgb.XGBClassifier(**baseline_params)
clf_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

# Persist the fitted baseline model
clf_xgb.save_model('V26_Classifiermodel.json')

In [None]:
 ## Parameter optimization
 # The search was narrowed over two manual rounds. Both grids are kept for the record,
 # each followed by the best parameters that were noted when it was run.

 # Round 1
 param_grid_round1 = {
     'max_depth': [3, 4, 5],
     'learning_rate': [0.1, 0.5, 1],
     'gamma': [0, 0.05, 0.1],
     'reg_lambda': [0, 1.0, 10.0],
 }
 ##Output: Best Parameters: {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'reg_lambda': 0, 'scale_pos_weight': 1}
 # NOTE(review): the recorded output lists 'scale_pos_weight', which is not part of the
 # grid above - presumably it was noted from an earlier version of the grid; confirm.

 # Round 2
 param_grid_round2 = {
     'max_depth': [4],
     'learning_rate': [0.13, 0.14, 0.15],
     'gamma': [0.12],
     'reg_lambda': [0.01],
 }
 ##Output: Best Parameters: {'gamma': 0.12, 'learning_rate': 0.13, 'max_depth': 4, 'reg_lambda': 0.01}

 # Only the most recent round is actually searched
 param_grid = param_grid_round2

 # Exhaustive search over param_grid with 3-fold cross-validation, scored with 'roc_auc_ovr'
 base_estimator = xgb.XGBClassifier(objective='multi:softmax', num_class=5)
 optimal_params = GridSearchCV(
     estimator=base_estimator,
     param_grid=param_grid,
     scoring='roc_auc_ovr',
     cv=3,
     n_jobs=10,
     verbose=0,
 )

In [None]:
 # Optimize parameters: run the grid search on the training data.
 # eval_set and verbose are extra fit arguments handed on to the XGBoost estimator.
 search_fit_kwargs = {
     'eval_set': [(X_test, y_test)],
     'verbose': False,
 }
 optimal_params.fit(X_train, y_train, **search_fit_kwargs)
 print("Best Parameters:", optimal_params.best_params_)

## Build XGBoost model again with optimised parameters

In [None]:
# Final model: the hyperparameters below are the best values recorded for round 2 of
# the grid search. Both the log loss and the classification error are tracked on the
# evaluation set, and training stops early after 30 rounds without improvement.
tuned_params = dict(
    objective='multi:softmax',
    num_class=5,
    gamma=0.12,
    learning_rate=0.13,
    max_depth=4,
    reg_lambda=0.01,
    eval_metric=['mlogloss', 'merror'],
    early_stopping_rounds=30,
)
clf_xgb = xgb.XGBClassifier(**tuned_params)
clf_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

In [None]:
# Get overview of the hyperparameters the model was built with
params = clf_xgb.get_params()
for caption, key in (
    ("Learning rate", 'learning_rate'),
    ("Max depth", 'max_depth'),
    ("Gamma", 'gamma'),
    ("Reg lambda", 'reg_lambda'),
):
    print(f"{caption}:", params.get(key))

# best_iteration is only available when the model was fitted with early stopping;
# getattr with a default covers the case where it is missing.
best_iteration = getattr(clf_xgb, 'best_iteration', None)
if best_iteration is not None:
    num_trees = best_iteration + 1
    print(f"The model contains {num_trees} trees.")

# 'validation_0' is the first entry of eval_set, i.e. (X_test, y_test) - so the error
# recorded there is a validation error, not a training error. It is read at the best
# iteration when one is known, otherwise at the last recorded round.
evals_result = clf_xgb.evals_result()
merror_history = evals_result['validation_0'].get('merror')
if merror_history:
    report_round = best_iteration if best_iteration is not None else len(merror_history) - 1
    validation_error = merror_history[report_round]
    validation_accuracy = 1 - validation_error
    print(f"Validation accuracy: {validation_accuracy:.4f}")


In [None]:
# Visualize validation (confusion matrix)
predictions = clf_xgb.predict(X_test)

# Class names and their integer codes come from the fitted encoder instead of a
# hand-typed list, so the tick labels cannot drift out of sync with the encoding.
class_names = label_encoder.classes_.tolist()
class_codes = label_encoder.transform(label_encoder.classes_)

# labels= keeps one row/column per known class even if a class happens to be absent
# from y_test and predictions; otherwise the matrix would not match the tick labels.
cm = confusion_matrix(y_test, predictions, labels=class_codes)

plt.rcParams.update({'font.size': 14})

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(values_format='.1f', cmap='viridis', ax=ax)

# Rotate the class names on the y-axis so they read along the axis
ax.set_yticklabels(class_names, rotation=90, verticalalignment='center')
ax.set_xlabel('Predicted Label', labelpad=10)

# Save the plot to a file
fig.savefig('Tmodel_confusion_matrix.png', bbox_inches='tight')


## Make a Tree (for illustration)

In [None]:
# Fit a deliberately tiny model (a single boosting round) only to draw one tree.
# It gets its own name so that it does not overwrite the tuned `clf_xgb`; the
# feature-importance and SHAP cells further down would otherwise silently analyse
# this one-round illustration model instead of the real classifier.
clf_tree = xgb.XGBClassifier(objective='multi:softmax',
                             gamma=0,
                             learning_rate=0.1,
                             max_depth=4,
                             reg_lambda=10,
                             n_estimators=1)
clf_tree.fit(X_train, y_train)

# Print every importance measure the booster offers for the illustration model
bst = clf_tree.get_booster()
for importance_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    print('%s: ' % importance_type, bst.get_score(importance_type=importance_type))

# Graphviz styling for decision (condition) nodes and for leaves
# NOTE(review): '#78cbe' has only five hex digits - confirm the intended colour.
node_params = {'shape': 'box',
               'style': 'filled, rounded',
               'fillcolor': '#78cbe'}
leaf_params = {'shape': 'box',
               'style': 'filled',
               'fillcolor': '#ab2c2c'}

# Render the first tree (index 0) of the illustration model
graph_data = xgb.to_graphviz(clf_tree, num_trees=0, size="10,10",
                             condition_node_params=node_params,
                             leaf_node_params=leaf_params)

graph_data.view(filename='xgboost_tree_500_imageanalysis')

## Check influence of input parameters (feature importance)

In [None]:
# Bar chart of how often each feature is used for a split ('weight')
ax = xgb.plot_importance(clf_xgb, max_num_features=34, importance_type='weight')

# Save the plot to a file first, then display it: saving does not depend on what the
# active backend does to the figure once it has been shown.
ax.figure.savefig('importance.png', bbox_inches='tight')
plt.show()

# List the features by their 'gain' importance, most important first.
# The loop variable has its own name so the `importance` dict is not overwritten
# by a single float, which keeps this cell safe to re-run.
importance = clf_xgb.get_booster().get_score(importance_type='gain')
sorted_importance = sorted(importance.items(), key=lambda item: item[1], reverse=True)
for feature, gain in sorted_importance:
    print(f"Feature: {feature}, Importance: {gain}")
  

## SHAP analysis

In [10]:
# Ensure X_test is a DataFrame with the correct number of feature columns
#feature_names = ['Area','Mean','StdDev','Mode','Min','Max','X','Y','XM','YM','Perim.','BX','BY','Width','Height','Major','Minor','Angle','Circ.','Feret','IntDen','Median','Skew','Kurt','%Area','RawIntDen','FeretX','FeretY','FeretAngle','MinFeret','AR','Round','Solidity']  # Replace with actual feature names


# Explain predictions with SHAP values
# NOTE(review): 'interventional' is requested without passing background data to the
# explainer - confirm how the installed SHAP version handles that combination.
explainer = shap.TreeExplainer(clf_xgb, feature_perturbation='interventional')
shap_values = explainer(X_test, check_additivity=False)

num_classes = 5

# Row positions in X_test to explain
# (presumably one correctly predicted sample per class - TODO confirm; the positions
# only stay meaningful while the shuffle and the split are reproducible)
a = ([0,8,1,3,2])

#Array with class names
names=(['Greenflag','Greenround','Others','Redflag','Redround'])

# The class probabilities do not change between plots, so they are computed once
# here instead of once per waterfall plot inside the loops.
class_probabilities = clf_xgb.predict_proba(X_test)

# One waterfall plot per (selected row, class): the SHAP contributions of the ten most
# influential features, titled with the model's predicted probability for that class.
for j, row in enumerate(a):
    for i in range(num_classes):
        model_prediction = class_probabilities[row, i]
        plt.figure()
        shap.plots.waterfall(shap_values[row, :, i], show=False, max_display=10)
        plt.title(f"Class: {names[i]} - Model Prediction: {model_prediction:.2f}")
        plt.savefig(f"Waterfall {i,j}.png", bbox_inches='tight')
        plt.close()  # release the figure so 25 plots do not pile up in memory
        


    

