In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
import joblib
import Configurations as CF

In [11]:
# CELL 2: Data Loading & Extracting Primary Target
df = pd.read_csv(CF.FORMATIONS_INFO_DB)

# 1. Fill missing categorical values
df['Structure'] = df['Structure'].fillna('Unknown')
df['Shape'] = df['Shape'].fillna('Unknown')
df['Mode'] = df['Mode'].fillna('Unknown')
df['Counter Formations'] = df['Counter Formations'].fillna('Unknown')

# 2. THE FIX: Extract only the Primary Counter Formation (First item before '/')
# e.g., "4-5-1 / 4-1-3-2" becomes just "4-5-1"
df['Counter Formations'] = df['Counter Formations'].apply(lambda x: str(x).split('/')[0].strip())

print(f"Dataset Size: {df.shape[0]} rows")

# 3. Define Features (X) and Target (Y)
X = df[['Structure', 'Shape', 'Mode']]
y = df['Counter Formations']

# 4. Label Encode Target (Required for XGBoost/LightGBM)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 5. One-Hot Encode Inputs
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['Structure', 'Shape', 'Mode'])
    ])

# 6. Train / Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Dictionary to store results for the final comparison table
results_summary = []

Dataset Size: 80 rows
Training set size: 64
Testing set size: 16


In [12]:
# CELL 3: Feature Setup and Encoding

# 1. Prevent Cross-Validation Crashes: 
# We must drop classes that only appear exactly 1 time in the whole dataset, 
# otherwise Stratified K-Fold will crash during Hyperparameter tuning.
class_counts = df['Counter Formations'].value_counts()
valid_classes = class_counts[class_counts > 1].index
df = df[df['Counter Formations'].isin(valid_classes)]
print(f"Dataset size after removing single-instance classes: {df.shape[0]} rows")

# 2. Define X and Y
X = df[['Structure', 'Shape', 'Mode']]
y = df['Counter Formations']

# 3. Label Encode Target (Required for XGBoost/LightGBM)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 4. One-Hot Encode Inputs
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['Structure', 'Shape', 'Mode'])
    ])

# 5. Train / Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Dataset size after removing single-instance classes: 80 rows
Training set size: 64
Testing set size: 16


DECISION TREE:

In [13]:
# CELL 3: Decision Tree
print("--- Tuning Decision Tree ---")

dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

dt_params = {
    'classifier__max_depth': [None, 5, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__criterion': ['gini', 'entropy']
}

dt_grid = GridSearchCV(dt_pipeline, param_grid=dt_params, cv=3, scoring='accuracy')
dt_grid.fit(X_train, y_train)

dt_best = dt_grid.best_estimator_
dt_pred = dt_best.predict(X_test)

dt_acc = accuracy_score(y_test, dt_pred)
dt_f1 = f1_score(y_test, dt_pred, average='weighted')

results_summary.append({"Model": "Decision Tree", "Accuracy": dt_acc, "F1 Score": dt_f1})

print(f"Best Params: {dt_grid.best_params_}")
print(f"Accuracy: {dt_acc:.4f} | F1 Score: {dt_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))

--- Tuning Decision Tree ---
Best Params: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_split': 2}
Accuracy: 0.9375 | F1 Score: 0.9107
Confusion Matrix:
 [[7 0 0 0 0 0]
 [0 2 0 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 2 0 0]
 [0 0 0 0 3 0]
 [0 0 0 0 0 1]]




RANDOM FOREST

In [14]:
# CELL 4: Random Forest
print("--- Tuning Random Forest ---")

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

rf_params = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

rf_grid = GridSearchCV(rf_pipeline, param_grid=rf_params, cv=3, scoring='accuracy')
rf_grid.fit(X_train, y_train)

rf_best = rf_grid.best_estimator_
rf_pred = rf_best.predict(X_test)

rf_acc = accuracy_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred, average='weighted')

results_summary.append({"Model": "Random Forest", "Accuracy": rf_acc, "F1 Score": rf_f1})

print(f"Best Params: {rf_grid.best_params_}")
print(f"Accuracy: {rf_acc:.4f} | F1 Score: {rf_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))

--- Tuning Random Forest ---




Best Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Accuracy: 0.8750 | F1 Score: 0.8490
Confusion Matrix:
 [[7 0 0 0 0 0]
 [0 2 0 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 1 1 0]
 [0 0 0 0 3 0]
 [0 0 0 0 0 1]]


XGBOOST

In [15]:
# CELL 5: XGBoost
print("--- Tuning XGBoost ---")

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'))
])

xgb_params = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [3, 5, 10],
    'classifier__learning_rate': [0.01, 0.1, 0.2]
}

xgb_grid = GridSearchCV(xgb_pipeline, param_grid=xgb_params, cv=3, scoring='accuracy')
xgb_grid.fit(X_train, y_train)

xgb_best = xgb_grid.best_estimator_
xgb_pred = xgb_best.predict(X_test)

xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_f1 = f1_score(y_test, xgb_pred, average='weighted')

results_summary.append({"Model": "XGBoost", "Accuracy": xgb_acc, "F1 Score": xgb_f1})

print(f"Best Params: {xgb_grid.best_params_}")
print(f"Accuracy: {xgb_acc:.4f} | F1 Score: {xgb_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, xgb_pred))

--- Tuning XGBoost ---


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Params: {'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 150}
Accuracy: 0.7500 | F1 Score: 0.7500
Confusion Matrix:
 [[7 0 0 0 0 0 0]
 [0 2 0 0 0 0 0]
 [0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0]
 [0 0 0 1 1 1 0]
 [0 0 0 0 0 0 1]]


LIGHTGBM

In [16]:
# CELL 6: LightGBM
print("--- Tuning LightGBM ---")

lgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(random_state=42, verbose=-1))
])

lgb_params = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [3, 5, 10],
    'classifier__learning_rate': [0.01, 0.1]
}

lgb_grid = GridSearchCV(lgb_pipeline, param_grid=lgb_params, cv=3, scoring='accuracy')
lgb_grid.fit(X_train, y_train)

lgb_best = lgb_grid.best_estimator_
lgb_pred = lgb_best.predict(X_test)

lgb_acc = accuracy_score(y_test, lgb_pred)
lgb_f1 = f1_score(y_test, lgb_pred, average='weighted')

results_summary.append({"Model": "LightGBM", "Accuracy": lgb_acc, "F1 Score": lgb_f1})

print(f"Best Params: {lgb_grid.best_params_}")
print(f"Accuracy: {lgb_acc:.4f} | F1 Score: {lgb_f1:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, lgb_pred))

--- Tuning LightGBM ---




Best Params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 50}
Accuracy: 0.5000 | F1 Score: 0.4317
Confusion Matrix:
 [[6 0 0 0 1 0]
 [1 0 0 1 0 0]
 [0 0 0 0 1 0]
 [0 0 0 1 1 0]
 [1 0 0 1 1 0]
 [1 0 0 0 0 0]]


In [17]:
# CELL 7: Final Comparison Table & Export
print("\n=== FINAL RESULTS COMPARISON ===")
df_results = pd.DataFrame(results_summary).sort_values(by="Accuracy", ascending=False).reset_index(drop=True)
display(df_results)




=== FINAL RESULTS COMPARISON ===


Unnamed: 0,Model,Accuracy,F1 Score
0,Decision Tree,0.9375,0.910714
1,Random Forest,0.875,0.848958
2,XGBoost,0.75,0.75
3,LightGBM,0.5,0.431696


In [18]:
# Explicitly select the tuned Random Forest model
# Why? Because ensemble methods are much more robust to live, unseen data than a single Decision Tree.
ultimate_model = rf_best
selected_model_name = "Random Forest"

# Save the winning model
joblib.dump(ultimate_model, CF.COUNTER_FORMATION_PREDICTOR_PATH)
joblib.dump(label_encoder, CF.COUNTER_FORMATION_TARGET_LABEL_ENCODER_PATH)

print("="*40)
print(f"üèÜ SELECTED MODEL: {selected_model_name}")
print("="*40)
print(f"Saved successfully as '{CF.COUNTER_FORMATION_PREDICTOR_PATH}'.")
print(f"Saved target label encoder as '{CF.COUNTER_FORMATION_TARGET_LABEL_ENCODER_PATH}'")
print(f"Your model is now ready to be plugged into FormationDetector.py!")

üèÜ SELECTED MODEL: Random Forest
Saved successfully as 'Models/counter_form_predictor.pkl'.
Saved target label encoder as 'Models/counter_form_target_label_encoder.pkl'
Your model is now ready to be plugged into FormationDetector.py!
