In [6]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Read the two cleaned CSV inputs (paths are relative to the working directory):
# one table of fighter statistics and one table of fights.
fighter_stats = pd.read_csv(filepath_or_buffer="fighter_stats_cleaned.csv")
fights = pd.read_csv(filepath_or_buffer="fights_cleaned.csv")

In [7]:
# Create an unfitted scikit-learn LinearRegression estimator.
# NOTE(review): `mod` is not referenced by any later cell visible in this
# notebook (those train `rf_model` instead) — confirm whether it is still needed.
mod = LinearRegression()

In [8]:
# Names of the fighter-stats columns that are extracted for each fighter.
fighter_features = [
    'wins', 'losses', 'draws', 'height', 'weight', 'reach', 'stance',
    'strikes_landed_per_min', 'striking_accuracy', 'strikes_absorbed_per_min',
    'striking_defense', 'takedown_avg', 'takedown_accuracy',
    'takedown_defense', 'submission_avg'
]


def get_fighter_features(fighter_stats_data, fighter_id, prefix, features=None):
    """Return one fighter's stats as a flat dict with prefixed keys.

    Args:
        fighter_stats_data: DataFrame with a 'fighter_id' column plus one
            column for every requested feature.
        fighter_id: Value compared (with ``==``) against the 'fighter_id' column.
        prefix: String placed in front of every key, e.g. 'f1' -> 'f1_wins'.
        features: Optional iterable of column names to extract. When omitted,
            the module-level ``fighter_features`` list is used.

    Returns:
        A dict mapping ``f'{prefix}_{feature}'`` to the value found in the
        first matching row, or None when no row matches ``fighter_id``.

    Raises:
        KeyError: If 'fighter_id' or a requested feature column is missing.
    """
    if features is None:
        features = fighter_features

    # Search for the fighter in the dataset
    fighter_row = fighter_stats_data[fighter_stats_data['fighter_id'] == fighter_id]
    if fighter_row.empty:
        return None

    # Each column is read on its own so its dtype is kept; .values[0] takes the
    # first match if the id happens to appear in more than one row.
    return {
        f'{prefix}_{feature}': fighter_row[feature].values[0]
        for feature in features
    }


In [None]:
# Build one training example per decided fight.
#   X_data: list of feature dicts holding f1_*, f2_* and diff_* entries
#   y_data: list of labels, 1 if fighter1 won and 0 if fighter2 won
X_data = []
y_data = []

for idx, fight in fights.iterrows():

    fighter1_id = fight['fighter1_id']
    fighter2_id = fight['fighter2_id']

    winner_id = fight['winner_id']

    # Skip if winner_id is NaN (no winner)
    if pd.isna(winner_id):
        continue

    # Skip rows whose winner is neither of the two listed fighters. Without
    # this guard the `else 0` branch below would silently record such a row
    # as a fighter2 win and inject label noise.
    if winner_id != fighter1_id and winner_id != fighter2_id:
        continue

    # Get individual fighter features for f1 & f2
    f1_stats = get_fighter_features(fighter_stats, fighter1_id, 'f1')
    f2_stats = get_fighter_features(fighter_stats, fighter2_id, 'f2')

    # Skip if either fighter is missing stats
    if f1_stats is None or f2_stats is None:
        continue

    # Combine features for f1 & f2
    fight_features = {**f1_stats, **f2_stats}

    for feature in fighter_features:
        f1_val = fight_features[f'f1_{feature}']
        f2_val = fight_features[f'f2_{feature}']

        # Difference between f1 & f2; when either side is missing the
        # difference is recorded as 0
        if pd.isna(f1_val) or pd.isna(f2_val):
            fight_features[f'diff_{feature}'] = 0
        else:
            fight_features[f'diff_{feature}'] = f1_val - f2_val

    # Target 1 if fighter1 wins, 0 if fighter2 wins
    target = 1 if winner_id == fighter1_id else 0

    # Features for both fighters (f1_* and f2_* 'fighter_features') plus their differences
    X_data.append(fight_features)

    # Which fighter won the fight (1 if fighter1 wins, 0 if fighter2 wins)
    y_data.append(target)

# Convert to DataFrames (y is a single-column frame of 0/1 labels)
X = pd.DataFrame(X_data)
y = pd.DataFrame(y_data)


In [10]:
# Preview the first rows of the feature matrix. X_data is a plain Python list
# and has no .head(); the DataFrame built from it is X.
print(X.head())

   f1_wins  f1_losses  f1_draws  f1_height  f1_weight  f1_reach  f1_stance  \
0       20          6         0       68.0      155.0      71.0        0.0   
1        7          1         0       74.0      265.0      72.0        1.0   
2        7          1         0       74.0      265.0      72.0        1.0   
3        7          1         0       74.0      265.0      72.0        1.0   
4       20          8         0       75.0      235.0      76.0        0.0   

   f1_strikes_landed_per_min  f1_striking_accuracy  \
0                       5.51                    54   
1                       4.27                    55   
2                       4.27                    55   
3                       4.27                    55   
4                       2.41                    44   

   f1_strikes_absorbed_per_min  ...  diff_reach  diff_stance  \
0                         3.71  ...         5.0          0.0   
1                         3.67  ...        -3.0          1.0   
2             

In [13]:
# Split the data into train and test sets first, so that every preprocessing
# statistic below is learned from the training rows only (no test-set leakage).
# X itself is left untouched, so re-running this cell starts from the same input.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values: fill both splits with the training-set column medians
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Scale the features; the scaler is fitted on the training split and then
# applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

NameError: name 'X' is not defined

In [None]:
# Train Random Forest model
print("Training Random Forest model...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20, # 20 original value
    min_samples_split=10, # 10 original value
    min_samples_leaf=5,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,        # use all available CPU cores
    verbose=1
)

# fit() expects a 1-D target of shape (n_samples,). Flattening the values works
# whether y_train is a one-column DataFrame or a Series, and avoids the
# DataConversionWarning raised for column-vector targets.
rf_model.fit(X_train_scaled, y_train.values.ravel())
print("Model training complete!")

In [None]:
# Make predictions
print("Making predictions...")

# Predicted class labels for the training split, then for the test split
y_pred_train, y_pred_test = map(rf_model.predict, (X_train_scaled, X_test_scaled))

# Per-class probability estimates for the test split
y_pred_proba = rf_model.predict_proba(X_test_scaled)

In [None]:
# Evaluate the model: accuracy on both splits, then a per-class report for the test split

test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test Set):")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_test,
        target_names=['Fighter 2 Wins', 'Fighter 1 Wins'],
    )
)

In [None]:
# Confusion matrix comparing true and predicted labels on the test split
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print("\nConfusion Matrix:", cm, sep="\n")

In [None]:
# Feature Importance
# Pair each feature column name with the forest's importance score, then
# order the table from most to least important.
feature_importance = pd.DataFrame(
    data={'feature': X.columns, 'importance': rf_model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Top 20 Most Important Features:")
display(feature_importance.head(n=20))

In [None]:
# Feature Importance Plot: horizontal bars for the 20 highest-ranked features
fig, ax = plt.subplots(figsize=(12, 8))
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
# barh places position 0 at the bottom; invert the axis so the most important
# feature is at the top and the ranking reads top-down.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_title('Top 20 Feature Importances - Random Forest Model')
fig.tight_layout()
plt.show()