In [207]:
import pandas as pd
import numpy as np

# Load and prepare data

In [208]:
data = pd.read_csv('ufc-master.csv')
df = data.copy()


In [209]:
keep_columns = ['RedFighter', 'Date', 'Winner', 'WeightClass', 'Gender', 'RedAvgSigStrLanded','RedAvgSigStrPct','RedAvgSubAtt','RedAvgTDLanded','RedAvgTDPct','RedStance','RedHeightCms','RedReachCms','RedWeightLbs','RedAge']
df = df[keep_columns]

df = df.sort_values('Date', ascending = False)
df = df.reset_index(drop = True)

# Create Target Variable

In [210]:
df['Winner'].replace('Red', True, inplace = True)
df['Winner'].replace('Blue', False, inplace = True)

def create_target(fighter):
    fighter['target'] = fighter['Winner'].shift(-1)
    return fighter 

df = df.groupby('RedFighter', group_keys=False).apply(create_target)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Winner'].replace('Red', True, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Winner'].replace('Blue', False, inplace = True)
  df['Winner'].replace('Blue', False, inplace = True)
  df = df.groupby('RedFighter', group_keys=False).apply(create_target)


In [211]:

df['RedFighter']

0       Alexandre Pantoja
1             Randy Brown
2       Shavkat Rakhmonov
3       Kennedy Nzechukwu
4              Clay Guida
              ...        
6523         Duane Ludwig
6524          John Howard
6525       Brendan Schaub
6526          Mike Pierce
6527         Eric Schafer
Name: RedFighter, Length: 6528, dtype: object

In [212]:
df['target'][pd.isnull(df['target'])] = 2
df['target'] = df['target'].astype(int)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['target'][pd.isnull(df['target'])] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'][pd.isnul

In [213]:
df['Winner'].value_counts()
df['target'].value_counts()

target
1    3187
0    1680
2    1661
Name: count, dtype: int64

In [222]:
df.dropna(inplace = True)

In [223]:
df

Unnamed: 0,RedFighter,Date,Winner,WeightClass,Gender,RedAvgSigStrLanded,RedAvgSigStrPct,RedAvgSubAtt,RedAvgTDLanded,RedAvgTDPct,RedStance,RedHeightCms,RedReachCms,RedWeightLbs,RedAge,target
0,Alexandre Pantoja,2024-12-07,True,Flyweight,MALE,0.031277,0.490,0.095238,0.208800,0.470,Orthodox,0.217391,0.339623,0.066667,0.551724,0.0
1,Randy Brown,2024-12-07,False,Welterweight,MALE,0.031064,0.470,0.071429,0.057600,0.390,Orthodox,0.652174,0.754717,0.366667,0.551724,0.0
2,Shavkat Rakhmonov,2024-12-07,True,Welterweight,MALE,0.029220,0.610,0.214286,0.119200,0.290,Orthodox,0.565217,0.716981,0.366667,0.413793,0.0
3,Kennedy Nzechukwu,2024-12-07,True,Heavyweight,MALE,0.037872,0.470,0.023810,0.046400,0.450,Southpaw,0.739130,0.943396,0.600000,0.482759,0.0
4,Clay Guida,2024-12-07,False,Lightweight,MALE,0.018723,0.330,0.071429,0.250400,0.360,Orthodox,0.304348,0.452830,0.266667,0.827586,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6523,Duane Ludwig,2010-03-21,False,Lightweight,MALE,0.096927,0.577,0.000000,0.000000,0.000,Orthodox,0.434783,0.452830,0.366667,0.448276,1.0
6524,John Howard,2010-03-21,True,Welterweight,MALE,0.127660,0.550,0.119048,0.373336,0.790,Orthodox,0.304348,0.490566,0.366667,0.310345,1.0
6525,Brendan Schaub,2010-03-21,True,Heavyweight,MALE,0.085106,0.250,0.000000,0.000000,0.000,Orthodox,0.695652,0.754717,0.866667,0.310345,1.0
6526,Mike Pierce,2010-03-21,True,Welterweight,MALE,0.287234,0.405,0.000000,0.280000,0.520,Orthodox,0.347826,0.452830,0.366667,0.379310,1.0


# Get Features for Model

In [224]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

In [225]:
rr = RidgeClassifier(alpha = 1)
split = TimeSeriesSplit(n_splits = 3)
sfs = SequentialFeatureSelector(rr, n_features_to_select = 8, direction = 'forward', cv = split)  

In [226]:
remove = []
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype == 'str' or df[column].dtype == 'bool':
        remove.append(column)
print(remove)

selected_columns = df.columns[~df.columns.isin(remove)]

['RedFighter', 'Date', 'Winner', 'WeightClass', 'Gender', 'RedStance']


In [227]:
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns]) 
df[selected_columns].head()

Unnamed: 0,RedAvgSigStrLanded,RedAvgSigStrPct,RedAvgSubAtt,RedAvgTDLanded,RedAvgTDPct,RedHeightCms,RedReachCms,RedWeightLbs,RedAge,target
0,0.031277,0.49,0.095238,0.2088,0.47,0.217391,0.346154,0.066667,0.551724,0.0
1,0.031064,0.47,0.071429,0.0576,0.39,0.652174,0.769231,0.366667,0.551724,0.0
2,0.02922,0.61,0.214286,0.1192,0.29,0.565217,0.730769,0.366667,0.413793,0.0
3,0.037872,0.47,0.02381,0.0464,0.45,0.73913,0.961538,0.6,0.482759,0.0
4,0.018723,0.33,0.071429,0.2504,0.36,0.304348,0.461538,0.266667,0.827586,0.0


In [228]:
df

Unnamed: 0,RedFighter,Date,Winner,WeightClass,Gender,RedAvgSigStrLanded,RedAvgSigStrPct,RedAvgSubAtt,RedAvgTDLanded,RedAvgTDPct,RedStance,RedHeightCms,RedReachCms,RedWeightLbs,RedAge,target
0,Alexandre Pantoja,2024-12-07,True,Flyweight,MALE,0.031277,0.490,0.095238,0.208800,0.470,Orthodox,0.217391,0.346154,0.066667,0.551724,0.0
1,Randy Brown,2024-12-07,False,Welterweight,MALE,0.031064,0.470,0.071429,0.057600,0.390,Orthodox,0.652174,0.769231,0.366667,0.551724,0.0
2,Shavkat Rakhmonov,2024-12-07,True,Welterweight,MALE,0.029220,0.610,0.214286,0.119200,0.290,Orthodox,0.565217,0.730769,0.366667,0.413793,0.0
3,Kennedy Nzechukwu,2024-12-07,True,Heavyweight,MALE,0.037872,0.470,0.023810,0.046400,0.450,Southpaw,0.739130,0.961538,0.600000,0.482759,0.0
4,Clay Guida,2024-12-07,False,Lightweight,MALE,0.018723,0.330,0.071429,0.250400,0.360,Orthodox,0.304348,0.461538,0.266667,0.827586,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6523,Duane Ludwig,2010-03-21,False,Lightweight,MALE,0.096927,0.577,0.000000,0.000000,0.000,Orthodox,0.434783,0.461538,0.366667,0.448276,1.0
6524,John Howard,2010-03-21,True,Welterweight,MALE,0.127660,0.550,0.119048,0.373336,0.790,Orthodox,0.304348,0.500000,0.366667,0.310345,1.0
6525,Brendan Schaub,2010-03-21,True,Heavyweight,MALE,0.085106,0.250,0.000000,0.000000,0.000,Orthodox,0.695652,0.769231,0.866667,0.310345,1.0
6526,Mike Pierce,2010-03-21,True,Welterweight,MALE,0.287234,0.405,0.000000,0.280000,0.520,Orthodox,0.347826,0.461538,0.366667,0.379310,1.0


In [229]:
df['target'] = df['target'].round()
sfs.fit(df[selected_columns], df['target'])

In [230]:
predictors = list(selected_columns[sfs.get_support()])

In [None]:
predictors.remove('target')
predictors

['RedAvgSigStrLanded',
 'RedAvgSigStrPct',
 'RedAvgSubAtt',
 'RedAvgTDLanded',
 'RedAvgTDPct',
 'RedHeightCms',
 'RedReachCms']

# Initialize Baseline Model

In [None]:
def backtest(data, model, predictors, train_window_months = 24, test_window_months = 6):
    results = []
    
    data = data.copy()
    data['Date'] = pd.to_datetime(data['Date'])

    #generate test periods
    start_date = data['Date'].min()
    end_date = data['Date'].max()
    print(f"\nFull date range: {start_date} to {end_date}")
    
    test_starts = pd.date_range(start=pd.to_datetime(start_date) + pd.DateOffset(months=train_window_months),end=pd.to_datetime(end_date) - pd.DateOffset(months=test_window_months),freq='M')
    #iterate through each test period
    for test_start in test_starts:
        train_start = test_start - pd.DateOffset(months=train_window_months)
        test_end = test_start + pd.DateOffset(months=test_window_months)
        
        #split into training and testing data based on dates
        train_data = data[(data['Date'] >= train_start) & (data['Date'] < test_start)]
        test_data = data[(data['Date'] >= test_start) & (data['Date'] < test_end)]
        
        #skip if not enough data
        if len(train_data) < 1 or len(test_data) < 1:
            print(f"Skipping window due to insufficient data")
            continue

        #split into predictors and target
        X_train = train_data[predictors]
        y_train = train_data['target']
        X_test = test_data[predictors]
        y_test = test_data['target']

        #train and predict
        model.fit(X_train, y_train)

        preds = model.predict(X_test)
        preds = pd.Series(preds, index = y_test.index)

        #compare predictions to actual results
        combined = pd.concat([y_test, preds], axis = 1)
        combined.columns = ['actual', 'predicted']
        results.append(combined)
    
    return pd.concat(results)


In [246]:
predictions = backtest(df, rr, predictors)
print(predictors)
print(df['target'].value_counts())
predictions


Full date range: 2010-03-21 00:00:00 to 2024-12-07 00:00:00


  test_starts = pd.date_range(start=pd.to_datetime(start_date) + pd.DateOffset(months=train_window_months),end=pd.to_datetime(end_date) - pd.DateOffset(months=test_window_months),freq='M')


['RedAvgSigStrLanded', 'RedAvgSigStrPct', 'RedAvgSubAtt', 'RedAvgTDLanded', 'RedAvgTDPct', 'RedHeightCms', 'RedReachCms']
target
0.0    4793
1.0    1280
Name: count, dtype: int64


Unnamed: 0,actual,predicted
5775,1.0,1.0
5776,0.0,0.0
5778,0.0,0.0
5779,0.0,0.0
5780,0.0,0.0
...,...,...
285,0.0,0.0
286,0.0,0.0
287,0.0,0.0
288,0.0,0.0


In [247]:
from sklearn.metrics import accuracy_score

accuracy_score(predictions['actual'], predictions['predicted'])

0.8078082745329644

In [305]:
df.groupby('RedStance').apply(lambda x: x[x['target'] == 1].shape[0] / x.shape[0])


  df.groupby('RedStance').apply(lambda x: x[x['target'] == 1].shape[0] / x.shape[0])


RedStance
Open Stance    0.500000
Orthodox       0.210421
Southpaw       0.194376
Switch         0.268571
dtype: float64