In [1]:
import pandas as pd

In [2]:
pip install ufc_api

Note: you may need to restart the kernel to use updated packages.


In [3]:
from ufc import get_fighter
from ufc import get_event

In [4]:
def flatten_event_data(event_data):
    flattened_data = []

    for fight in event_data['fights']:
        fight_data = {
            'event_name': event_data['name'],
            'event_date': event_data['date'],
            'event_location': event_data['location'],
            'event_venue': event_data['venue'],
            'weightclass': fight['weightclass'],
            'red_corner_name': fight['red corner']['name'],
            'red_corner_ranking': fight['red corner']['ranking'],
            'red_corner_odds': fight['red corner']['odds'],
            'red_corner_link': fight['red corner']['link'],
            'red_corner_result': fight['red corner']['result'],
            'blue_corner_name': fight['blue corner']['name'],
            'blue_corner_ranking': fight['blue corner']['ranking'],
            'blue_corner_odds': fight['blue corner']['odds'],
            'blue_corner_link': fight['blue corner']['link'],
            'blue_corner_result': fight['blue corner']['result'],
            'round': fight['round'],
            'time': fight['time'],
            'method': fight['method']
        }
        flattened_data.append(fight_data)

    return pd.DataFrame(flattened_data)

In [5]:
round_by_round = pd.read_csv('ufc_fight_stats.csv')

In [6]:
results = pd.read_csv('ufc_fight_results.csv')

In [7]:
event_details = pd.read_csv('ufc_event_details.csv')

In [8]:
fighters = pd.read_csv('ufc_fighter_tott.csv')

In [9]:
fighters['FIGHTER'] = fighters['FIGHTER'].str.lower()
fighters['FIGHTER'] = fighters['FIGHTER'].str.replace('-', '')

In [10]:
name_list = fighters['FIGHTER'].unique().tolist()

In [11]:
import csv
from collections import defaultdict
import pandas as pd

def parse_fight_data(data, header):
    fight_data = defaultdict(lambda: defaultdict(dict))

    for row in data:
        event, bout, round, fighter, *stats = row
        bout_parts = bout.split(' vs. ')
        if bout_parts[0] == fighter:
            opponent = bout_parts[1]
        else:
            opponent = bout_parts[0]
        fighter_data = fight_data[bout][fighter]

        for stat, value in zip(header[4:], stats):
            if ' of ' in value:
                success, attempt = value.split(' of ')
                fighter_data[f"{round}_{stat}_Success"] = success
                fighter_data[f"{round}_{stat}_Attempt"] = attempt
            else:
                fighter_data[f"{round}_{stat}"] = value

        fighter_data['OPPONENT'] = opponent

    return fight_data

# Read the data from a CSV file
with open('ufc_fight_stats.csv', 'r') as file:
    reader = csv.reader(file)
    header = next(reader)  # Get the header row
    data = list(reader)  # Get the data rows

print(header[4:])

fight_data = parse_fight_data(data, header)

# Create a list of rows for the DataFrame
rows = []
for bout, fighters in fight_data.items():
    for fighter, stats in fighters.items():
        row = {'BOUT': bout, 'FIGHTER': fighter}
        row.update(stats)
        rows.append(row)

# Create the DataFrame
df = pd.DataFrame(rows)

['KD', 'SIG.STR.', 'SIG.STR. %', 'TOTAL STR.', 'TD', 'TD %', 'SUB.ATT', 'REV.', 'CTRL', 'HEAD', 'BODY', 'LEG', 'DISTANCE', 'CLINCH', 'GROUND']


In [12]:
df.to_csv('test.csv')

In [13]:
import pandas as pd

# Assuming you have a DataFrame 'results' with the 'OUTCOME' column
results2 = results['OUTCOME'].str.split('/', expand=True).stack().reset_index(level=1, drop=True)

print(results2)

resultsWL = pd.DataFrame(results2)

0       L
0       W
1       L
1       W
2       L
       ..
7807    L
7808    W
7808    L
7809    W
7809    L
Length: 15620, dtype: object


In [14]:
resultsWL.rename(columns={resultsWL.columns[0]: 'WL'}, inplace=True)

In [15]:
resultsWL['WL'] = resultsWL['WL'].replace({'W': 1, 'NC': 0, 'D': 0, 'L': -1})

In [16]:
df = df.iloc[::-1].reset_index(drop=True)

In [17]:
# Assuming your DataFrame is named 'df'
columns_to_drop = ['_KD', '_SIG.STR.', '_SIG.STR. %', '_TOTAL STR.', '_TD', '_TD %', '_SUB.ATT', '_REV.', '_CTRL', '_HEAD', '_BODY', '_LEG', '_DISTANCE', '_CLINCH', '_GROUND']

df = df.drop(columns_to_drop, axis=1)

In [18]:
df.dtypes

BOUT                        object
FIGHTER                     object
Round 1_KD                  object
Round 1_SIG.STR._Success    object
Round 1_SIG.STR._Attempt    object
                             ...  
Round 5_DISTANCE_Attempt    object
Round 5_CLINCH_Success      object
Round 5_CLINCH_Attempt      object
Round 5_GROUND_Success      object
Round 5_GROUND_Attempt      object
Length: 123, dtype: object

In [19]:
df = df.dropna(subset=['Round 1_KD'])

In [20]:
df = df.fillna('-1')

In [21]:
columns_non_int = ['BOUT', 'FIGHTER', 'OPPONENT', 'Round 1_SIG.STR. %', 'Round 2_SIG.STR. %', 'Round 3_SIG.STR. %', 'Round 4_SIG.STR. %', 'Round 5_SIG.STR. %', 'Round 1_TD %'
                  , 'Round 2_TD %', 'Round 3_TD %', 'Round 4_TD %', 'Round 5_TD %', 'Round 1_CTRL', 'Round 2_CTRL', 'Round 3_CTRL', 'Round 4_CTRL', 'Round 5_CTRL']

# Function to remove '.0' from strings
def remove_decimal_zero(value):
    if isinstance(value, str) and value.endswith('.0'):
        return value.rstrip('.0')
    else:
        return value

for col in df.columns:
    if col not in columns_non_int:
        df[col] = df[col].apply(remove_decimal_zero)

df = df.replace(r'^\s*$', 0, regex=True)

for col in df.columns:
    if col not in columns_non_int:
        df[col] = df[col].astype(int)


In [22]:
df.to_csv('test.csv')

In [23]:
time_cols = ['Round 1_CTRL', 'Round 2_CTRL', 'Round 3_CTRL', 'Round 4_CTRL', 'Round 5_CTRL']

def convert_to_seconds(time_string):
  if pd.isna(time_string):  # Check for missing values (NaN)
    return 0
  if time_string == '--':
    return 0
  try:
    # Attempt to split, handle potential ValueError
    minutes, seconds = map(int, time_string.split(':'))
  except ValueError:
    # If there's no colon, return -1 for both minutes and seconds
    minutes, seconds = 0, 0
  return minutes * 60 + seconds


for cols in time_cols:
    df[cols] = df[cols].astype(str)
    df[cols] = df[cols].apply(convert_to_seconds)


In [24]:

sig_strikes =  ['Round 1_SIG.STR. %', 'Round 2_SIG.STR. %', 'Round 3_SIG.STR. %', 'Round 4_SIG.STR. %', 'Round 5_SIG.STR. %']

str_suc = ['Round 1_SIG.STR._Success', 'Round 2_SIG.STR._Success', 'Round 3_SIG.STR._Success', 'Round 4_SIG.STR._Success', 'Round 5_SIG.STR._Success']

df['Round 1_SIG.STR. %'] = df['Round 1_SIG.STR._Success'] / df['Round 1_SIG.STR._Attempt']
df['Round 2_SIG.STR. %'] = df['Round 2_SIG.STR._Success'] / df['Round 2_SIG.STR._Attempt']
df['Round 3_SIG.STR. %'] = df['Round 3_SIG.STR._Success'] / df['Round 3_SIG.STR._Attempt']
df['Round 4_SIG.STR. %'] = df['Round 4_SIG.STR._Success'] / df['Round 4_SIG.STR._Attempt']
df['Round 5_SIG.STR. %'] = df['Round 5_SIG.STR._Success'] / df['Round 5_SIG.STR._Attempt']


df = df.fillna(0)

In [25]:
df['Round 1_TD %'] = df['Round 1_TD_Success'] / df['Round 1_TD_Attempt']
df['Round 2_TD %'] = df['Round 2_TD_Success'] / df['Round 2_TD_Attempt']
df['Round 3_TD %'] = df['Round 3_TD_Success'] / df['Round 3_TD_Attempt']
df['Round 4_TD %'] = df['Round 4_TD_Success'] / df['Round 4_TD_Attempt']
df['Round 5_TD %'] = df['Round 5_TD_Success'] / df['Round 5_TD_Attempt']

df = df.fillna(0)

In [26]:
# Assuming you have a DataFrame 'df' with the mentioned columns
stat_types = ['_KD', '_SIG.STR._Success', '_TOTAL STR._Success', '_SIG.STR._Attempt', '_TOTAL STR._Attempt', '_TD_Success', '_TD_Attempt', '_SUB.ATT', '_REV.', '_CTRL', '_HEAD_Success', '_HEAD_Attempt', '_BODY_Success', '_BODY_Attempt', '_LEG_Success', '_LEG_Attempt', '_DISTANCE_Success', '_DISTANCE_Attempt', '_CLINCH_Success', '_CLINCH_Attempt', '_GROUND_Success', '_GROUND_Attempt']

# Create a dictionary to store the new columns
new_columns = {}

for stat_type in stat_types:
    cols = df.filter(regex=stat_type).columns
    # Use a lambda function to exclude -1 values before summing
    new_columns['Total' + stat_type] = df[cols].apply(lambda x: x[x != -1].sum(), axis=1)

# Create a new DataFrame with all the new columns
new_df = pd.DataFrame(new_columns)

# Concatenate the new DataFrame with the original one
df = pd.concat([df, new_df], axis=1)

In [27]:
attempt_cols = ['Round 1_TD_Attempt', 'Round 2_TD_Attempt', 'Round 3_TD_Attempt', 'Round 4_TD_Attempt', 'Round 5_TD_Attempt']

# List of percentage column names
percentage_cols = ['Round 1_TD %', 'Round 2_TD %', 'Round 3_TD %', 'Round 4_TD %', 'Round 5_TD %']

# Iterate over the attempt and percentage columns
for attempt_col, percentage_col in zip(attempt_cols, percentage_cols):
    # Check if the attempt column has a value of -1
    mask = df[attempt_col] == -1
    
    # Set the corresponding percentage column to -1 if the attempt column is -1
    df.loc[mask, percentage_col] = -1

In [28]:
df['Total_SIG.STR. %'] = df['Total_SIG.STR._Success'] / df['Total_SIG.STR._Attempt']
df['Total_TD %'] = df['Total_TD_Success'] / df['Total_TD_Attempt']

df = df.fillna(0)

In [29]:
results = results.iloc[::-1]

In [30]:
df['BOUT'] = df['BOUT'].str.strip().str.lower()
results['BOUT'] = results['BOUT'].str.strip().str.lower().str.replace('  ', ' ')
df_merge = pd.merge(df, results, on='BOUT', how='left')

In [31]:
weight_class_mapping = {
    'Open Weight': ['Open Weight Bout'],
    'Heavyweight': ['Heavyweight Bout', 'UFC Heavyweight Title Bout', 'UFC 13 Heavyweight Tournament Title Bout',
                    'UFC 14 Heavyweight Tournament Title Bout', 'UFC 15 Heavyweight Tournament Title Bout',
                    'Ultimate Japan Heavyweight Tournament Title Bout', 'Ultimate Japan 2 Heavyweight Tournament Title Bout',
                    'Ultimate Fighter 2 Heavyweight Tournament Title Bout', 'Ultimate Fighter 10 Heavyweight Tournament Title Bout',
                    'Ultimate Fighter Brazil 3 Heavyweight Tournament Title Bout', 'Super Heavyweight Bout', 'UFC Interim Heavyweight Title Bout'],
    'Light Heavyweight': ['UFC Light Heavyweight Title Bout', 'Light Heavyweight Bout', 'UFC Interim Light Heavyweight Title Bout',
                          'Ultimate Fighter 3 Light Heavyweight Tournament Title Bout', 'Ultimate Fighter 1 Light Heavyweight Tournament Title Bout',
                          'Ultimate Fighter 8 Light Heavyweight Tournament Title Bout', 'Ultimate Fighter 19 Light Heavyweight Tournament Title Bout',
                          'Ultimate Fighter 23 Light Heavyweight Tournament Title Bout'],
    'Middleweight': ['Middleweight Bout', 'UFC 17 Middleweight Tournament Title Bout', 'UFC Middleweight Title Bout', 'UFC Interim Middleweight Title Bout',
                     'Ultimate Fighter 1 Middleweight Tournament Title Bout', 'Ultimate Fighter 3 Middleweight Tournament Title Bout',
                     'Ultimate Fighter 4 Middleweight Tournament Title Bout', 'Ultimate Fighter 7 Middleweight Tournament Title Bout',
                     'Ultimate Fighter 11 Middleweight Tournament Title Bout', 'Ultimate Fighter 17 Middleweight Tournament Title Bout',
                     'Ultimate Fighter Brazil 1 Middleweight Tournament Title Bout', 'Ultimate Fighter Brazil 3 Middleweight Tournament Title Bout',
                     'Ultimate Fighter 19 Middleweight Tournament Title Bout', 'TUF Nations Canada vs. Australia Middleweight Tournament Title Bout'],
    'Welterweight': ['Welterweight Bout', 'UFC Welterweight Title Bout', 'UFC Interim Welterweight Title Bout',
                     'Ultimate Fighter 2 Welterweight Tournament Title Bout', 'Ultimate Fighter 6 Welterweight Tournament Title Bout',
                     'Ultimate Fighter 4 Welterweight Tournament Title Bout', 'Ultimate Fighter 9 Welterweight Tournament Title Bout',
                     'Ultimate Fighter 13 Welterweight Tournament Title Bout', 'Ultimate Fighter 16 Welterweight Tournament Title Bout',
                     'Ultimate Fighter Brazil 2 Welterweight Tournament Title Bout', 'Ultimate Fighter China Welterweight Tournament Title Bout',
                     'TUF Nations Canada vs. Australia Welterweight Tournament Title Bout', 'Ultimate Fighter Latin America 2 Welterweight Tournament Title Bout',
                     'Ultimate Fighter 21 Welterweight Tournament Title Bout', 'Ultimate Fighter 25 Welterweight Tournament Title Bout'],
    'Lightweight': ['Lightweight Bout', 'UFC Lightweight Title Bout', 'UFC 13 Lightweight Tournament Title Bout',
                    'Ultimate Fighter 5 Lightweight Tournament Title Bout', 'Ultimate Fighter 8 Lightweight Tournament Title Bout',
                    'Ultimate Fighter 9 Lightweight Tournament Title Bout', 'Ultimate Fighter 12 Lightweight Tournament Title Bout',
                    'Ultimate Fighter 15 Lightweight Tournament Title Bout', 'Ultimate Fighter Australia vs. UK Lightweight Tournament Title Bout',
                    'Ultimate Fighter Brazil 4 Lightweight Tournament Title Bout', 'Ultimate Fighter Latin America 2 Lightweight Tournament Title Bout',
                    'Ultimate Fighter 22 Lightweight Tournament Title Bout', 'Ultimate Fighter Latin America 3 Lightweight Tournament Title Bout',
                    'Ultimate Fighter 27 Lightweight Tournament Title Bout', 'UFC Interim Lightweight Title Bout'],
    'Featherweight': ['Featherweight Bout', 'UFC Featherweight Title Bout', 'Ultimate Fighter Brazil 1 Featherweight Tournament Title Bout',
                      'Ultimate Fighter 14 Featherweight Tournament Title Bout', 'Ultimate Fighter China Featherweight Tournament Title Bout',
                      'Ultimate Fighter Latin America Featherweight Tournament Title Bout', 'UFC Interim Featherweight Title Bout',
                      'Ultimate Fighter 27 Featherweight Tournament Title Bout'],
    'Bantamweight': ['Bantamweight Bout', 'UFC Bantamweight Title Bout', 'UFC Interim Bantamweight Title Bout',
                     'Ultimate Fighter 14 Bantamweight Tournament Title Bout', 'Ultimate Fighter 18 Bantamweight Tournament Title Bout',
                     'Ultimate Fighter Latin America Bantamweight Tournament Title Bout', 'Ultimate Fighter Brazil 4 Bantamweight Tournament Title Bout'],
    'Flyweight': ['Flyweight Bout', 'UFC Flyweight Title Bout', 'UFC Interim Flyweight Title Bout'],
    'Women\'s Bantamweight': ['UFC Women\'s Bantamweight Title Bout', 'Women\'s Bantamweight Bout', 'Ultimate Fighter 18 Women\'s Bantamweight Tournament Title Bout'],
    'Women\'s Featherweight': ['UFC Women\'s Featherweight Title Bout', 'Women\'s Featherweight Bout', 'Ultimate Fighter 28 Women\'s Featherweight Tournament Title Bout'],
    'Women\'s Flyweight': ['Women\'s Flyweight Bout', 'UFC Women\'s Flyweight Title Bout'],
    'Women\'s Strawweight': ['Women\'s Strawweight Bout', 'UFC Women\'s Strawweight Title Bout', 'Ultimate Fighter 23 Women\'s Strawweight Tournament Title Bout'],
    'Catch Weight': ['Catch Weight Bout'],
    'Other/Tournament Bouts': ['UFC 2 Tournament Title Bout', 'UFC 3 Tournament Title Bout', 'UFC 4 Tournament Title Bout', 'UFC Superfight Championship Bout',
                               'UFC 5 Tournament Title Bout', 'UFC 6 Tournament Title Bout', 'UFC 7 Tournament Title Bout', "Ultimate Ultimate '95 Tournament Title Bout",
                               'UFC 10 Tournament Title Bout', 'UFC 8 Tournament Title Bout', "Ultimate Ultimate '96 Tournament Title Bout"]
}

# Assuming you have a DataFrame 'df' with a 'WEIGHTCLASS' column
def map_weight_class(weight_class):
    for category, weight_classes in weight_class_mapping.items():
        if weight_class in weight_classes:
            return category
    return 'Other'

df_merge['weight_class'] = df_merge['WEIGHTCLASS'].apply(map_weight_class)

In [32]:
df_merge['TIME FORMAT'].unique()

array(['No Time Limit', '1 Rnd (20)', '1 Rnd + OT (31-5)', '1 Rnd (30)',
       '1 Rnd + OT (30-5)', '1 Rnd + OT (30-3)', '1 Rnd (15)',
       '1 Rnd (18)', '1 Rnd + OT (27-3)', '1 Rnd (10)',
       '1 Rnd + OT (12-3)', '1 Rnd + 2OT (24-3-3)',
       '1 Rnd + 2OT (15-3-3)', '1 Rnd + OT (15-3)', '1 Rnd (12)',
       '2 Rnd (5-5)', '3 Rnd (5-5-5)', '5 Rnd (5-5-5-5-5)',
       '3 Rnd + OT (5-5-5-5)'], dtype=object)

In [33]:
df_merge['format'] = df_merge['TIME FORMAT'].apply(lambda x: x[0])
df_merge['format'] = df_merge['format'].replace('N', 0)
df_merge['format'] = df_merge['format'].astype(int)

In [34]:
df_merge

Unnamed: 0,BOUT,FIGHTER,Round 1_KD,Round 1_SIG.STR._Success,Round 1_SIG.STR._Attempt,Round 1_SIG.STR. %,Round 1_TOTAL STR._Success,Round 1_TOTAL STR._Attempt,Round 1_TD_Success,Round 1_TD_Attempt,...,WEIGHTCLASS,METHOD,ROUND,TIME,TIME FORMAT,REFEREE,DETAILS,URL,weight_class,format
0,scott morris vs. sean daugherty,Sean Daugherty,0,0,4,0.000000,1,5,0,0,...,Open Weight Bout,Submission,1,0:20,No Time Limit,John McCarthy,Guillotine Choke From Mount,http://ufcstats.com/fight-details/4acab67848e7...,Open Weight,0
1,scott morris vs. sean daugherty,Scott Morris,0,1,1,1.000000,2,2,1,1,...,Open Weight Bout,Submission,1,0:20,No Time Limit,John McCarthy,Guillotine Choke From Mount,http://ufcstats.com/fight-details/4acab67848e7...,Open Weight,0
2,patrick smith vs. ray wizard,Ray Wizard,0,1,1,1.000000,2,2,0,0,...,Open Weight Bout,Submission,1,0:58,No Time Limit,John McCarthy,Guillotine Choke Standing,http://ufcstats.com/fight-details/4b9ae533ccb3...,Open Weight,0
3,patrick smith vs. ray wizard,Patrick Smith,0,1,1,1.000000,1,1,0,1,...,Open Weight Bout,Submission,1,0:58,No Time Limit,John McCarthy,Guillotine Choke Standing,http://ufcstats.com/fight-details/4b9ae533ccb3...,Open Weight,0
4,johnny rhodes vs. david levicki,David Levicki,0,4,5,0.800000,95,102,0,0,...,Open Weight Bout,KO/TKO,1,12:13,No Time Limit,John McCarthy,Punches to Head From GuardSubmission to Strikes,http://ufcstats.com/fight-details/ccee020be2e8...,Open Weight,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15573,alexa grasso vs. valentina shevchenko,Valentina Shevchenko,0,14,40,0.350000,25,51,1,2,...,UFC Women's Flyweight Title Bout,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),Mark Smith,Ben Cartlidge 45 - 50.Eric Colon 45 - 50.Ron M...,http://ufcstats.com/fight-details/6f83cfae0c80...,Women's Flyweight,5
15574,alexa grasso vs. valentina shevchenko,Alexa Grasso,0,12,41,0.292683,60,89,0,0,...,UFC Women's Flyweight Title Bout,Decision - Split,5,5:00,5 Rnd (5-5-5-5-5),Herb Dean,Junichiro Kamijo 47 - 48.Sal D'amato 48 - 47.M...,http://ufcstats.com/fight-details/b395c89e19a3...,Women's Flyweight,5
15575,alexa grasso vs. valentina shevchenko,Alexa Grasso,0,12,41,0.292683,60,89,0,0,...,UFC Women's Flyweight Title Bout,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),Mark Smith,Ben Cartlidge 45 - 50.Eric Colon 45 - 50.Ron M...,http://ufcstats.com/fight-details/6f83cfae0c80...,Women's Flyweight,5
15576,sean o'malley vs. merab dvalishvili,Merab Dvalishvili,0,16,30,0.533333,25,41,2,3,...,UFC Bantamweight Title Bout,Decision - Unanimous,5,5:00,5 Rnd (5-5-5-5-5),Herb Dean,Mike Bell 46 - 49.Sal D'amato 47 - 48.Chris Le...,http://ufcstats.com/fight-details/3146e5a47a92...,Bantamweight,5


In [35]:
df_merge['TIME'] = df_merge['TIME'].apply(convert_to_seconds)

In [36]:
df_merge

Unnamed: 0,BOUT,FIGHTER,Round 1_KD,Round 1_SIG.STR._Success,Round 1_SIG.STR._Attempt,Round 1_SIG.STR. %,Round 1_TOTAL STR._Success,Round 1_TOTAL STR._Attempt,Round 1_TD_Success,Round 1_TD_Attempt,...,WEIGHTCLASS,METHOD,ROUND,TIME,TIME FORMAT,REFEREE,DETAILS,URL,weight_class,format
0,scott morris vs. sean daugherty,Sean Daugherty,0,0,4,0.000000,1,5,0,0,...,Open Weight Bout,Submission,1,20,No Time Limit,John McCarthy,Guillotine Choke From Mount,http://ufcstats.com/fight-details/4acab67848e7...,Open Weight,0
1,scott morris vs. sean daugherty,Scott Morris,0,1,1,1.000000,2,2,1,1,...,Open Weight Bout,Submission,1,20,No Time Limit,John McCarthy,Guillotine Choke From Mount,http://ufcstats.com/fight-details/4acab67848e7...,Open Weight,0
2,patrick smith vs. ray wizard,Ray Wizard,0,1,1,1.000000,2,2,0,0,...,Open Weight Bout,Submission,1,58,No Time Limit,John McCarthy,Guillotine Choke Standing,http://ufcstats.com/fight-details/4b9ae533ccb3...,Open Weight,0
3,patrick smith vs. ray wizard,Patrick Smith,0,1,1,1.000000,1,1,0,1,...,Open Weight Bout,Submission,1,58,No Time Limit,John McCarthy,Guillotine Choke Standing,http://ufcstats.com/fight-details/4b9ae533ccb3...,Open Weight,0
4,johnny rhodes vs. david levicki,David Levicki,0,4,5,0.800000,95,102,0,0,...,Open Weight Bout,KO/TKO,1,733,No Time Limit,John McCarthy,Punches to Head From GuardSubmission to Strikes,http://ufcstats.com/fight-details/ccee020be2e8...,Open Weight,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15573,alexa grasso vs. valentina shevchenko,Valentina Shevchenko,0,14,40,0.350000,25,51,1,2,...,UFC Women's Flyweight Title Bout,Decision - Unanimous,5,300,5 Rnd (5-5-5-5-5),Mark Smith,Ben Cartlidge 45 - 50.Eric Colon 45 - 50.Ron M...,http://ufcstats.com/fight-details/6f83cfae0c80...,Women's Flyweight,5
15574,alexa grasso vs. valentina shevchenko,Alexa Grasso,0,12,41,0.292683,60,89,0,0,...,UFC Women's Flyweight Title Bout,Decision - Split,5,300,5 Rnd (5-5-5-5-5),Herb Dean,Junichiro Kamijo 47 - 48.Sal D'amato 48 - 47.M...,http://ufcstats.com/fight-details/b395c89e19a3...,Women's Flyweight,5
15575,alexa grasso vs. valentina shevchenko,Alexa Grasso,0,12,41,0.292683,60,89,0,0,...,UFC Women's Flyweight Title Bout,Decision - Unanimous,5,300,5 Rnd (5-5-5-5-5),Mark Smith,Ben Cartlidge 45 - 50.Eric Colon 45 - 50.Ron M...,http://ufcstats.com/fight-details/6f83cfae0c80...,Women's Flyweight,5
15576,sean o'malley vs. merab dvalishvili,Merab Dvalishvili,0,16,30,0.533333,25,41,2,3,...,UFC Bantamweight Title Bout,Decision - Unanimous,5,300,5 Rnd (5-5-5-5-5),Herb Dean,Mike Bell 46 - 49.Sal D'amato 47 - 48.Chris Le...,http://ufcstats.com/fight-details/3146e5a47a92...,Bantamweight,5


In [37]:
df_merge['METHOD'].unique()

array(['Submission ', 'KO/TKO ', 'Other ', 'Decision - Unanimous ',
       "TKO - Doctor's Stoppage ", 'Decision - Split ', 'Overturned ',
       'Decision - Majority ', 'DQ ', 'Could Not Continue '], dtype=object)

In [38]:
outcome_mapping = {
    'sub': ['Submission '],
    'KO': ['KO/TKO ', 'TKO - Doctor\'s Stoppage '],
    'split': ['Decision - Split ', 'Decision - Majority '],
    'unanimous': ['Decision - Unanimous '],
    'NC': ['Could Not Continue ', 'Overturned ', 'Other '],
}


# Assuming you have a DataFrame 'df' with a 'WEIGHTCLASS' column
def map_outcome(outcome):
    for category, outcomes in outcome_mapping.items():
        if outcome in outcomes:
            return category
    return 'Other'

df_merge['METHOD'] = df_merge['METHOD'].apply(map_outcome)


In [39]:
df_merge.to_csv('test.csv')

In [40]:
print(df_merge.columns)

Index(['BOUT', 'FIGHTER', 'Round 1_KD', 'Round 1_SIG.STR._Success',
       'Round 1_SIG.STR._Attempt', 'Round 1_SIG.STR. %',
       'Round 1_TOTAL STR._Success', 'Round 1_TOTAL STR._Attempt',
       'Round 1_TD_Success', 'Round 1_TD_Attempt',
       ...
       'WEIGHTCLASS', 'METHOD', 'ROUND', 'TIME', 'TIME FORMAT', 'REFEREE',
       'DETAILS', 'URL', 'weight_class', 'format'],
      dtype='object', length=159)


In [41]:
columns_to_drop = ['TIME FORMAT', 'REFEREE', 'DETAILS', 'URL', 'WEIGHTCLASS']


df_merge = df_merge.drop(columns = columns_to_drop)

In [42]:
df_merge['EVENT'] = df_merge['EVENT'].str.strip().str.lower()
event_details['EVENT'] = event_details['EVENT'].str.strip().str.lower()

df2 = pd.merge(df_merge, event_details, on = 'EVENT')

In [43]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [44]:
import pandas as pd
import unicodedata
from fuzzywuzzy import fuzz

def standardize_name(name):
    # Convert to lowercase
    name = name.lower()
    # Remove accents
    name = ''.join(c for c in unicodedata.normalize('NFD', name)
                   if unicodedata.category(c) != 'Mn')
    # Remove special characters
    name = ''.join(e for e in name if e.isalnum() or e.isspace())
    # Remove extra spaces
    name = ' '.join(name.split())
    return name

def find_match(name, name_list):
    std_name = standardize_name(name)
    
    # Check for exact match first
    if std_name in name_list:
        return std_name
    
    # Use fuzzy matching
    best_match = None
    best_ratio = 0
    for list_name in name_list:
        ratio = fuzz.ratio(std_name, standardize_name(list_name))
        if ratio > best_ratio and ratio > 85:  # Increased threshold for more strictness
            best_ratio = ratio
            best_match = list_name
    
    return best_match if best_match else name  # Return original name if no match found

# Standardize all names in the standard_names list
standard_names = [standardize_name(name) for name in name_list]

df_test = df2

df_test['Standardized_Name'] = df_test['FIGHTER'].apply(lambda x: find_match(x, standard_names))



In [45]:
df2['FIGHTER'] =  df_test['Standardized_Name']

In [46]:
fighters = pd.read_csv('ufc_fighter_tott.csv')
fighters['FIGHTER'] = fighters['FIGHTER'].str.lower()
fighters['FIGHTER'] = fighters['FIGHTER'].str.replace('-', '')

In [47]:
fighters

Unnamed: 0,FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,URL
0,tom aaron,--,155 lbs.,--,,"Jul 13, 1978",http://ufcstats.com/fighter-details/93fe7332d1...
1,danny abbadi,"5' 11""",155 lbs.,--,Orthodox,"Jul 03, 1983",http://ufcstats.com/fighter-details/15df64c02b...
2,david abbott,"6' 0""",265 lbs.,--,Switch,--,http://ufcstats.com/fighter-details/b361180739...
3,shamil abdurakhimov,"6' 3""",235 lbs.,"76""",Orthodox,"Sep 02, 1981",http://ufcstats.com/fighter-details/2f5cbecbbe...
4,hiroyuki abe,"5' 6""",145 lbs.,--,Orthodox,--,http://ufcstats.com/fighter-details/c0ed7b2081...
...,...,...,...,...,...,...,...
4253,mohamed ado,--,170 lbs.,--,,"May 03, 2000",http://ufcstats.com/fighter-details/7a846fcbf1...
4254,yadier delvalle,--,155 lbs.,--,,"Jul 29, 1996",http://ufcstats.com/fighter-details/70380ccdc8...
4255,jonathan micallef,--,170 lbs.,--,,"Mar 05, 1999",http://ufcstats.com/fighter-details/f782f953bf...
4256,antonio monteiro,--,145 lbs.,--,,"Jul 02, 1995",http://ufcstats.com/fighter-details/3e38b1ea16...


In [49]:
df_test = pd.merge(df2, fighters, on = 'FIGHTER',how='left')

In [50]:
df_test['FIGHTER'] = df_test['FIGHTER'].str.lower()

In [51]:
print(df_test.isnull().sum().to_string())

BOUT                            0
FIGHTER                         0
Round 1_KD                      0
Round 1_SIG.STR._Success        0
Round 1_SIG.STR._Attempt        0
Round 1_SIG.STR. %              0
Round 1_TOTAL STR._Success      0
Round 1_TOTAL STR._Attempt      0
Round 1_TD_Success              0
Round 1_TD_Attempt              0
Round 1_TD %                    0
Round 1_SUB.ATT                 0
Round 1_REV.                    0
Round 1_CTRL                    0
Round 1_HEAD_Success            0
Round 1_HEAD_Attempt            0
Round 1_BODY_Success            0
Round 1_BODY_Attempt            0
Round 1_LEG_Success             0
Round 1_LEG_Attempt             0
Round 1_DISTANCE_Success        0
Round 1_DISTANCE_Attempt        0
Round 1_CLINCH_Success          0
Round 1_CLINCH_Attempt          0
Round 1_GROUND_Success          0
Round 1_GROUND_Attempt          0
OPPONENT                        0
Round 2_KD                      0
Round 2_SIG.STR._Success        0
Round 2_SIG.ST

In [52]:
df_test['STANCE'] = df_test['STANCE'].fillna('none')

In [53]:
df_test['OPPONENT'] = df_test['OPPONENT'].str.lower()

In [54]:
# Merge the DataFrame with itself
merged_df = pd.merge(
    df_test, df_test,
    left_on=['BOUT', 'OPPONENT'],
    right_on=['BOUT', 'FIGHTER'],
    suffixes=('', '_OPPONENT')
)

In [55]:
merged_df.to_csv('oppdata.csv')

In [56]:
# Convert 'DATE' column to datetime format
merged_df['DATE'] = pd.to_datetime(merged_df['DATE'])

# Sort DataFrame by 'DATE'
merged_df = merged_df.sort_values(by='DATE')

In [57]:
import numpy as np

merged_df['gender'] = np.where(merged_df['weight_class'].str.contains('women', case=False, na=False), 0, 1)

In [58]:
# Function to determine the win/loss for the fighter
def determine_outcome(row):
    fighter = row['FIGHTER']
    bout = row['BOUT']
    outcome = row['OUTCOME']

    # Split the bout to get both fighters
    fighters = bout.split(' vs. ')
    if outcome == 'W/L':
        return 'W' if fighter == fighters[0] else 'L'
    elif outcome == 'L/W':
        return 'W' if fighter == fighters[1] else 'L'
    else:
        return 'D/ND'

# Apply the function to each row to get the win/loss for the fighter
merged_df['WIN/LOSS'] = merged_df.apply(determine_outcome, axis=1)

In [59]:

drop = ['URL_y',  'FIGHTER_OPPONENT', 'OPPONENT_OPPONENT', 'EVENT_OPPONENT' ,'OUTCOME_OPPONENT',  
        'METHOD_OPPONENT', 'ROUND_OPPONENT', 'TIME_OPPONENT',	'URL_x_OPPONENT',	'weight_class_OPPONENT',	'format_OPPONENT',	'URL_y_OPPONENT',	'DATE_OPPONENT',	'LOCATION_OPPONENT',	
       ]

merged_df = merged_df.drop(columns = drop)

In [60]:
# Convert 'DATE' and 'DOB' columns to datetime format
merged_df['DATE'] = pd.to_datetime(merged_df['DATE'])
merged_df['DOB'] = pd.to_datetime(merged_df['DOB'], errors='coerce')
merged_df['DOB_OPPONENT'] = pd.to_datetime(merged_df['DOB_OPPONENT'], errors='coerce')

# Calculate age at the time of the fight
merged_df['AGE'] = merged_df.apply(lambda row: row['DATE'].year - row['DOB'].year - ((row['DATE'].month, row['DATE'].day) < (row['DOB'].month, row['DOB'].day)), axis=1)
merged_df['AGE_OPPONENT'] = merged_df.apply(lambda row: row['DATE'].year - row['DOB_OPPONENT'].year - ((row['DATE'].month, row['DATE'].day) < (row['DOB_OPPONENT'].month, row['DOB_OPPONENT'].day)), axis=1)

  merged_df['DOB_OPPONENT'] = pd.to_datetime(merged_df['DOB_OPPONENT'], errors='coerce')


In [61]:
# List of columns to calculate per-minute stats for the fighter
totals_counting = [
    'Total_KD', 'Total_SIG.STR._Success', 'Total_TOTAL STR._Success',
    'Total_SIG.STR._Attempt', 'Total_TOTAL STR._Attempt', 'Total_TD_Success',
    'Total_TD_Attempt', 'Total_SUB.ATT', 'Total_REV.', 'Total_CTRL',
    'Total_HEAD_Success', 'Total_HEAD_Attempt', 'Total_BODY_Success',
    'Total_BODY_Attempt', 'Total_LEG_Success', 'Total_LEG_Attempt',
    'Total_DISTANCE_Success', 'Total_DISTANCE_Attempt', 'Total_CLINCH_Success',
    'Total_CLINCH_Attempt', 'Total_GROUND_Success', 'Total_GROUND_Attempt'
]

# List of columns to calculate per-minute stats for the opponent
totals_counting_opp = [
    'Total_KD_OPPONENT', 'Total_SIG.STR._Success_OPPONENT', 'Total_TOTAL STR._Success_OPPONENT',
    'Total_SIG.STR._Attempt_OPPONENT', 'Total_TOTAL STR._Attempt_OPPONENT', 'Total_TD_Success_OPPONENT',
    'Total_TD_Attempt_OPPONENT', 'Total_SUB.ATT_OPPONENT', 'Total_REV._OPPONENT', 'Total_CTRL_OPPONENT',
    'Total_HEAD_Success_OPPONENT', 'Total_HEAD_Attempt_OPPONENT', 'Total_BODY_Success_OPPONENT',
    'Total_BODY_Attempt_OPPONENT', 'Total_LEG_Success_OPPONENT', 'Total_LEG_Attempt_OPPONENT',
    'Total_DISTANCE_Success_OPPONENT', 'Total_DISTANCE_Attempt_OPPONENT', 'Total_CLINCH_Success_OPPONENT',
    'Total_CLINCH_Attempt_OPPONENT', 'Total_GROUND_Success_OPPONENT', 'Total_GROUND_Attempt_OPPONENT'
]


for column in totals_counting:
    merged_df[column + '_per_minute'] = merged_df[column] / ((merged_df['TIME'] + (300* merged_df['ROUND'])) / 60)

for column in totals_counting_opp:
    merged_df[column + '_per_minute'] = merged_df[column] / ((merged_df['TIME'] + (300* merged_df['ROUND'])) / 60)

In [62]:
from datetime import datetime, timedelta

# Get current year
current_year = datetime.now().year

# Calculate 15 years ago
fifteen_years_ago = current_year - 15

# Filter DataFrame for data from the last 15 years
df_last_15_years = merged_df[merged_df['DATE'].dt.year >= fifteen_years_ago]

In [63]:
merged_df.to_csv('test.csv')

In [64]:
# Filter DataFrame for men's fights in the last 15 years
df_mens_fights_last_15_years = df_last_15_years[df_last_15_years['gender'] == 1]
df_womens_fights_last_15_years = df_last_15_years[df_last_15_years['gender'] == 0]

In [74]:
df_mens_fights_last_15_years.to_csv('last_15.csv')

In [82]:
# List of all unique weight classes in the dataset
weight_classes = df_mens_fights_last_15_years['weight_class'].unique()

# Create a dictionary to store filtered DataFrames for each weight class
weight_class_datasets = {}

# Iterate over each weight class
for weight_class in weight_classes:
    # Filter the dataset for the current weight class
    filtered_df = df_mens_fights_last_15_years[df_mens_fights_last_15_years['weight_class'] == weight_class]
    
    # Store the filtered DataFrame in the dictionary with the weight class as key
    weight_class_datasets[weight_class] = filtered_df

In [86]:
# List of all unique weight classes in the dataset
womens_weight_classes = df_womens_fights_last_15_years['weight_class'].unique()

# Create a dictionary to store filtered DataFrames for each weight class
weight_class_datasets = {}

# Iterate over each weight class
for weight_class in weight_classes:
    # Filter the dataset for the current weight class
    filtered_df = df_mens_fights_last_15_years[df_mens_fights_last_15_years['weight_class'] == weight_class]
    
    # Store the filtered DataFrame in the dictionary with the weight class as key
    weight_class_datasets[weight_class] = filtered_df

In [87]:
import pandas as pd

def separate_recent_and_previous_fights(df, weight_classes):
    recent_fights_datasets = {}
    previous_fights_datasets = {}
    recent_5_fights_datasets = {}
    
    for weight_class in weight_classes:
        filtered_df = df[df['weight_class'] == weight_class]
        filtered_df = filtered_df.drop_duplicates(subset=['BOUT', 'FIGHTER', 'DATE'])
        
        fight_counts = filtered_df['FIGHTER'].value_counts()
        eligible_fighters = fight_counts[fight_counts >= 4].index
        
        filtered_df = filtered_df[filtered_df['FIGHTER'].isin(eligible_fighters)]
        filtered_df = filtered_df.sort_values(by=['FIGHTER', 'DATE'])
        
        recent_fights = []
        previous_fights = []
        recent_5_fights = []
        
        grouped = filtered_df.groupby('FIGHTER')
        for fighter, group in grouped:
            num_fights = len(group)
            if num_fights >= 4:
                recent_fights.append(group.iloc[-1])
                
                if num_fights >= 6:
                    previous_fights.append(group.iloc[-6:-1])
                elif num_fights == 5:
                    previous_fights.append(group.iloc[-5:-1])
                else:  # num_fights == 4
                    previous_fights.append(group.iloc[-4:-1])
                
                if num_fights >= 5:
                    recent_5_fights.append(group.iloc[-5:])
                elif num_fights == 4:
                    recent_5_fights.append(group.iloc[-4:])
                else:  # num_fights == 3
                    recent_5_fights.append(group.iloc[-3:])
        
        if recent_fights:
            recent_fights_df = pd.DataFrame(recent_fights)
        else:
            recent_fights_df = pd.DataFrame()
        
        if previous_fights:
            previous_fights_df = pd.concat(previous_fights)
        else:
            previous_fights_df = pd.DataFrame()
        
        if recent_5_fights:
            recent_5_fights_df = pd.concat(recent_5_fights)
        else:
            recent_5_fights_df = pd.DataFrame()
        
        recent_fights_datasets[weight_class] = recent_fights_df
        previous_fights_datasets[weight_class] = previous_fights_df
        recent_5_fights_datasets[weight_class] = recent_5_fights_df
    
    return recent_fights_datasets, previous_fights_datasets, recent_5_fights_datasets

# Example usage
weight_classes = ['Welterweight', 'Light Heavyweight', 'Middleweight', 'Lightweight',
                  'Heavyweight', 'Catch Weight', 'Featherweight', 'Bantamweight',
                  'Flyweight', 'Other']
recent_fights_datasets, previous_fights_datasets, recent_5_fights_datasets = separate_recent_and_previous_fights(df_mens_fights_last_15_years, weight_classes)

# Access the datasets for a specific weight class
welterweight_recent_fights = recent_fights_datasets['Welterweight']
welterweight_previous_fights = previous_fights_datasets['Welterweight']
welterweight_recent_5_fights = recent_5_fights_datasets['Welterweight']


In [90]:
import pandas as pd

def separate_recent_and_previous_fights_combined(df, weight_classes):
    combined_recent_fights = []
    combined_previous_fights = []
    combined_recent_3_fights = []  # Up to 3 most recent fights
    combined_second_recent_fights = []
    combined_prior_fights = []
    
    for weight_class in weight_classes:
        filtered_df = df[df['weight_class'] == weight_class]
        filtered_df = filtered_df.drop_duplicates(subset=['BOUT', 'FIGHTER', 'DATE'])
        
        fight_counts = filtered_df['FIGHTER'].value_counts()
        eligible_fighters = fight_counts[fight_counts >= 1].index  # Include fighters with at least 1 fight
        
        filtered_df = filtered_df[filtered_df['FIGHTER'].isin(eligible_fighters)]
        filtered_df = filtered_df.sort_values(by=['FIGHTER', 'DATE'])
        
        recent_fights = []
        previous_fights = []
        recent_3_fights = []
        second_recent_fights = []
        prior_fights = []
        
        grouped = filtered_df.groupby('FIGHTER')
        for fighter, group in grouped:
            num_fights = len(group)
            
            recent_fights.append(group.iloc[-1])
            
            if num_fights >= 2:
                second_recent_fights.append(group.iloc[-2])
            
            recent_3_fights.append(group.iloc[-3:])  # Include up to 3 most recent fights
            
            if num_fights >= 6:
                previous_fights.append(group.iloc[-6:-1])
                prior_fights.append(group.iloc[-7:-2])
            elif num_fights == 5:
                previous_fights.append(group.iloc[-5:-1])
                prior_fights.append(group.iloc[-5:-2])
            elif num_fights == 4:
                previous_fights.append(group.iloc[-4:-1])
                prior_fights.append(group.iloc[-4:-2])
        
        combined_recent_fights.extend(recent_fights)
        combined_previous_fights.extend(previous_fights)
        combined_recent_3_fights.extend(recent_3_fights)
        combined_second_recent_fights.extend(second_recent_fights)
        combined_prior_fights.extend(prior_fights)
    
    combined_recent_fights_df = pd.DataFrame(combined_recent_fights)
    combined_previous_fights_df = pd.concat(combined_previous_fights) if combined_previous_fights else pd.DataFrame()
    combined_recent_3_fights_df = pd.concat(combined_recent_3_fights) if combined_recent_3_fights else pd.DataFrame()
    combined_second_recent_fights_df = pd.DataFrame(combined_second_recent_fights)
    combined_prior_fights_df = pd.concat(combined_prior_fights) if combined_prior_fights else pd.DataFrame()
    
    return (combined_recent_fights_df, combined_previous_fights_df, combined_recent_3_fights_df, 
            combined_second_recent_fights_df, combined_prior_fights_df)

# Example usage
weight_classes = ['Welterweight', 'Light Heavyweight', 'Middleweight', 'Lightweight',
                  'Heavyweight', 'Catch Weight', 'Featherweight', 'Bantamweight',
                  'Flyweight', 'Other']


In [94]:
(combined_recent_fights_df, combined_previous_fights_df, combined_recent_3_fights_df,
 combined_second_recent_fights_df, combined_prior_fights_df) = separate_recent_and_previous_fights_combined(df_mens_fights_last_15_years, weight_classes)

In [509]:
(womens_combined_recent_fights_df, womens_combined_previous_fights_df, womens_combined_recent_3_fights_df,
 womens_combined_second_recent_fights_df, womens_combined_prior_fights_df) = separate_recent_and_previous_fights_combined(df_womens_fights_last_15_years, womens_weight_classes)

Standup offense = (KD/minute)*5 + (sig_str_suc * sig_str_acc) + ((tot_str_suc - sig_str_suc) * ((tot_str_suc - sig_str_suc)/(tot_str_att - sig_str_att))

ADD SECOND MOST RECENT FIGHTS

In [95]:
import pandas as pd

def flatten_event_data(event_data):
    flattened_data = []

    for fight in event_data['fights']:
        fight_data = {
            'event_name': event_data['name'],
            'event_date': event_data['date'],
            'event_location': event_data['location'],
            'event_venue': event_data['venue'],
            'weightclass': fight['weightclass'],
            'red_corner_name': fight['red corner']['name'],
            'red_corner_ranking': fight['red corner']['ranking'],
            'red_corner_odds': fight['red corner']['odds'],
            'red_corner_link': fight['red corner']['link'],
            'red_corner_result': fight['red corner']['result'],
            'blue_corner_name': fight['blue corner']['name'],
            'blue_corner_ranking': fight['blue corner']['ranking'],
            'blue_corner_odds': fight['blue corner']['odds'],
            'blue_corner_link': fight['blue corner']['link'],
            'blue_corner_result': fight['blue corner']['result'],
            'round': fight.get('round', None),  # Use .get() to handle potential missing keys
            'time': fight.get('time', None),
            'method': fight.get('method', None)
        }
        flattened_data.append(fight_data)

    return pd.DataFrame(flattened_data)

def get_unique_events(dataset):
    unique_events = dataset['EVENT'].unique()
    all_event_data = pd.DataFrame()
    successful_events = []

    for event_name in unique_events:
        try:
            event_data = get_event(event_name)
            flattened_df = flatten_event_data(event_data)
            all_event_data = pd.concat([all_event_data, flattened_df], ignore_index=True)
            successful_events.append(event_name)
            print(f"Event '{event_name}' processed successfully.")
        except BaseException as e:
            # Log the error or print it for debugging purposes
            print(f"Error processing event '{event_name}': {str(e)}")
            continue  # Skip to the next event if there's an error

    return all_event_data, successful_events

In [1566]:
# Example usage assuming 'combined_recent_fights_df' is your actual dataset DataFrame
all_events_data, successful_events = get_unique_events(combined_recent_fights_df)

Event 'ufc on fx: browne vs bigfoot' processed successfully.
Event 'ufc fight night: felder vs. dos anjos' processed successfully.
Event 'ufc fight night: kara-france vs. albazi' processed successfully.
Event 'ufc fight night: perez vs. taira' processed successfully.
Event 'ufc 166: velasquez vs dos santos 3' processed successfully.
Event 'ufc fight night: fiziev vs. gamrot' processed successfully.
Event 'ufc 94: st-pierre vs penn 2' processed successfully.
Event 'ufc 255: figueiredo vs. perez' processed successfully.
Error processing event 'ufc fight night: kim vs hathaway': list index out of range
Event 'ufc 204: bisping vs. henderson' processed successfully.
Event 'ufc 224: nunes vs. pennington' processed successfully.
Event 'ufc fight night: lee vs. oliveira' processed successfully.
Error processing event 'ufc fight night: oezdemir vs. smith': UFC link not found !
Event 'ufc 302: makhachev vs. poirier' processed successfully.
Event 'ufc 272: covington vs. masvidal' processed succes

In [1572]:
all_events_data2, successful_events2 = get_unique_events(combined_second_recent_fights_df)

Event 'ufc on fuel tv: munoz vs weidman' processed successfully.
Event 'ufc fight night: kattar vs. ige' processed successfully.
Event 'ufc 280: oliveira vs. makhachev' processed successfully.
Event 'ufc 289: nunes vs. aldana' processed successfully.
Error processing event 'ufc on fuel tv: mousasi vs latifi': list index out of range
Event 'ufc fight night: andrade vs. blanchfield' processed successfully.
Error processing event 'ufc 236: holloway vs. poirier 2': list index out of range
Event 'ufc fight night: overeem vs arlovski' processed successfully.
Event 'ufc fight night: dos anjos vs. alvarez' processed successfully.
Event 'ufc fight night: shevchenko vs. carmouche 2' processed successfully.
Event 'ufc fight night: barboza vs. lee' processed successfully.
Error processing event 'ufc fight night: allen vs. curtis 2': list index out of range
Event 'ufc fight night: santos vs. walker' processed successfully.
Event 'ufc on fox: holm vs. shevchenko' processed successfully.
Error proces

In [1573]:
all_events_data = all_events_data.apply(lambda x: x.astype(str).str.lower() if x.dtype == 'object' else x)
all_events_data2 = all_events_data2.apply(lambda x: x.astype(str).str.lower() if x.dtype == 'object' else x)

## Read in past odds data

In [96]:
all_events_data = pd.read_csv('events_odds.csv')
all_events_data2 = pd.read_csv('events_odds2.csv')

In [97]:
all_events_data.to_csv('events_odds.csv')

In [99]:
all_events_data2.to_csv('events_odds2.csv')

In [98]:
import pandas as pd
import numpy as np

def add_odds_data(fight_data, all_events_data):
    fight_data = fight_data.apply(lambda x: x.astype(str).str.lower() if x.dtype == "object" else x)
    all_events_data = all_events_data.apply(lambda x: x.astype(str).str.lower() if x.dtype == "object" else x)
    matched_rows = []
    
    for index, row in fight_data.iterrows():
        fighter = row['FIGHTER']
        opponent = row['OPPONENT']
        
        # Search for the match in all_events_data
        match = all_events_data[
            ((all_events_data['red_corner_name'] == fighter) & (all_events_data['blue_corner_name'] == opponent)) |
            ((all_events_data['blue_corner_name'] == fighter) & (all_events_data['red_corner_name'] == opponent))
        ]
        
        if not match.empty:
            # Determine which odds correspond to FIGHTER and OPPONENT
            if match['red_corner_name'].values[0] == fighter:  # FIGHTER in red corner
                red_corner_odds = match['red_corner_odds'].values[0]
                blue_corner_odds = match['blue_corner_odds'].values[0]
            else:  # FIGHTER in blue corner
                red_corner_odds = match['blue_corner_odds'].values[0]
                blue_corner_odds = match['red_corner_odds'].values[0]
            
            # Remove row if both odds are '-' or '--'
            if red_corner_odds in ['-', '--'] and blue_corner_odds in ['-', '--']:
                continue
            
            # Handle non-standard minus sign (−) and '--' in odds
            def clean_odds(odds):
                if odds.startswith('−'):
                    return '-' + odds[1:]
                elif odds == '--':
                    return '-'
                return odds
            
            red_corner_odds = clean_odds(red_corner_odds)
            blue_corner_odds = clean_odds(blue_corner_odds)
            
            # Calculate implied probability for FIGHTER
            fighter_win_percent = None
            if red_corner_odds not in ['-', '--']:
                fighter_odds = float(red_corner_odds)
                if fighter_odds > 0:
                    fighter_win_percent = 100 / (fighter_odds + 100) * 100
                else:
                    fighter_win_percent = -fighter_odds / (-fighter_odds + 100) * 100
            elif blue_corner_odds not in ['-', '--']:
                opponent_odds = float(blue_corner_odds)
                if opponent_odds > 0:
                    fighter_win_percent = 100 - (100 / (opponent_odds + 100) * 100)
                else:
                    fighter_win_percent = 100 - (-opponent_odds / (-opponent_odds + 100) * 100)
            
            matched_rows.append({
                'FIGHTER': fighter,
                'OPPONENT': opponent,
                'FIGHTER_ODDS': red_corner_odds,
                'OPPONENT_ODDS': blue_corner_odds,
                'FIGHTER_WIN_PERCENT': fighter_win_percent
            })
    
    updated_fight_data = pd.DataFrame(matched_rows)
    
    # Merge with the original fight_data
    updated_fight_data = pd.merge(fight_data, updated_fight_data, on=['FIGHTER', 'OPPONENT'], how='inner')
    
    return updated_fight_data


In [100]:
# Assuming you have fight_data and all_events_data DataFrames ready
recent_fights1 = add_odds_data(combined_recent_fights_df, all_events_data)
recent_fights2 = add_odds_data(combined_second_recent_fights_df, all_events_data2)

recent_fights = pd.concat([recent_fights1, recent_fights2], ignore_index=True)

In [101]:
recent_fights1.to_csv('test.csv')

In [102]:
import pandas as pd
import numpy as np

def process_division_data(division_previous_fights):
    # Extract division name from the input variable name
    division_name = [name for name in globals() if globals()[name] is division_previous_fights][0].split('_')[0]

    def count_ko_wins(group):
        return ((group['METHOD'] == 'KO') & (group['WIN/LOSS'] == 'W')).sum()

    def count_ko_loss(group):
        return ((group['METHOD'] == 'KO') & (group['WIN/LOSS'] == 'L')).sum()

    def count_sub_wins(group):
        return ((group['METHOD'] == 'sub') & (group['WIN/LOSS'] == 'W')).sum()

    def count_sub_loss(group):
        return ((group['METHOD'] == 'sub') & (group['WIN/LOSS'] == 'L')).sum()

    fighter_stats = division_previous_fights.groupby('FIGHTER').agg({
        'weight_class': 'first',
        'Total_KD_per_minute': 'mean',
        'Total_SIG.STR._Success_per_minute': 'mean',
        'Total_SIG.STR. %': 'mean',
        'Total_TOTAL STR._Success_per_minute': 'mean',
        'Total_TOTAL STR._Attempt_per_minute': 'mean',
        'Total_SIG.STR._Attempt_per_minute': 'mean',
        'Total_DISTANCE_Attempt_per_minute': 'mean',
        'Total_DISTANCE_Success_per_minute': 'mean',
        'Total_HEAD_Success_per_minute': 'mean',
        'Total_HEAD_Attempt_per_minute': 'mean',
        'Total_GROUND_Success_per_minute': 'mean',
        'Total_GROUND_Attempt_per_minute': 'mean',
        'Total_KD_OPPONENT_per_minute': 'mean',
        'Total_SIG.STR._Success_OPPONENT_per_minute': 'mean',
        'Total_SIG.STR. %_OPPONENT': 'mean',
        'Total_TOTAL STR._Success_OPPONENT_per_minute': 'mean',
        'Total_TOTAL STR._Attempt_OPPONENT_per_minute': 'mean',
        'Total_SIG.STR._Attempt_OPPONENT_per_minute': 'mean',
        'Total_DISTANCE_Attempt_OPPONENT_per_minute': 'mean',
        'Total_DISTANCE_Success_OPPONENT_per_minute': 'mean',
        'Total_HEAD_Success_OPPONENT_per_minute': 'mean',
        'Total_HEAD_Attempt_OPPONENT_per_minute': 'mean',
        'Total_GROUND_Success_OPPONENT_per_minute': 'mean',
        'Total_GROUND_Attempt_OPPONENT_per_minute': 'mean',
        'Total_TD_Success_per_minute': 'mean',
        'Total_TD_Attempt_per_minute': 'mean',
        'Total_SUB.ATT_per_minute': 'mean',
        'Total_REV._per_minute': 'mean',
        'Total_CTRL_per_minute': 'mean',
        'Total_BODY_Success_per_minute': 'mean',
        'Total_BODY_Attempt_per_minute': 'mean',
        'Total_LEG_Success_per_minute': 'mean',
        'Total_LEG_Attempt_per_minute': 'mean',
        'Total_CLINCH_Success_per_minute': 'mean',
        'Total_CLINCH_Attempt_per_minute': 'mean',
        'Total_TD_Success_OPPONENT_per_minute': 'mean',
        'Total_TD_Attempt_OPPONENT_per_minute': 'mean',
        'Total_SUB.ATT_OPPONENT_per_minute': 'mean',
        'Total_REV._OPPONENT_per_minute': 'mean',
        'Total_CTRL_OPPONENT_per_minute': 'mean',
        'Total_BODY_Success_OPPONENT_per_minute': 'mean',
        'Total_BODY_Attempt_OPPONENT_per_minute': 'mean',
        'Total_LEG_Success_OPPONENT_per_minute': 'mean',
        'Total_LEG_Attempt_OPPONENT_per_minute': 'mean',
        'Total_CLINCH_Success_OPPONENT_per_minute': 'mean',
        'Total_CLINCH_Attempt_OPPONENT_per_minute': 'mean'
    }).reset_index()

    ko_wins = division_previous_fights.groupby('FIGHTER').apply(count_ko_wins).reset_index(name='KO_win_count')
    ko_loss = division_previous_fights.groupby('FIGHTER').apply(count_ko_loss).reset_index(name='KO_loss_count')
    sub_wins = division_previous_fights.groupby('FIGHTER').apply(count_sub_wins).reset_index(name='sub_win_count')
    sub_loss = division_previous_fights.groupby('FIGHTER').apply(count_sub_loss).reset_index(name='sub_loss_count')

    fighter_stats = fighter_stats.merge(ko_wins, on='FIGHTER')
    fighter_stats = fighter_stats.merge(ko_loss, on='FIGHTER')
    fighter_stats = fighter_stats.merge(sub_wins, on='FIGHTER')
    fighter_stats = fighter_stats.merge(sub_loss, on='FIGHTER')

    fighter_stats['Power'] = fighter_stats['Total_KD_per_minute'] / fighter_stats['Total_TOTAL STR._Success_per_minute']

    fighter_stats['Standup_offense'] = (
        (fighter_stats['Total_DISTANCE_Success_per_minute'] / 
         fighter_stats['Total_DISTANCE_Attempt_per_minute']) * 100 +
        fighter_stats['Total_DISTANCE_Success_per_minute'] * 2 +
        fighter_stats['Total_HEAD_Success_per_minute'] * 1.5 +
        fighter_stats['Total_KD_per_minute'] * 6 +
        fighter_stats['Total_SIG.STR._Success_per_minute'] * 0.5 +
        (fighter_stats['Total_BODY_Success_per_minute'] + 
         fighter_stats['Total_LEG_Success_per_minute']) * 0.75
    )

    fighter_stats['Standup_defense'] = (
        (fighter_stats['Total_KD_OPPONENT_per_minute'] * -100) +
        (fighter_stats['Total_SIG.STR._Success_OPPONENT_per_minute'] * fighter_stats['Total_SIG.STR. %_OPPONENT']) * 2 +
         (fighter_stats['KO_loss_count'] * -10) - (fighter_stats['Total_DISTANCE_Success_OPPONENT_per_minute'] * (fighter_stats['Total_DISTANCE_Success_OPPONENT_per_minute'] / fighter_stats['Total_DISTANCE_Attempt_OPPONENT_per_minute']) * 20) +
        (fighter_stats['Total_HEAD_Success_OPPONENT_per_minute'] * (fighter_stats['Total_HEAD_Success_OPPONENT_per_minute'] / fighter_stats['Total_HEAD_Attempt_OPPONENT_per_minute'])) * 4
    )

    fighter_stats['Ground_offense'] = (
        (fighter_stats['Total_TD_Success_per_minute'] * 100) + (fighter_stats['sub_win_count'] * 15) + (fighter_stats['Total_SUB.ATT_per_minute']) * 40 + (fighter_stats['Total_CTRL_per_minute'] * 3))

    fighter_stats['Ground_defense'] = (
        ((fighter_stats['Total_TD_Success_OPPONENT_per_minute'] / fighter_stats['Total_TD_Attempt_OPPONENT_per_minute']) * -100) - (fighter_stats['sub_loss_count'] * 15) - (fighter_stats['Total_SUB.ATT_OPPONENT_per_minute']  * 20) - (fighter_stats['Total_GROUND_Success_OPPONENT_per_minute'] * 50))

    fighter_stats['Knockdown_Resistance_Ratio'] = (
    fighter_stats['Total_KD_OPPONENT_per_minute'] /
     (fighter_stats['Total_HEAD_Success_OPPONENT_per_minute'] +
      fighter_stats['Total_HEAD_Success_OPPONENT_per_minute']
    )* -1
    )

    fighter_stats['Activity_Score'] = (
    fighter_stats['Total_SIG.STR._Attempt_per_minute'] +
    fighter_stats['Total_TOTAL STR._Attempt_per_minute'] +
    fighter_stats['Total_TD_Attempt_per_minute'] +
    fighter_stats['Total_SUB.ATT_per_minute'] +
    fighter_stats['Total_KD_per_minute']
    )

    fighter_stats['Round_Winning_Activities_Metric'] = (10 * fighter_stats['Total_SIG.STR._Attempt_per_minute'] +
                                                        30 * fighter_stats['Total_TD_Success_per_minute'] +
                                                        1 * fighter_stats['Total_CTRL_per_minute'] +
                                                        30 * fighter_stats['Total_KD_per_minute'])
    
    
    weight_classes = ['Welterweight', 'Light Heavyweight', 'Middleweight', 'Lightweight',
                  'Heavyweight', 'Catch Weight', 'Featherweight', 'Bantamweight',
                  'Flyweight', 'Other']
    
    # Standardize scores to 0-100 scale within each division
    for weight_class in weight_classes:
        # Filter the dataframe for the current weight class
        weight_class_df = fighter_stats[fighter_stats['weight_class'] == weight_class]
    
        for col in ['Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense', 'Knockdown_Resistance_Ratio', 'Activity_Score', 'Power', 'Round_Winning_Activities_Metric']:
            min_val = weight_class_df[col].min()
            max_val = weight_class_df[col].max()
        
            # Standardize the scores for the current weight class
            fighter_stats.loc[fighter_stats['weight_class'] == weight_class, col] = (
                (weight_class_df[col] - min_val) / (max_val - min_val) * 100
            )
    

    division_elo = fighter_stats[['FIGHTER', 'Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense', 'Knockdown_Resistance_Ratio', 'Activity_Score', 'Power', 'Round_Winning_Activities_Metric']]

    def mean_top_n(row, n):
        sorted_values = sorted(row, reverse=True)
        return np.mean(sorted_values[:n])
    
    division_elo['mean_top_3'] = division_elo[['Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense']].apply(lambda row: mean_top_n(row, 3), axis=1)
    division_elo['mean_top_2'] = division_elo[['Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense']].apply(lambda row: mean_top_n(row, 2), axis=1)

    division_elo['overall'] = (
        division_elo['Standup_offense'].fillna(0) * 2 +
        division_elo['Standup_defense'].fillna(0) +
        division_elo['Ground_offense'].fillna(0) * 0.6 +
        division_elo['Ground_defense'].fillna(0) * 0.7
    )

    division_elo = division_elo.sort_values('Standup_defense', ascending=False)

    # Create variable names with the actual division name
    globals()[f"{division_name}_elo"] = division_elo
    globals()[f"{division_name}_fighter_stats"] = fighter_stats

    return globals()[f"{division_name}_elo"], globals()[f"{division_name}_fighter_stats"]

In [103]:
combined_recent_3_fights_df.to_csv('recent3.csv')

In [104]:
# Example usage:
all_train_elo, all_train_fighter_stats = process_division_data(combined_previous_fights_df)
all_train_elo2, all_train_fighter_stats2 = process_division_data(combined_prior_fights_df)
all_elo, all_stats = process_division_data(combined_recent_3_fights_df)

all_train_elo = all_train_elo.fillna(1)
all_train_elo2 = all_train_elo.fillna(0)
all_elo = all_elo.fillna(0)

all_elo = all_elo.sort_values('Standup_offense', ascending=False)

all_elo.to_csv('all_elo.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  division_elo['mean_top_3'] = division_elo[['Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense']].apply(lambda row: mean_top_n(row, 3), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  division_elo['mean_top_2'] = division_elo[['Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense']].apply(lambda row: mean_top_n(row, 2), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

In [105]:
import pandas as pd
import numpy as np
import re

# Define the mapping of weight classes to numeric values
weight_class_mapping = {
    'Welterweight': 170,
    'Light Heavyweight': 205,
    'Middleweight': 185,
    'Lightweight': 155,
    'Heavyweight': 265,
    'Catch Weight': 0,  # You may need to adjust this based on how you handle catch weight fights
    'Featherweight': 145,
    'Bantamweight': 135,
    'Flyweight': 125,
    'Other': -1  # Or any other appropriate value for unspecified weight classes
}

def height_to(height):
    if pd.isnull(height) or height == '--':
        return np.nan
    if isinstance(height, (int, float)):
        return height  # Assuming the height is already in inches if it's a number
    if isinstance(height, str):
        match = re.match(r"(\d+)' (\d+)\"", height)
        if match:
            feet = int(match.group(1))
            inches = int(match.group(2))
            return feet * 12 + inches
    return np.nan

def clean_reach(reach):
    if pd.isnull(reach) or reach == '--':
        return np.nan
    elif isinstance(reach, str):
        return float(reach.replace('"', ''))
    else:
        return np.nan

def create_final_dataset(division_elo, division_recent_fights):
    # Extract division name from the input variable name
    division_name = [name for name in globals() if globals()[name] is division_elo][0].split('_')[0]
    
    # Ensure HEIGHT is processed correctly
    division_recent_fights['HEIGHT'] = division_recent_fights['HEIGHT'].apply(height_to)
    
    # Clean and convert REACH to float, replacing non-numeric characters and missing values
    division_recent_fights['REACH'] = division_recent_fights['REACH'].apply(clean_reach)
    
    # Remove rows with missing HEIGHT or REACH
    division_recent_fights = division_recent_fights.dropna(subset=['HEIGHT', 'REACH'])
    
    # Convert weight class to numeric using the mapping
    division_recent_fights['WEIGHT_CLASS_NUMERIC'] = division_recent_fights['weight_class'].map(weight_class_mapping)
    
    # Merge ELO data with recent fights data
    temp = division_elo.merge(
        division_recent_fights[['FIGHTER', 'OPPONENT', 'WIN/LOSS', 'AGE', 'HEIGHT', 'REACH', 'WEIGHT_CLASS_NUMERIC']],
        on='FIGHTER',
        how='left'
    )
    
    # Merge opponent data
    final_data = temp.merge(
        division_elo,
        left_on='OPPONENT',
        right_on='FIGHTER',
        how='left',
        suffixes=('', '_OPPONENT')
    )
    
    # Merge opponent's physical data
    final_data = final_data.merge(
        division_recent_fights[['FIGHTER', 'AGE', 'HEIGHT', 'REACH']],
        left_on='OPPONENT',
        right_on='FIGHTER',
        how='left',
        suffixes=('', '_OPPONENT')
    )
    
    # Drop the extra FIGHTER column
    final_data = final_data.drop(columns=['FIGHTER_OPPONENT'])
    
    # Convert WIN/LOSS to binary
    final_data['WIN/LOSS'] = np.where(final_data['WIN/LOSS'] == 'W', 1,
                                      np.where(final_data['WIN/LOSS'] == 'L', 0, 0))
    
    # Calculate differences for numeric columns
    diff_columns = ['Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense', 'mean_top_2', 'mean_top_3', 'overall', 'Knockdown_Resistance_Ratio', 'Activity_Score', 'Power']
    for col in diff_columns:
        opponent_col = f'{col}_OPPONENT'
        diff_col = f'{col}_DIFF'
        final_data[diff_col] = final_data[col] - final_data[opponent_col]
        
        # Drop the original and opponent columns
        final_data = final_data.drop(columns=[col, opponent_col])
    
    # Add age, height, and reach differences
    #final_data['AGE_DIFF'] = final_data['AGE'] - final_data['AGE_OPPONENT']
    #final_data['HEIGHT_DIFF'] = final_data['HEIGHT'] - final_data['HEIGHT_OPPONENT']
    #final_data['REACH_DIFF'] = final_data['REACH'] - final_data['REACH_OPPONENT']
    
    # Drop the original age, height, and reach columns and specified extra columns
    final_data = final_data.drop(columns=['AGE', 'AGE_OPPONENT', 'HEIGHT', 'HEIGHT_OPPONENT', 'REACH', 'REACH_OPPONENT'])
    
    # Balance the dataset
    wins = final_data[final_data['WIN/LOSS'] == 1]
    losses = final_data[final_data['WIN/LOSS'] == 0]
    
    min_count = min(len(wins), len(losses))
    wins = wins.sample(n=min_count, random_state=42)
    losses = losses.sample(n=min_count, random_state=42)
    
    balanced_data = pd.concat([wins, losses])
    
    # Create mirror matches for the losses
    mirror_matches = losses.copy()
    mirror_matches['WIN/LOSS'] = 1
    mirror_matches['FIGHTER'], mirror_matches['OPPONENT'] = mirror_matches['OPPONENT'], mirror_matches['FIGHTER']
    
    # Multiply the difference columns by -1 for mirror matches
    for col in [col for col in balanced_data.columns if col.endswith('_DIFF')]:
        mirror_matches[col] *= -1
    
    # Combine original balanced data with mirror matches
    final_balanced_data = pd.concat([balanced_data, mirror_matches])
    
    # Shuffle the final dataset
    final_balanced_data = final_balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Create the output variable name
    output_name = f"{division_name}_final_data"
    globals()[output_name] = final_balanced_data
    
    return final_balanced_data


In [106]:
all_final_data1 = create_final_dataset(all_train_elo, combined_recent_fights_df)
all_final_data2 = create_final_dataset(all_train_elo2, combined_second_recent_fights_df)

all_final_data = pd.concat([all_final_data1, all_final_data2], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  division_recent_fights['WEIGHT_CLASS_NUMERIC'] = division_recent_fights['weight_class'].map(weight_class_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  division_recent_fights['WEIGHT_CLASS_NUMERIC'] = division_recent_fights['weight_class'].map(weight_class_mapping)


In [107]:
all_final_data

Unnamed: 0,FIGHTER,Round_Winning_Activities_Metric,OPPONENT,WIN/LOSS,WEIGHT_CLASS_NUMERIC,Round_Winning_Activities_Metric_OPPONENT,Standup_offense_DIFF,Standup_defense_DIFF,Ground_offense_DIFF,Ground_defense_DIFF,mean_top_2_DIFF,mean_top_3_DIFF,overall_DIFF,Knockdown_Resistance_Ratio_DIFF,Activity_Score_DIFF,Power_DIFF
0,mirsad bektic,39.773332,damon jackson,0,145.0,45.940742,-37.800729,-7.622509,-9.273590,-4.679193,-6.150851,-7.191764,-92.063555,-10.061137,-4.000976,11.683348
1,jack jenkins,13.099583,herbert burns,1,145.0,39.504588,75.761350,21.913625,-7.922705,49.060567,38.501105,40.585775,203.025099,23.776296,28.545472,-30.642810
2,alberto mina,76.466824,yoshihiro akiyama,1,170.0,48.269483,26.670874,25.697196,-13.297758,-33.546889,1.736901,6.273727,47.577466,7.824969,-23.153137,37.796470
3,alex pereira,40.151399,jiri prochazka,1,205.0,72.602750,10.073345,34.391285,-28.058181,17.465833,13.769589,20.643488,49.929150,5.277414,-21.185029,7.214640
4,kyoji horiguchi,0.000000,dustin pague,1,135.0,57.800453,9.585029,13.448988,-20.549005,19.578878,14.546697,13.119225,33.994859,22.938815,52.210047,7.042039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3421,hector lombard,36.277297,joshua burkman,0,170.0,30.532246,5.757781,-14.331239,11.612008,37.853353,10.311175,9.759965,30.648875,-2.963692,5.709433,21.195950
3422,nick diaz,59.088476,georges stpierre,1,170.0,,,,,,,,,,,
3423,ryan hall,16.907676,ilia topuria,0,145.0,45.919682,-11.181628,-1.915510,-40.320355,-93.189328,-26.260378,-35.575285,-114.403510,0.000000,-19.831027,4.136248
3424,phil davis,52.053184,vinny magalhaes,1,205.0,74.122396,8.146540,7.103213,14.684431,9.384192,8.243702,10.390612,38.775886,-0.000000,20.052250,-0.000000


In [108]:
all_final_data_og = add_odds_data(all_final_data, all_events_data)

In [109]:
all_final_data_og = all_final_data_og.drop_duplicates(subset=['FIGHTER', 'OPPONENT', 'WIN/LOSS'])

#all_final_data_og = all_final_data_og.drop(columns = ['Round_Winning_Activities_Metric_DIFF', 'Power_DIFF', 'Activity_Score_DIFF', 'FIGHTER_ODDS', 'OPPONENT_ODDS', 'AGE_DIFF', 'HEIGHT_DIFF', 'REACH_DIFF'])

all_final_data_og = all_final_data_og.drop(columns = ['Power_DIFF', 'Activity_Score_DIFF', 'FIGHTER_ODDS', 'OPPONENT_ODDS', 'Round_Winning_Activities_Metric', 'Round_Winning_Activities_Metric_OPPONENT' ])

all_final_data_og = all_final_data_og.dropna()

In [110]:
# Define columns to be negated if the favored fighter is in the 'opponent' column
diff_columns = [
    'Standup_offense_DIFF', 'Standup_defense_DIFF', 'Ground_offense_DIFF', 'Ground_defense_DIFF',
    'mean_top_2_DIFF', 'mean_top_3_DIFF', 'overall_DIFF', 'Knockdown_Resistance_Ratio_DIFF'
]

def rearrange_favored_fighter(row):
    if row['FIGHTER_WIN_PERCENT'] < 50:
        # Create a new Series with swapped and modified values
        new_row = row.copy()
        
        # Swap fighters
        new_row['FIGHTER'], new_row['OPPONENT'] = row['OPPONENT'], row['FIGHTER']
        
        # Negate the difference columns
        for col in diff_columns:
            new_row[col] = -row[col]
        
        # Adjust FIGHTER_WIN_PERCENT
        new_row['FIGHTER_WIN_PERCENT'] = 100 - row['FIGHTER_WIN_PERCENT']
        
        # Adjust WIN/LOSS column if it exists
        if 'WIN/LOSS' in new_row:
            new_row['WIN/LOSS'] = 1 - row['WIN/LOSS']
        
        return new_row
    
    return row

In [111]:
# Define columns to be negated if the underdog is in the 'opponent' column
diff_columns = [
    'Standup_offense_DIFF', 'Standup_defense_DIFF', 'Ground_offense_DIFF', 'Ground_defense_DIFF',
    'mean_top_2_DIFF', 'mean_top_3_DIFF', 'overall_DIFF', 'Knockdown_Resistance_Ratio_DIFF'
]

# Function to rearrange rows such that the underdog is always in the 'fighter' column
def rearrange_underdog_fighter(row):
    underdog_in_opponent = row['FIGHTER_WIN_PERCENT'] > 50
    
    if underdog_in_opponent:
        # Swap fighters
        row['FIGHTER'], row['OPPONENT'] = row['OPPONENT'], row['FIGHTER']
        
        # Negate the difference columns
        for col in diff_columns:
            row[col] = -row[col]
        
        # Adjust WIN/LOSS column
        row['WIN/LOSS'] = 1 - row['WIN/LOSS']
    
    return row

In [183]:
# Apply the rearrangement function to each row
all_final_data_og = all_final_data_og.apply(rearrange_favored_fighter, axis=1)

In [222]:
holder = all_final_data_og

In [238]:
all_final_data_og = all_final_data_og.drop(columns = ['Knockdown_Resistance_Ratio_DIFF', 'WEIGHT_CLASS_NUMERIC'])

In [240]:
all_final_data_og

Unnamed: 0,FIGHTER,OPPONENT,WIN/LOSS,Standup_offense_DIFF,Standup_defense_DIFF,Ground_offense_DIFF,Ground_defense_DIFF,mean_top_2_DIFF,mean_top_3_DIFF,overall_DIFF,FIGHTER_WIN_PERCENT
0,jack jenkins,herbert burns,1,75.761350,21.913625,-7.922705,49.060567,38.501105,40.585775,203.025099,88.888889
1,claudio ribeiro,joseph holmes,1,-2.926474,-60.484819,-14.885406,24.241599,0.850719,-13.056564,-58.299890,62.264151
2,santiago ponzinibbio,muslim salikhov,0,0.695641,-31.248158,-15.555919,-73.809613,-27.207540,-34.388142,-91.557156,61.538462
11,kevin holland,alex oliveira,1,-17.470557,-17.513189,32.381673,15.698019,-8.099657,-6.428576,-22.036685,71.428571
15,brendan allen,sean strickland,0,-1.871064,43.305816,87.398496,2.035188,15.976044,24.449738,93.427418,53.488372
...,...,...,...,...,...,...,...,...,...,...,...
2088,junyong park,joseph holmes,1,4.453095,-37.704242,35.923434,21.287613,3.063510,2.742632,7.657338,70.588235
2090,kevin holland,kyle daukaus,0,2.300574,3.595063,7.199985,-1.989118,0.802972,1.302173,11.123819,62.264151
2091,belal muhammad,dhiego lima,1,21.055840,-5.825719,25.442979,14.279319,4.226800,9.836480,61.547270,71.509972
2092,casey kenney,dominick cruz,0,10.708553,36.991489,17.450552,-20.290685,2.682230,9.136452,54.675447,52.380952


In [197]:
future_fights = pd.read_csv('future_fights.csv')

In [199]:
future_fights

Unnamed: 0,FIGHTER,OPPONENT,ODDS,WEIGHT_CLASS
0,sean omalley,merab dvalishvili,-125,Bantamweight
1,diego lopes,brian ortega,-192,Featherweight
2,daniel zellhuber,esteban ribovics,-238,Lightweight
3,ronaldo rodriguez,ode osbourne,-166,Flyweight
4,manuel torres,ignacio bahamondes,-125,Lightweight
5,joshua van,edgar chairez,-245,Flyweight
6,raul rosas jr,aoriqileng,-1000,Bantamweight
7,benoit saint denis,renato moicano,-278,Lightweight
8,nassourdine imavov,brendan allen,-245,Middleweight
9,joanderson brito,william gomis,-265,Featherweight


In [200]:
def create_final_dataset_future(fight_data, elo_data, fighters_data, reference_date):
    # Create a copy of fighters_data to avoid SettingWithCopyWarning
    fighters_data = fighters_data.copy()

    # Prepare fighters data
    fighters_data.loc[:, 'HEIGHT'] = fighters_data['HEIGHT'].apply(height_to)
    
    fighters_data.loc[:, 'REACH'] = fighters_data['REACH'].astype(str)
    fighters_data.loc[:, 'REACH'] = pd.to_numeric(fighters_data['REACH'].str.replace('"', '').replace('--', np.nan), errors='coerce')
    
    fighters_data.loc[:, 'REACH'] = fighters_data['REACH'].fillna(fighters_data['HEIGHT'])
    
    fighters_data.loc[:, 'DOB_parsed'] = fighters_data['DOB'].apply(parse_date)
    fighters_data = fighters_data.dropna(subset=['DOB_parsed'])
    
    fighters_data.loc[:, 'AGE'] = fighters_data['DOB_parsed'].apply(lambda x: calculate_age(x, reference_date))

    # Merge fighter data
    merged_df = fight_data.merge(
        elo_data,
        left_on='FIGHTER',
        right_on='FIGHTER',
        how='left'
    )
    
    # Merge opponent data
    merged_df = merged_df.merge(
        elo_data,
        left_on='OPPONENT',
        right_on='FIGHTER',
        how='left',
        suffixes=('', '_OPPONENT')
    )
    
    # Merge fighter physical data
    merged_df = merged_df.merge(
        fighters_data[['FIGHTER', 'AGE', 'HEIGHT', 'REACH']],
        left_on='FIGHTER',
        right_on='FIGHTER',
        how='left'
    )
    
    # Merge opponent physical data
    merged_df = merged_df.merge(
        fighters_data[['FIGHTER', 'AGE', 'HEIGHT', 'REACH']],
        left_on='OPPONENT',
        right_on='FIGHTER',
        how='left',
        suffixes=('', '_OPPONENT')
    )
    
    # Remove rows where either fighter or opponent has missing data
    merged_df = merged_df.dropna(subset=['AGE', 'AGE_OPPONENT', 'HEIGHT', 'HEIGHT_OPPONENT', 'REACH', 'REACH_OPPONENT'])
    
    # Calculate differences
    diff_features = [
        'Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense',
        'mean_top_2', 'overall', 'Knockdown_Resistance_Ratio', 'Activity_Score',
        'Power'
    ]
    
    for feature in diff_features:
        merged_df.loc[:, f'{feature}_DIFF'] = merged_df[feature] - merged_df[f'{feature}_OPPONENT']
    
    # Calculate physical differences
    merged_df.loc[:, 'AGE_DIFF'] = merged_df['AGE'] - merged_df['AGE_OPPONENT']
    merged_df.loc[:, 'HEIGHT_DIFF'] = merged_df['HEIGHT'] - merged_df['HEIGHT_OPPONENT']
    merged_df.loc[:, 'REACH_DIFF'] = merged_df['REACH'] - merged_df['REACH_OPPONENT']
    
    # Define the correct order of features
    feature_order = ['FIGHTER', 'OPPONENT'] + [
        'Standup_offense_DIFF', 'Standup_defense_DIFF', 'Ground_offense_DIFF',
        'Ground_defense_DIFF', 'mean_top_2_DIFF', 'overall_DIFF',
        'Knockdown_Resistance_Ratio_DIFF', 'Activity_Score_DIFF', 'Power_DIFF',
        'AGE_DIFF', 'HEIGHT_DIFF', 'REACH_DIFF'
    ]
    
    # Select only the required features in the correct order
    final_df = merged_df[feature_order]
    
    return final_df

In [201]:
import pandas as pd
import numpy as np

# Define the mapping of weight classes to numeric values
weight_class_mapping = {
    'Welterweight': 170,
    'Light Heavyweight': 205,
    'Middleweight': 185,
    'Lightweight': 155,
    'Heavyweight': 265,
    'Catch Weight': 0,  # You may need to adjust this based on how you handle catch weight fights
    'Featherweight': 145,
    'Bantamweight': 135,
    'Flyweight': 125,
    'Other': -1  # Or any other appropriate value for unspecified weight classes
}

def odds_to_probability(odds):
    if odds > 0:
        return 100 / (odds + 100)
    else:
        return -odds / (-odds + 100)

def create_final_dataset_future(fight_data, elo_data):
    # Convert ODDS to FIGHTER_WIN_PERCENT
    fight_data['FIGHTER_WIN_PERCENT'] = fight_data['ODDS'].apply(odds_to_probability)
    
    # Add WEIGHT_CLASS_NUMERIC to the fight_data using the mapping
    fight_data['WEIGHT_CLASS_NUMERIC'] = fight_data['WEIGHT_CLASS'].map(weight_class_mapping)
    
    # Merge fighter data
    merged_df = fight_data.merge(
        elo_data,
        left_on='FIGHTER',
        right_on='FIGHTER',
        how='left'
    )
    
    # Merge opponent data
    merged_df = merged_df.merge(
        elo_data,
        left_on='OPPONENT',
        right_on='FIGHTER',
        how='left',
        suffixes=('', '_OPPONENT')
    )
    
    # Calculate differences
    diff_features = [
        'Standup_offense', 'Standup_defense', 'Ground_offense', 'Ground_defense',
        'mean_top_2', 'mean_top_3', 'overall', 'Knockdown_Resistance_Ratio'
    ]
    
    for feature in diff_features:
        merged_df[f'{feature}_DIFF'] = merged_df[feature] - merged_df[f'{feature}_OPPONENT']
    
    # Define the correct order of features
    feature_order = ['FIGHTER', 'OPPONENT', 'WEIGHT_CLASS_NUMERIC',
                     'Standup_offense_DIFF', 'Standup_defense_DIFF', 
                     'Ground_offense_DIFF', 'Ground_defense_DIFF', 'mean_top_2_DIFF', 
                     'mean_top_3_DIFF', 'overall_DIFF', 'Knockdown_Resistance_Ratio_DIFF', 
                     'FIGHTER_WIN_PERCENT']
    
    # Select only the required features in the correct order
    final_df = merged_df[feature_order]
    
    return final_df


In [202]:
# Example usage:
reference_date = datetime.now()  # Or use a specific date if needed
predict = create_final_dataset_future(future_fights, all_elo)

array([1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1], dtype=int64)

In [208]:
predict = predict.dropna()

In [209]:
predict_holder = predict
predict_holder

Unnamed: 0,FIGHTER,OPPONENT,WEIGHT_CLASS_NUMERIC,Standup_offense_DIFF,Standup_defense_DIFF,Ground_offense_DIFF,Ground_defense_DIFF,mean_top_2_DIFF,mean_top_3_DIFF,overall_DIFF,Knockdown_Resistance_Ratio_DIFF,FIGHTER_WIN_PERCENT
0,sean omalley,merab dvalishvili,135,49.336682,-27.266268,-74.418432,-13.761901,2.29135,-5.616935,17.122706,0.0,0.555556
1,diego lopes,brian ortega,145,22.341474,36.447712,-27.232157,30.764079,33.605895,28.466534,86.32622,1.572688,0.657534
2,daniel zellhuber,esteban ribovics,155,-9.131872,-17.30252,8.096163,8.284546,-0.423663,-6.049949,-24.909384,2.313342,0.704142
3,ronaldo rodriguez,ode osbourne,125,5.398251,-29.053869,46.751055,14.289154,-4.17,5.735986,19.795673,-16.955982,0.62406
4,manuel torres,ignacio bahamondes,155,-9.514254,-12.12428,22.212791,-7.88766,-10.00597,-9.842065,-23.346475,-19.801438,0.555556
5,joshua van,edgar chairez,125,30.527823,-59.737055,0.143628,30.705703,21.984465,2.884947,22.898761,-8.85089,0.710145
6,raul rosas jr,aoriqileng,135,-15.857343,38.54826,37.351913,-80.750969,-17.501351,-0.576676,-27.280957,6.080466,0.909091
7,benoit saint denis,renato moicano,155,0.464519,-7.11388,-12.592487,0.031456,0.247988,-2.205968,-13.718314,5.384505,0.73545
8,nassourdine imavov,brendan allen,185,-2.136208,20.841563,-42.340017,1.868281,-0.133963,3.564619,-7.527067,0.833551,0.710145
9,joanderson brito,william gomis,145,-4.476558,3.03997,28.806993,-1.438678,0.800646,1.554957,10.363975,0.0,0.726027


In [210]:
predict = predict.drop(columns= ['FIGHTER', 'OPPONENT'])

In [211]:
predict

Unnamed: 0,WEIGHT_CLASS_NUMERIC,Standup_offense_DIFF,Standup_defense_DIFF,Ground_offense_DIFF,Ground_defense_DIFF,mean_top_2_DIFF,mean_top_3_DIFF,overall_DIFF,Knockdown_Resistance_Ratio_DIFF,FIGHTER_WIN_PERCENT
0,135,49.336682,-27.266268,-74.418432,-13.761901,2.29135,-5.616935,17.122706,0.0,0.555556
1,145,22.341474,36.447712,-27.232157,30.764079,33.605895,28.466534,86.32622,1.572688,0.657534
2,155,-9.131872,-17.30252,8.096163,8.284546,-0.423663,-6.049949,-24.909384,2.313342,0.704142
3,125,5.398251,-29.053869,46.751055,14.289154,-4.17,5.735986,19.795673,-16.955982,0.62406
4,155,-9.514254,-12.12428,22.212791,-7.88766,-10.00597,-9.842065,-23.346475,-19.801438,0.555556
5,125,30.527823,-59.737055,0.143628,30.705703,21.984465,2.884947,22.898761,-8.85089,0.710145
6,135,-15.857343,38.54826,37.351913,-80.750969,-17.501351,-0.576676,-27.280957,6.080466,0.909091
7,155,0.464519,-7.11388,-12.592487,0.031456,0.247988,-2.205968,-13.718314,5.384505,0.73545
8,185,-2.136208,20.841563,-42.340017,1.868281,-0.133963,3.564619,-7.527067,0.833551,0.710145
9,145,-4.476558,3.03997,28.806993,-1.438678,0.800646,1.554957,10.363975,0.0,0.726027


In [246]:
predict = predict.drop(columns = ['Knockdown_Resistance_Ratio_DIFF', 'WEIGHT_CLASS_NUMERIC'])

In [242]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Define columns
columns = ['WIN/LOSS', 'FIGHTER', 'OPPONENT']

# Separate features (X) and target variable (y)
X = all_final_data_og.drop(columns=columns, axis=1)  # Features
#X[['FIGHTER', 'OPPONENT']] = all_final_data_og[['FIGHTER', 'OPPONENT']]  # Include 'FIGHTER' and 'OPPONENT' in X for encoding
y = all_final_data_og['WIN/LOSS']  # Target variable

# Perform label encoding for 'FIGHTER' and 'OPPONENT' columns
label_encoder = LabelEncoder()

# Combine 'FIGHTER' and 'OPPONENT' columns for consistent encoding
#all_fighters = pd.concat([X['FIGHTER'], X['OPPONENT']], axis=0)
#label_encoder.fit(all_fighters)

# Assign encoded values back to 'FIGHTER' and 'OPPONENT' columns
#X['FIGHTER'] = label_encoder.transform(X['FIGHTER'])
#X['OPPONENT'] = label_encoder.transform(X['OPPONENT'])

# Perform train-test split with 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Check the shapes of the splits
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (568, 8)
X_test shape: (143, 8)
y_train shape: (568,)
y_test shape: (143,)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Define columns
columns = ['WIN/LOSS', 'FIGHTER', 'OPPONENT']

# Separate features (X) and target variable (y)
X = all_final_data_og.drop(columns=columns, axis=1)  # Features
X[['FIGHTER', 'OPPONENT']] = all_final_data_og[['FIGHTER', 'OPPONENT']]  # Include 'FIGHTER' and 'OPPONENT' in X for encoding
y = all_final_data_og['WIN/LOSS']  # Target variable

# Perform label encoding for 'FIGHTER' and 'OPPONENT' columns
label_encoder = LabelEncoder()

# Combine 'FIGHTER' and 'OPPONENT' columns for consistent encoding
all_fighters = pd.concat([X['FIGHTER'], X['OPPONENT']], axis=0)
label_encoder.fit(all_fighters)

# Assign encoded values back to 'FIGHTER' and 'OPPONENT' columns
X['FIGHTER'] = label_encoder.transform(X['FIGHTER'])
X['OPPONENT'] = label_encoder.transform(X['OPPONENT'])

# Perform train-test split with 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Apply SMOTE to the training data
smote = SMOTE(random_state=12)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check the shapes of the splits
print("X_train_smote shape:", X_train_smote.shape)
print("y_train_smote shape:", y_train_smote.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)


In [None]:
pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Assuming your final dataset is called 'final_balanced_data'
X = all_final_data_og.drop(['FIGHTER', 'OPPONENT', 'WIN/LOSS'], axis=1)
y = all_final_data_og['WIN/LOSS']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

# 2. Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

welterweight_final_data = welterweight_final_data.dropna()

columns= ['WIN/LOSS', 'FIGHTER', 'OPPONENT']

# Separate features (X) and target variable (y)
X = welterweight_final_data.drop(columns = columns, axis=1)  # Features
y = welterweight_final_data['WIN/LOSS']  # Target variable

# all_fighters = pd.concat([X['FIGHTER'], X['OPPONENT']], axis=0)

# Perform label encoding for 'FIGHTER' and 'OPPONENT' columns
# label_encoder = LabelEncoder()
# fighters_encoded = label_encoder.fit_transform(all_fighters)

# Assign encoded values back to 'FIGHTER' and 'OPPONENT' columns
# X['FIGHTER'] = fighters_encoded[:len(X)]  # First half is 'FIGHTER'
# X['OPPONENT'] = fighters_encoded[len(X):]  # Second half is 'OPPONENT'

# Perform train-test split with 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [None]:
X_test

# Mens

In [248]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Assuming you have your X (features) and y (target) already prepared
rf_classifier = RandomForestClassifier(random_state=23, max_depth= None, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 100)

# Train the classifier on the training data
rf_classifier.fit(X, y)

# Predict class (win/lose) on the predict set
y_pred = rf_classifier.predict(predict)

# Predict probabilities on the predict set
y_prob = rf_classifier.predict_proba(predict)[:, 1]

# Create a DataFrame with the results
predictions = pd.DataFrame({
    'FIGHTER': predict_holder['FIGHTER'],
    'OPPONENT': predict_holder['OPPONENT'],
    'RF_PREDICTION': y_pred,
    'WIN_PROBABILITY': y_prob
})

# Remove rows with missing fighter or opponent names
predictions = predictions.dropna(subset=['FIGHTER', 'OPPONENT'])

print(predictions)

               FIGHTER            OPPONENT  RF_PREDICTION  WIN_PROBABILITY
0         sean omalley   merab dvalishvili              1             0.64
1          diego lopes        brian ortega              1             0.69
2     daniel zellhuber    esteban ribovics              0             0.45
3    ronaldo rodriguez        ode osbourne              1             0.81
4        manuel torres  ignacio bahamondes              1             0.58
5           joshua van       edgar chairez              1             0.62
6        raul rosas jr          aoriqileng              1             0.63
7   benoit saint denis      renato moicano              1             0.59
8   nassourdine imavov       brendan allen              1             0.54
9     joanderson brito       william gomis              1             0.57
10        bryan battle       kevin jousset              1             0.67
11          fares ziam        matt frevola              1             0.86
12            oumar sy   

In [213]:
y_pred

array([1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int64)

In [214]:
lr_pred

NameError: name 'lr_pred' is not defined

In [None]:
y_prob

# Womens

In [None]:
rf_classifier = RandomForestClassifier(random_state=23)

# Train the classifier on the training data
rf_classifier.fit(womens_X, womens_y)

lr_model = LogisticRegression(random_state=42)
lr_model.fit(womens_X, womens_y)
womens_lr_pred = lr_model.predict(womens_predict)

# Predict on the test data
womens_y_pred = rf_classifier.predict(womens_predict)

womens_y_prob = rf_classifier.predict_proba(womens_predict)[:, 1]


In [None]:
# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict probabilities on the test data
# This returns probabilities for both classes, we want the probability of class 1
y_prob = rf_classifier.predict_proba(X_test)[:, 1]

# Create a DataFrame with actual values and predicted probabilities
results_df = pd.DataFrame({
    'Actual': y_test,
    'Probability_of_1': y_prob
})


In [None]:
results_df.to_csv('probability.csv')

In [220]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score

# Assuming X is your feature matrix and y is your target variable

# Create the random forest classifier
rf = RandomForestClassifier(random_state=42)

# Create the RFECV object
rfecv = RFECV(estimator=rf, step=1, cv=5, scoring='accuracy')

# Fit RFECV
rfecv.fit(X, y)

# Get selected features
selected_features = X.columns[rfecv.support_]

# Create a new feature matrix with only selected features
X_selected = X[selected_features]

# Train the final model using only selected features
final_rf = RandomForestClassifier(random_state=42)
final_rf.fit(X_selected, y)

# Evaluate the model
accuracy = cross_val_score(final_rf, X_selected, y, cv=5, scoring='accuracy').mean()
print(f"Accuracy with selected features: {accuracy}")

Accuracy with selected features: 0.8185659411011524


In [218]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def select_features(X, y, threshold=0):
    features = list(X.columns)
    selected_features = []
    base_score = 0
    
    for feature in features:
        current_features = selected_features + [feature]
        X_current = X[current_features]
        
        rf = RandomForestClassifier(random_state=42)
        scores = cross_val_score(rf, X_current, y, cv=5, scoring='accuracy')
        avg_score = np.mean(scores)
        
        if avg_score > base_score + threshold:
            selected_features.append(feature)
            base_score = avg_score
            print(f"Added {feature}. New score: {base_score}")
        else:
            print(f"Skipped {feature}. Score did not improve significantly.")
    
    return selected_features

# Assuming X is your feature matrix and y is your target variable

selected_features = select_features(X, y, threshold=0.001)

print("Final selected features:", selected_features)

# Train final model with selected features
X_selected = X[selected_features]
final_rf = RandomForestClassifier(random_state=42)
final_rf.fit(X_selected, y)

# Evaluate final model
final_score = cross_val_score(final_rf, X_selected, y, cv=5, scoring='accuracy').mean()
print(f"Final model accuracy: {final_score}")

Added WEIGHT_CLASS_NUMERIC. New score: 0.6947897173249284
Added Standup_offense_DIFF. New score: 0.7833645228011424
Added Standup_defense_DIFF. New score: 0.8227518959913326
Skipped Ground_offense_DIFF. Score did not improve significantly.
Skipped Ground_defense_DIFF. Score did not improve significantly.
Skipped mean_top_2_DIFF. Score did not improve significantly.
Skipped mean_top_3_DIFF. Score did not improve significantly.
Added overall_DIFF. New score: 0.8255490987885354
Skipped Knockdown_Resistance_Ratio_DIFF. Score did not improve significantly.
Skipped FIGHTER_WIN_PERCENT. Score did not improve significantly.
Final selected features: ['WEIGHT_CLASS_NUMERIC', 'Standup_offense_DIFF', 'Standup_defense_DIFF', 'overall_DIFF']
Final model accuracy: 0.8255490987885354


In [244]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, r2_score
import numpy as np
import pandas as pd

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
    class_weight='balanced'
)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_classifier.predict(X_test)

# Get probabilities for the predictions
y_pred_proba = rf_classifier.predict_proba(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2:.2f}')

# Print classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Optional: Print feature importances if needed
print('\nFeature Importances:')
for feature, importance in zip(X_train.columns, rf_classifier.feature_importances_):
    print(f'{feature}: {importance:.4f}')

# Create results DataFrame
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Probability': np.max(y_pred_proba, axis=1)  # Get the highest probability
})

# Add a column to show if the prediction was correct
results_df['Correct'] = results_df['Actual'] == results_df['Predicted']

# Filter rows with probability >= 0.6
high_prob_df = results_df[results_df['Probability'] >= 0.6]

# Calculate accuracy for high probability predictions
high_prob_accuracy = accuracy_score(high_prob_df['Actual'], high_prob_df['Predicted'])

print(f'\nAccuracy for predictions with probability >= 0.6: {high_prob_accuracy:.2f}')
print(f'Number of predictions with probability >= 0.6: {len(high_prob_df)}')
print(f'Percentage of predictions with probability >= 0.6: {len(high_prob_df) / len(results_df) * 100:.2f}%')


Accuracy: 0.90
R^2 Score: 0.49

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.74      0.81        42
           1       0.90      0.96      0.93       101

    accuracy                           0.90       143
   macro avg       0.89      0.85      0.87       143
weighted avg       0.89      0.90      0.89       143


Feature Importances:
Standup_offense_DIFF: 0.1158
Standup_defense_DIFF: 0.1194
Ground_offense_DIFF: 0.1654
Ground_defense_DIFF: 0.1082
mean_top_2_DIFF: 0.1116
mean_top_3_DIFF: 0.1425
overall_DIFF: 0.1231
FIGHTER_WIN_PERCENT: 0.1139

Accuracy for predictions with probability >= 0.6: 0.91
Number of predictions with probability >= 0.6: 124
Percentage of predictions with probability >= 0.6: 86.71%


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, r2_score

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42, class_weight='balanced')

# Train the classifier on the training data
rf_classifier.fit(womens_X_train, womens_y_train)

# Predict on the test data
womens_y_pred = rf_classifier.predict(womens_X_test)

# Evaluate the model
accuracy = accuracy_score(womens_y_test, womens_y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate R^2 score
r2 = r2_score(womens_y_test, womens_y_pred)
print(f'R^2 Score: {r2:.2f}')

# Print classification report
print('\nClassification Report:')
print(classification_report(womens_y_test, womens_y_pred))

# Optional: Print feature importances if needed
print('\nFeature Importances:')
for feature, importance in zip(X_train.columns, rf_classifier.feature_importances_):
    print(f'{feature}: {importance:.4f}')


results_df = pd.DataFrame({
    'Actual': womens_y_test,
    'Predicted': womens_y_pred
})

# Add a column to show if the prediction was correct
results_df['Correct'] = results_df['Actual'] == results_df['Predicted']


In [None]:
results_df.to_csv('results.csv')

In [None]:
# Initialize Logistic Regression model
logistic_regression = LogisticRegression(random_state=42)

# Train the model on the training data
logistic_regression.fit(X_train, y_train)

# Predict on the test data
y_pred = logistic_regression.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Optionally, print coefficients and intercept
print(f'\nCoefficients: {logistic_regression.coef_}')
print(f'Intercept: {logistic_regression.intercept_}')

In [None]:
# Define windows for rolling averages
rolling_windows = [3, 2, 1]

# Calculate rolling averages and store in a new DataFrame
rolling_avg_df = df_mens_fights_last_15_years[['FIGHTER', 'OPPONENT', 'WIN/LOSS', 'AGE']].copy()

for window in rolling_windows:
    for col in totals_counting:
        # Shift the 'per_minute' values to exclude the current fight
        shifted_col = df_mens_fights_last_15_years.groupby('FIGHTER')[f'{col}_per_minute'].shift(1)
        
        # Calculate rolling average for the current window size
        rolling_avg = shifted_col.groupby(df_mens_fights_last_15_years['FIGHTER']).rolling(window=window, min_periods=1).mean().reset_index(level=0, drop=True)
        
        # Assign the rolling average to the new column in rolling_avg_df
        rolling_avg_df[f'{col}_PER_MIN_ROLLING_{window}'] = rolling_avg

# Drop duplicate 'FIGHTER' and 'WIN/LOSS' columns if they exist
rolling_avg_df = rolling_avg_df.loc[:, ~rolling_avg_df.columns.duplicated()]

# Display the final DataFrame
rolling_avg_df

In [None]:
import numpy as np
rolling_avg_df['WIN/LOSS'] = np.where(rolling_avg_df['WIN/LOSS'] == 'W', 1,
                                              np.where(rolling_avg_df['WIN/LOSS'] == 'L', 0, 0))

In [None]:
rolling_avg_df = rolling_avg_df.dropna()

In [None]:
rolling_avg_df

In [489]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

columns= ['WIN/LOSS', 'FIGHTER', 'OPPONENT']

# Separate features (X) and target variable (y)
X = rolling_avg_df.drop(columns = columns, axis=1)  # Features
y = rolling_avg_df['WIN/LOSS']  # Target variable

# all_fighters = pd.concat([X['FIGHTER'], X['OPPONENT']], axis=0)

# Perform label encoding for 'FIGHTER' and 'OPPONENT' columns
# label_encoder = LabelEncoder()
# fighters_encoded = label_encoder.fit_transform(all_fighters)

# Assign encoded values back to 'FIGHTER' and 'OPPONENT' columns
# X['FIGHTER'] = fighters_encoded[:len(X)]  # First half is 'FIGHTER'
# X['OPPONENT'] = fighters_encoded[len(X):]  # Second half is 'OPPONENT'

# Perform train-test split with 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [491]:
y_train

13801    0
7595     1
8268     0
8679     0
5118     0
        ..
9807     1
9064     0
9373     0
3514     0
12012    1
Name: WIN/LOSS, Length: 7927, dtype: int32

In [495]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, r2_score

# Initialize Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
print(f'R^2 Score: {r2:.2f}')

# Print classification report
print('\nClassification Report:')
print(classification_report(y_test, y_pred))

# Optional: Print feature importances if needed
print('\nFeature Importances:')
for feature, importance in zip(X_train.columns, rf_classifier.feature_importances_):
    print(f'{feature}: {importance:.4f}')



Accuracy: 0.54
R^2 Score: -0.85

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.58      0.55       983
           1       0.55      0.50      0.52       999

    accuracy                           0.54      1982
   macro avg       0.54      0.54      0.54      1982
weighted avg       0.54      0.54      0.54      1982


Feature Importances:
AGE: 0.0216
Total_KD_PER_MIN_ROLLING_3: 0.0090
Total_SIG.STR._Success_PER_MIN_ROLLING_3: 0.0168
Total_TOTAL STR._Success_PER_MIN_ROLLING_3: 0.0186
Total_SIG.STR._Attempt_PER_MIN_ROLLING_3: 0.0166
Total_TOTAL STR._Attempt_PER_MIN_ROLLING_3: 0.0170
Total_TD_Success_PER_MIN_ROLLING_3: 0.0132
Total_TD_Attempt_PER_MIN_ROLLING_3: 0.0174
Total_SUB.ATT_PER_MIN_ROLLING_3: 0.0103
Total_REV._PER_MIN_ROLLING_3: 0.0058
Total_CTRL_PER_MIN_ROLLING_3: 0.0201
Total_HEAD_Success_PER_MIN_ROLLING_3: 0.0184
Total_HEAD_Attempt_PER_MIN_ROLLING_3: 0.0174
Total_BODY_Success_PER_MIN_ROLLING_3: 0.0168
Total_BODY_At

In [503]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score