# Data Mining and Machine Learning
## Project Theme: UFC Winner Prediction

1. Importing packages & Getting data sets
2. Preprocessing data sets
2.1 Get a glance with data sets
2.2 Handling N/A values
2.3 Replace values from each data set
2.3.1 fight_data
2.3.2 fighter_data
2.4 Combine two data sets for new features
3. Choosing features that we should use for prediction
4. Making model algorithms and check Accuracy score
4.1 Bagging
4.1.1 RandomForest
4.2 Boosting
4.2.1 XGBoost
4.2.2 LightGBM
5. Conclusion
5.1 High meaning features

In [None]:
##### https://www.kaggle.com/aqsaqadir22/ufc-data-analysis-training

#Column definitions:
#R_ and B_ prefix signifies red and blue corner fighter stats respectively
#_opp_ containing columns is the average of damage done by the opponent on the fighter
#KD is number of knockdowns
#SIG_STR is no. of significant strikes 'landed of attempted'
#SIG_STR_pct is significant strikes percentage
#TOTAL_STR is total strikes 'landed of attempted'
#TD is no. of takedowns
#TD_pct is takedown percentages
#SUB_ATT is no. of submission attempts
#PASS is no. times the guard was passed?
#REV is the no. of Reversals landed
#HEAD is no. of significant strikes to the head 'landed of attempted'
#BODY is no. of significant strikes to the body 'landed of attempted'
#CLINCH is no. of significant strikes in the clinch 'landed of attempted'
#GROUND is no. of significant strikes on the ground 'landed of attempted'
#win_by is method of win
#last_round is last round of the fight (ex. if it was a KO in 1st, then this will be 1)
#last_round_time is when the fight ended in the last round
#Format is the format of the fight (3 rounds, 5 rounds etc.)
#Referee is the name of the Ref
#date is the date of the fight
#location is the location in which the event took place
#Fight_type is which weight class and whether it's a title bout or not
#Winner is the winner of the fight
#Stance is the stance of the fighter (orthodox, southpaw, etc.)
#Height_cms is the height in centimeter
#Reach_cms is the reach of the fighter (arm span) in centimeter
#Weight_lbs is the weight of the fighter in pounds (lbs)
#age is the age of the fighter
#title_bout Boolean value of whether it is title fight or not
#weight_class is which weight class the fight is in (Bantamweight, heavyweight, Women's flyweight, etc.)
#no_of_rounds is the number of rounds the fight was scheduled for
#current_lose_streak is the count of current concurrent losses of the fighter
#current_win_streak is the count of current concurrent wins of the fighter
#draw is the number of draws in the fighter's ufc career
#wins is the number of wins in the fighter's ufc career
#losses is the number of losses in the fighter's ufc career
#total_rounds_fought is the average of total rounds fought by the fighter
#total_time_fought(seconds) is the count of total time spent fighting in seconds
#total_title_bouts is the total number of title bouts taken part in by the fighter
#win_by_Decision_Majority is the number of wins by majority judges decision in the fighter's ufc career
#win_by_Decision_Split is the number of wins by split judges decision in the fighter's ufc career
#win_by_Decision_Unanimous is the number of wins by unanimous judges decision in the fighter's ufc career
#win_by_KO/TKO is the number of wins by knockout in the fighter's ufc career
#win_by_Submission is the number of wins by submission in the fighter's ufc career
#win_by_TKO_Doctor_Stoppage is the number of wins by doctor stoppage in the fighter's ufc career

## 1. Importing packages & Getting data sets

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.tree import export_graphviz
import graphviz
from subprocess import call
from IPython.display import Image
import eli5
from eli5.sklearn import PermutationImportance
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

In [None]:
# Locations of the two Kaggle input files.
data_path = "../input/ufcdata/raw_total_fight_data.csv"
data2_path = "../input/ufcdata/raw_fighter_details.csv"

# The fight file is read with ';' as separator; the fighter file uses the default.
fight_data = pd.read_csv(data_path, sep=";")
fighter_data = pd.read_csv(data2_path)

## 2. Preprocessing data sets
### 2.1 Get a glance with data sets

In [None]:
# Show the first three rows of each table.
display(fight_data.head(3))
display(fighter_data.head(3))

In [None]:
# Print the column/dtype/non-null summary of each table, separated by a blank line.
fight_data.info()
print("")
fighter_data.info()

In [None]:
# Build and display a pandas_profiling report for each table.
display(pandas_profiling.ProfileReport(fight_data))
display(pandas_profiling.ProfileReport(fighter_data))

### 2.2 Handling N/A values

##### features with NA
- fight_data: Referee, Winner => such features are fixed values
- fighter_data: Height, Weight, Reach, Stance, DOB => such features are characteristics of each fighter

##### It would be better to drop rows with NA than to replace the values with the mean or mode, unless we find the correct values on the internet and fill the NAs with them.

In [None]:
# Discard every row that contains at least one missing value, in both tables.
fight_data, fighter_data = (
    frame.dropna(axis=0) for frame in (fight_data, fighter_data)
)

### 2.3 Replace values from each data set

#### 2.3.1 fight_data

In [None]:
# Add a 'WinnerColor' column: 'R' when Winner equals R_fighter, otherwise 'B'.

f = ['R_fighter', 'B_fighter', 'Winner']
display(fight_data[f].head(3))

# Vectorised comparison; every row whose Winner is not the red fighter is labelled 'B'.
red_corner_won = fight_data['Winner'] == fight_data['R_fighter']
fight_data['WinnerColor'] = np.where(red_corner_won, 'R', 'B')

f.append('WinnerColor')
display(fight_data[f].head(3))

In [None]:
# Turn the four percentage columns from text such as "50%" into fractions such as 0.5.

percent_features = ['R_SIG_STR_pct', 'B_SIG_STR_pct', 'R_TD_pct', 'B_TD_pct']
display(fight_data[percent_features].head(3))


def _percent_to_fraction(text):
    """Return the integer written before the first '%' in *text*, divided by 100."""
    leading_part = text.split("%")[0]
    return int(leading_part) / 100


for feature in percent_features:
    fight_data[feature] = fight_data[feature].map(_percent_to_fraction)

display(fight_data[percent_features].head(3))

In [None]:
# Split every "<landed> of <attempted>" text column into two integer columns,
# "<name>_land" and "<name>_attempt".

of_features = [
    "R_TD", "R_SIG_STR.", "R_TOTAL_STR.", "R_HEAD", "R_BODY", "R_CLINCH", "R_GROUND", "R_LEG",
    "B_TD", "B_SIG_STR.", "B_TOTAL_STR.", "B_HEAD", "B_BODY", "B_CLINCH", "B_GROUND", "B_LEG",
]
display(fight_data[of_features].head(3))

for feature in of_features:
    # Split each cell once, then pull both numbers out of the resulting pair.
    landed_and_attempted = [text.split(" of ") for text in fight_data[feature]]
    # NOTE: the new columns are appended in this exact order (land, then attempt);
    # later cells address them by position, so the order must not change.
    fight_data[feature+"_land"] = [int(pair[0]) for pair in landed_and_attempted]
    fight_data[feature+"_attempt"] = [int(pair[1]) for pair in landed_and_attempted]

display(fight_data.loc[:2,"R_TD_land":])

In [None]:
# Per corner, add up the "_land" and the "_attempt" columns of every entry in
# of_features, producing R_tot_land, R_tot_attempt, B_tot_land and B_tot_attempt.
# NOTE(review): the summed categories look like they overlap (e.g. TOTAL_STR. and
# SIG_STR.), so the totals are presumably an activity score, not a strike count — confirm.

for corner in ("R", "B"):
    takedown = corner + "_TD"
    # Every non-takedown feature belonging to this corner, in of_features order.
    others = [
        name for name in of_features
        if name not in ("R_TD", "B_TD") and (("R_" in name) == (corner == "R"))
    ]
    # Columns are created as land first, then attempt (positions matter downstream).
    for suffix in ("_land", "_attempt"):
        running_total = fight_data[takedown + suffix]
        for name in others:
            running_total = running_total + fight_data[name + suffix]
        fight_data[corner + "_tot" + suffix] = running_total

features = ["R_tot_land", "R_tot_attempt", "B_tot_land", "B_tot_attempt"]
display(fight_data[features].head(3))

In [None]:
# Print how often each distinct value occurs in the Format and Fight_type columns.

print(fight_data.Format.value_counts(), '\n\n', fight_data.Fight_type.value_counts())

##### It would be better to make a new feature for total fight time (min) than to use Format.
##### It would be better to simplify the values of Fight_type; they are too detailed.

In [None]:
# Derive tot_fight_time_min (total minutes fought) from Format, last_round
# and last_round_time.

display(fight_data.loc[:2, 'last_round':'Format'])

Rounds_info = list(fight_data['Format'].unique())
Rounds_info.remove("No Time Limit")

# Map each format string to its per-round lengths, taken from the text between
# the parentheses and split on '-'.
Rounds = {
    rounds: [int(t) for t in rounds.split("(")[1].split(")")[0].split("-")]
    for rounds in Rounds_info
}

display(Rounds)


def _total_minutes(row):
    """Return minutes of completed rounds plus the clock time of the last round."""
    clock = row['last_round_time'].split(":")
    minutes = float(clock[0])
    seconds = float(clock[1])
    if row['last_round'] == 1:
        # First-round finishes never look up Rounds, so "No Time Limit" is safe here.
        return minutes + (seconds / 60)
    completed = sum(Rounds[row['Format']][:row['last_round'] - 1])
    return completed + minutes + (seconds / 60)


fight_data['tot_fight_time_min'] = fight_data.apply(_total_minutes, axis=1)

feature = ['last_round', 'last_round_time', 'Format', 'tot_fight_time_min']
display(fight_data[feature].head(3))

In [None]:
# Collapse the free-text Fight_type descriptions to a short weight-class label.
#
# Each description is mapped exactly once to the FIRST label of weight_class it
# contains; descriptions containing none of the labels become NaN so they can be
# dropped afterwards. Matching once (instead of re-scanning already replaced
# values) keeps 'Light Heavy' from being rewritten to 'Light', and no length
# filter is needed, so the 11-character 'Light Heavy' label is not discarded.

# Order matters: 'Light Heavy' must be tested before 'Light' and 'Heavy'.
weight_class = ['Fly', 'Bantam', 'Feather', 'Light Heavy', 'Welter', 'Middle', 'Light', 'Heavy', 'Straw', 'Open', 'Catch']


def _simplify_fight_type(description):
    """Return the first weight_class label found in *description*, else NaN."""
    for weight in weight_class:
        if weight in description:
            return weight
    return np.nan


fight_data['Fight_type'] = fight_data['Fight_type'].apply(_simplify_fight_type)

In [None]:
# Show the label distribution and the number of missing Fight_type values,
# drop every row containing a missing value, then print the count again.
fight_type_counts = fight_data['Fight_type'].value_counts()
display(fight_type_counts)
print(fight_data['Fight_type'].isna().sum())

fight_data = fight_data.dropna()
print(fight_data['Fight_type'].isna().sum())

In [None]:
# Replace each date string by its year: the third space-separated token, as an int.

fight_data['date'] = [int(text.split(" ")[2]) for text in fight_data['date']]

#### 2.3.2 fighter_data

In [None]:
# Show the first three rows of the fighter table before converting its columns.
display(fighter_data.head(3))

In [None]:
def _height_to_cm(text):
    """Convert a feet/inches string (feet before "' ", inches before '"') to cm."""
    feet = int(text.split("' ")[0])
    inches = int(text.split(" ")[1].split('"')[0])
    return feet*30.48 + inches*2.54


def _weight_to_kg(text):
    """Convert the leading number of a pounds string to kilograms."""
    return float(text.split(" ")[0])*0.453592


def _reach_to_cm(text):
    """Convert the number before the '"' mark from inches to centimetres."""
    return float(text.split('"')[0])*2.54


def _birth_year(text):
    """Return the third space-separated token of a date string as an int."""
    return int(text.split(" ")[2])


# Numeric codes for the stance names; anything else is coded 4.
_STANCE_CODES = {"Orthodox": 1, "Southpaw": 2, "Switch": 3}

# Height (feet + inches) -> cm
fighter_data['Height'] = fighter_data['Height'].apply(_height_to_cm)

# Weight (pounds) -> kg
fighter_data['Weight'] = fighter_data['Weight'].apply(_weight_to_kg)

# Reach (inches) -> cm
fighter_data['Reach'] = fighter_data['Reach'].apply(_reach_to_cm)

# New feature Born_year taken from DOB
fighter_data['Born_year'] = fighter_data['DOB'].apply(_birth_year)

# Stance names -> numbers
fighter_data['Stance'] = fighter_data['Stance'].apply(
    lambda x: _STANCE_CODES.get(x, 4))

### 2.4 Combine two data sets for new features

In [None]:
# Per-fighter averages of total attempts / total landings over all of the
# fighter's fights in fight_data, counting appearances in BOTH corners.
# Fighters with no recorded fight get 0.
#
# Columns are addressed by name and joined on fighter_name, so the result does
# not depend on column positions or on fighter names being unique row-by-row.
# NOTE(review): the averages include every fight of a fighter, presumably also
# the fight whose winner is later predicted — check for target leakage.

red_history = fight_data[['R_fighter', 'R_tot_attempt', 'R_tot_land']].rename(
    columns={'R_fighter': 'fighter_name', 'R_tot_attempt': 'attempt', 'R_tot_land': 'land'})
blue_history = fight_data[['B_fighter', 'B_tot_attempt', 'B_tot_land']].rename(
    columns={'B_fighter': 'fighter_name', 'B_tot_attempt': 'attempt', 'B_tot_land': 'land'})

# One row per (fighter, fight) appearance, whichever corner the fighter was in.
fight_history = pd.concat([red_history, blue_history], ignore_index=True)
mean_history = fight_history.groupby('fighter_name')[['attempt', 'land']].mean()

# Created in this order (attempt, then land) so the column positions stay stable.
fighter_data['mean_attempt_history'] = (
    fighter_data['fighter_name'].map(mean_history['attempt']).fillna(0))
fighter_data['mean_land_history'] = (
    fighter_data['fighter_name'].map(mean_history['land']).fillna(0))

In [None]:
# Inspect the first ten fighters, including the two new history columns.
fighter_data.head(10)

## 3. Choosing features that we should use for prediction

- fight_data: R_fighter, B_fighter, date, WinnerColor(Target)
- fighter_data: Born_year, mean_attempt_history, mean_land_history, Height, Weight, Reach, Stance
- dataset columns: R_fighter, B_fighter, R_age(date-Born_year), B_age(date-Born_year), R_mean_attempt_history, R_mean_land_history, B_mean_attempt_history, B_mean_land_history, R_Height, R_Weight, R_Reach, R_Stance, B_Height, B_Weight, B_Reach, B_Stance, WinnerColor(Target)

In [None]:
# Assemble the modelling table: one row per fight with both fighters' names,
# the target (WinnerColor) and each corner's age and fighter-level statistics.
# A fighter that is absent from fighter_data leaves that corner's cells as NaN.

fight_features = ['R_fighter', 'B_fighter', 'WinnerColor']
fighter_features = ['R_age', 'B_age', 'R_mean_attempt_history', 'R_mean_land_history',
            'B_mean_attempt_history', 'B_mean_land_history',
            'R_Height', 'R_Weight', 'R_Reach', 'R_Stance', 'B_Height', 'B_Weight', 'B_Reach', 'B_Stance']
features = fight_features + fighter_features

# Pass the flat list (not a list wrapped in a list) so the columns form a
# plain Index rather than a one-level MultiIndex.
final_data = pd.DataFrame(index=range(0, fight_data.shape[0]), columns=features)

# Lookup table keyed by fighter name; if a name occurs more than once the last
# occurrence is used.
fighter_stats = (fighter_data
                 .drop_duplicates(subset='fighter_name', keep='last')
                 .set_index('fighter_name'))
stat_columns = ['mean_attempt_history', 'mean_land_history',
                'Height', 'Weight', 'Reach', 'Stance']

# The target is copied for every fight, whether or not the fighters are known.
final_data['WinnerColor'] = fight_data['WinnerColor'].to_numpy()

for corner in ('R', 'B'):
    names = fight_data[corner + '_fighter']
    # Rows of fighter_stats lined up with the fights; all-NaN for unknown fighters.
    corner_stats = fighter_stats.reindex(names)
    # Record the name only when the fighter exists in fighter_data.
    final_data[corner + '_fighter'] = names.where(names.isin(fighter_stats.index)).to_numpy()
    # Age = value of the fight's 'date' column minus the fighter's Born_year.
    final_data[corner + '_age'] = (fight_data['date'].to_numpy()
                                   - corner_stats['Born_year'].to_numpy())
    for column in stat_columns:
        final_data[corner + '_' + column] = corner_stats[column].to_numpy()

In [None]:
# Drop every fight row that still has at least one missing cell.
final_data = final_data.dropna()

In [None]:
# Inspect the first ten rows of the assembled table.
final_data.head(10)

In [None]:
# Target: the third column of final_data. Features: everything except the
# target column and the two fighter-name columns.
y = final_data.iloc[:, 2]

X = final_data
for label in ('WinnerColor', 'R_fighter', 'B_fighter'):
    X = X.drop(label, axis='columns')

In [None]:
# Convert each feature column with pd.to_numeric.
X = X.apply(pd.to_numeric)

In [None]:
# Encode the target: "R" -> 0, anything else -> 1.
y = (y != "R").astype(int)

In [None]:
# Show the first five target values and feature rows.
display(y.head(5))
display(X.head(5))

## 4. Making model algorithms and check Accuracy score

### 4.1 Bagging

#### 4.1.1 RandomForest

In [None]:
# Hold out 10% of the fights, grid-search criterion and max_depth with 3-fold
# cross-validation on the rest, then fit a 100-tree forest with the best
# setting and score it on the held-out part.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

parameters = [{'criterion': ['gini', 'entropy'], 'max_depth': [3, 4, 5]}]
kf = KFold(n_splits=3)

grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    cv=kf,
    n_jobs=-1,
)
grid.fit(x_train, y_train)

model = RandomForestClassifier()

# best_params_ holds exactly the two searched keys: criterion and max_depth.
randomforest_model = RandomForestClassifier(n_estimators=100, **grid.best_params_)
randomforest_model.fit(x_train, y_train)
randomforest_score_kfold = randomforest_model.score(x_test, y_test)

print("Best value of each hyperparameters: {}".format(grid.best_params_))
print("Ensemble(random forest) accuracy score: ", round(randomforest_score_kfold*100,2), "%")

# Draw one individual tree of the forest (the one at index 1).
plt.figure(figsize=(15, 10))
tree.plot_tree(randomforest_model.estimators_[1], filled=True)

## 5. Conclusion

### 5.1 High meaning features

In [None]:
# Evaluate the random forest on the held-out split: confusion matrix,
# sensitivity, specificity and ROC curve. The positive class is label 1,
# the same class whose probability is fed to roc_curve.
y_pred_quant = randomforest_model.predict_proba(x_test)[:, 1]
y_pred_bin = randomforest_model.predict(x_test)

# Store the result under its own name: rebinding `confusion_matrix` would
# shadow the imported function and break the cell when it is run again.
# labels=[0, 1] guarantees a 2x2 matrix even if one class is missing from y_test.
rf_confusion_matrix = confusion_matrix(y_test, y_pred_bin, labels=[0, 1])
print('confusion_matrix\n', rf_confusion_matrix)

# scikit-learn layout: rows are true labels, columns are predicted labels,
# so the flattened 2x2 matrix reads TN, FP, FN, TP.
tn, fp, fn, tp = rf_confusion_matrix.ravel()

# Sensitivity (true positive rate) = TP / (TP + FN)
sensitivity = tp/(tp+fn)
print('Sensitivity : ', sensitivity )

# Specificity (true negative rate) = TN / (TN + FP)
specificity = tn/(tn+fp)
print('Specificity : ', specificity)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# Dashed diagonal for reference.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)

In [None]:
# Permutation importance of the fitted random forest on the held-out split,
# scored with F1; show the 14 highest-weighted features.
perm = PermutationImportance(randomforest_model, scoring="f1", random_state=1).fit(x_test, y_test)
eli5.show_weights(perm, top = 14, feature_names = x_test.columns.tolist())

In [None]:

# Same permutation-importance run without an explicit scoring argument;
# show up to 50 features.
perm = PermutationImportance(randomforest_model, random_state=1).fit(x_test, y_test)
eli5.show_weights(perm, top = 50, feature_names = x_test.columns.tolist())

### 4.2 Boosting

#### 4.2.1 XGBoost

In [None]:
# Hold out 10% of the fights, grid-search max_depth and subsample with 3-fold
# cross-validation on the rest, then fit the final booster and score it.
#
# The target is a binary label (0/1), so a classifier is used: with
# XGBClassifier, .score() returns accuracy, which is what the printout reports.
# A regressor's .score() would return R^2 instead.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

parameters = [{'max_depth': [3,4,5,6,7],
               'subsample': [0.4, 0.6, 0.8, 1.0]}]

kf = KFold(n_splits=3)

grid = GridSearchCV(estimator = xgboost.XGBClassifier(), param_grid = parameters, cv = kf, n_jobs=-1)
grid.fit(x_train, y_train)

xgboost_model = xgboost.XGBClassifier(n_estimators=100, learning_rate=0.08, gamma=0, subsample = grid.best_params_['subsample'],
                           colsample_bytree=1, max_depth = grid.best_params_['max_depth'])
xgboost_model.fit(x_train, y_train)
xgboost_score_kfold = xgboost_model.score(x_test, y_test)

print("Best value of each hyperparameters: {}".format(grid.best_params_))
print("Ensemble(xgb) accuracy score: ", round(xgboost_score_kfold*100,2), "%")

# Render the boosted tree at index 3.
xgboost.to_graphviz(xgboost_model, num_trees=3)