### Import all the libraries and Estimators 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from xgboost import XGBClassifier

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

        
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last
# one; some cells below call .head() before their final statement.
InteractiveShell.ast_node_interactivity = "all"

import warnings
# Suppress all warnings for the rest of the session to keep the output readable.
warnings.filterwarnings("ignore")


from sklearn.metrics import accuracy_score


### Objective: Predict whether the Red-corner fighter wins the bout.

### 1. Read the input files

In [None]:
# Training data.
df = pd.read_csv("/kaggle/input/ultimate-ufc-dataset/ufc-master.csv")
# Data to predict on (the upcoming event).
df_pred = pd.read_csv("/kaggle/input/ultimate-ufc-dataset/upcoming-event.csv")

#### Assign a column to show which records belong to the pred dataset / train dataset

In [None]:
# Tag every row with its origin so the combined frame can be split back into
# the train / prediction sets once the shared preprocessing is done.
df['train_data'] = 1
df_pred['train_data'] = 0
# Both frames come from read_csv with a default 0..n-1 index, so stacking them
# as-is would leave duplicate row labels. ignore_index=True gives the combined
# frame a fresh unique index, keeping later label-based selection and alignment
# unambiguous.
df = pd.concat([df, df_pred], ignore_index=True)

In [None]:
# Show five random rows of the combined (train + prediction) frame.
df.sample(5)

### Function that encodes the fight outcome as 1 (Red won) or 0 (otherwise)


In [None]:
def get_winner(color):
    """Encode a fight result as a binary target.

    Returns 1 when ``color`` equals the string ``'Red'`` and 0 for any other
    value (including a different capitalisation or a missing value).
    """
    if color == 'Red':
        return 1
    return 0

> Apply the function to the `Winner` column to create a new `won` column containing 1 or 0

In [None]:
# Binary target: 1 where Winner is 'Red', 0 for every other value.
df['won'] = df['Winner'].apply(get_winner)

### 2. Select the features to be used for the model
>  You can use your own features or select the ones from the original dataset

In [None]:
# Feature selection.
# The order of this list determines the column order of the model matrix, so
# keep it stable when editing.
features = [
    # Bout-level information
    'R_odds', 'B_odds', 'R_ev', 'B_ev',
    'title_bout', 'weight_class', 'no_of_rounds',
    # Blue corner
    'B_current_lose_streak', 'B_current_win_streak',
    'B_wins', 'B_losses',
    'B_age', 'B_Stance', 'B_Height_cms', 'B_Reach_cms', 'B_Weight_lbs',
    # Red corner
    'R_wins', 'R_losses',
    'R_current_lose_streak', 'R_current_win_streak',
    'R_age', 'R_Stance', 'R_Height_cms', 'R_Reach_cms', 'R_Weight_lbs',
]

# Model inputs plus the two bookkeeping columns: the train/prediction flag and
# the target.
selected_columns = [*features, "train_data", "won"]

### 3. Dummify the categorical columns 

In [None]:
# One-hot encode on the combined frame so that the train and prediction rows
# end up with exactly the same dummy columns.
df = pd.get_dummies(df.loc[:, selected_columns])

### 4.Separate into X ( independent var - dataset )  and y ( dependent var ) datasets
> Split the dataset back to train and pred datasets.

In [None]:
# Rows flagged 1 came from the training file: features and target.
X = df.loc[df['train_data'].eq(1)]
y = df.loc[df['train_data'].eq(1), 'won']

# Rows flagged 0 came from the prediction file.
X_pred = df.loc[df['train_data'].eq(0)]

> Drop the helper flag `train_data` and the target `won` from both the training features and the prediction set

In [None]:
# Neither the origin flag nor the target may be fed to the model as a feature.
X = X.drop(columns=['train_data', 'won'])
X_pred = X_pred.drop(columns=['train_data', 'won'])

### 5.Do a Grid search with Cross-validation (5 fold)
> This might take a long time, depending on the number of folds, the size of the parameter grid, and the CPU you have

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid: depths 1-5, a single min_child_weight (1) and 10 / 50 / 90 trees.
param_list = {
    'max_depth': range(1, 6),
    'min_child_weight': range(1, 2),
    'n_estimators': range(10, 100, 40),
}

# 5-fold cross-validated grid search scored by ROC AUC.
# The former `iid=False` argument is intentionally not passed: it was deprecated
# in scikit-learn 0.22 and removed in 0.24, where it raises a TypeError. The
# default behaviour since 0.22 is the same as iid=False.
# NOTE(review): the following cell reassigns `gsearch1` with an accuracy-scored
# search, so this result is overwritten when both cells are run.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.001, max_depth=5, min_child_weight=1, gamma=0,
        subsample=0.8, colsample_bytree=0.8, objective='binary:logistic',
        nthread=4, scale_pos_weight=1, seed=27,
    ),
    param_grid=param_list,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch1 = gsearch1.fit(X, y)


In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid: depths 1-5, a single min_child_weight (1) and 10 / 50 / 90 trees.
param_test1 = {
    'max_depth': range(1, 6),
    'min_child_weight': range(1, 2),
    'n_estimators': range(10, 100, 40),
}

# 5-fold cross-validated grid search scored by accuracy.
# The former `iid=False` argument is intentionally not passed: it was deprecated
# in scikit-learn 0.22 and removed in 0.24, where it raises a TypeError. The
# default behaviour since 0.22 is the same as iid=False.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.001, max_depth=5, min_child_weight=1, gamma=0,
        subsample=0.8, colsample_bytree=0.8, objective='binary:logistic',
        nthread=4, scale_pos_weight=1, seed=27,
    ),
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
)
gsearch1 = gsearch1.fit(X, y)

Display the best estimator hyper-parameters & the best score ( from the result of various parameters in Grid Search ) 

In [None]:
# Report the best mean cross-validated score and the parameter combination
# that achieved it.
print(
    f"Best score = {gsearch1.best_score_}",
    f"Best parameters = {gsearch1.best_params_ }",
    sep="\n",
)

### Final model

In [None]:
# Start from the same base configuration that was used during the grid search,
# then overwrite the searched hyper-parameters with the best combination found.
final_xgb_model = XGBClassifier(
    learning_rate=0.001,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
).set_params(**gsearch1.best_params_)

# Fit on the complete training set.
final_xgb_model.fit(X, y)

### Create a feature importance graph to find which are the most important features 

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

# Plot the importance score of each feature of the fitted model.
plot_importance(final_xgb_model)
plt.show()

### Create the prediction files - Probability predictions
> Use this if you want the predicted win probability for each fighter

In [None]:
# predict_proba yields one column per class; label them pred_0 (won == 0) and
# pred_1 (won == 1).
final_prediction_proba = pd.DataFrame(
    final_xgb_model.predict_proba(X_pred),
    columns=['pred_0', 'pred_1'],
)

# Pair each upcoming bout with the win probability of both corners:
# Red winning is class 1, Blue winning is class 0.
predict_proba = pd.DataFrame({
    "R_fighter": df_pred['R_fighter'].values,
    "B_fighter": df_pred['B_fighter'].values,
    "R_prob": final_prediction_proba['pred_1'].values,
    "B_prob": final_prediction_proba['pred_0'].values,
})
predict_proba.head(10)
#predict_proba.to_csv('predict_probab.csv',index=False)

### Create the prediction files  - Absolute predictions 

In [None]:
# Hard class predictions: 1 means Red is predicted to win, 0 means Blue.
final_prediction_abs = pd.DataFrame(
    {"pred": final_xgb_model.predict(X_pred)}
)

# NOTE: despite their names, R_prob / B_prob hold 0/1 labels here, not
# probabilities; B_prob is simply the complement of R_prob.
predict_absolute = pd.DataFrame({
    "R_fighter": df_pred['R_fighter'].values,
    "B_fighter": df_pred['B_fighter'].values,
    "R_prob": final_prediction_abs['pred'].values,
    "B_prob": 1 - final_prediction_abs['pred'].values,
})
predict_absolute.head(5)
predict_absolute.to_csv('predict_absolute.csv', index=False)