# Blue vs Red prediction across time V1
 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Configure pandas so printed frames are never summarised with "..." in the
# middle: show every row and every column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Import the dataset and preview the first rows.
df = pd.read_csv('../input/ufcdata/data.csv')
df.head()

In [None]:
# Print dtype and non-null count for every column. `show_counts` replaces the
# `null_counts` keyword, which was deprecated and later removed from pandas.
df.info(verbose=True, show_counts=True)

# General Preprocessing and Cleaning
1. We will get rid of features that are not useful for the model, such as the fighter names (the corners are always 'Red' and 'Blue'), the location of the fight and the referee.
2. Change the date to a datetime object for chronological sorting.

In [None]:
# Remove identifier columns that are not useful to the model. DataFrame.drop
# returns a new frame, so the result must be assigned back to take effect.
# NOTE(review): the notes above also mention dropping the fight location, but
# it is not part of this list - confirm the column name and add it if intended.
df = df.drop(columns=['Referee', 'R_fighter', 'B_fighter'])

# Parse the date column so fights can be ordered chronologically later on.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# Drop fights where the result is "Draw" instead of 'Red' or 'Blue'.
# Copy the filtered rows so the column assignments below write to an
# independent frame rather than to a slice of the original.
df = df.loc[df['Winner'].isin(['Blue', 'Red'])].copy()

# Encode the target: Red=1, Blue=0.
df['Winner'] = (df['Winner'] == 'Red').astype(int)

# Encode the title-bout flag numerically (as float).
df['title_bout'] = df['title_bout'].astype(float)


# Data split
Split before preprocessing to avoid data leakage. The split is done in chronological order.

In [None]:
def split_chrono(X,frac=0.15,test=True,delta=None):
    '''Split ``X`` chronologically into training, validation and (optionally) test sets.

    The rows are ordered by the ``date`` column (oldest first), the ``date``
    column is dropped, and the most recent rows are carved off the end:
    the last ``frac`` of the rows form the test set (when ``test`` is true)
    and the ``frac`` before them form the validation set. ``X`` itself is
    not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to split. Must contain a datetime column named ``date``.
    frac : float
        Fraction of the rows of ``X`` used for the validation set, and for
        the test set as well when ``test`` is true.
    test : bool
        When true, return a separate test set; otherwise only training and
        validation sets are produced.
    delta : object
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    tuple
        ``(train, valid, test)``. The third element is ``None`` when
        ``test`` is false.

    Raises
    ------
    KeyError
        If ``X`` has no ``date`` column.
    '''
    # sort_values returns a new frame, so keep the result. A stable sort keeps
    # the input order of fights that share the same date.
    ordered = X.sort_values(by=['date'], axis=0, kind='mergesort')
    ordered = ordered.drop(columns=['date'])

    # Size the held-out sets from the frame being split, not from a global.
    n_rows = len(ordered)
    valid_size = int(n_rows * frac)

    # Explicit positional boundaries: negative-offset slicing would turn
    # ``[:-0]`` into an empty training set when valid_size is 0.
    if test:
        valid_start = max(n_rows - 2 * valid_size, 0)
        test_start = max(n_rows - valid_size, 0)
        return (ordered.iloc[:valid_start],
                ordered.iloc[valid_start:test_start],
                ordered.iloc[test_start:])

    valid_start = max(n_rows - valid_size, 0)
    return ordered.iloc[:valid_start], ordered.iloc[valid_start:], None
  
        
    

In [None]:
# Fraction of the data held out for validation and, again, for testing.
frac = 0.2
valid_size = int(len(df) * frac)

X_train, X_valid, X_test = split_chrono(df, frac=frac, test=True)

# Separate targets from variables
y_train = X_train['Winner']
X_train = X_train.drop(columns=['Winner'])

y_valid = X_valid['Winner']
X_valid = X_valid.drop(columns=['Winner'])

y_test = X_test['Winner']
X_test = X_test.drop(columns=['Winner'])

# For cross-validation fold metrics: everything except the held-out test set.
# Built from the training and validation parts (instead of re-slicing `df`)
# so it always uses the same rows and order as the split above and can never
# include test rows.
X = pd.concat([X_train, X_valid])
Y = pd.concat([y_train, y_valid])

print('training:\n', y_train.value_counts())
print('validation:\n', y_valid.value_counts())

# Pipeline
1. Impute numerical values with the mean.
2. Impute B_Stance and R_Stance with the mode.
3. Scale all numerical values with StandardScaler.
4. Apply a label encoder to the categorical columns (currently disabled: the categorical columns are dropped instead).

In [None]:
#LABEL ENCODER PROBLEM
class LabelEncoderExt(object):
    """Label encoder that tolerates values not seen during ``fit``.

    It wraps ``LabelEncoder`` and differs from it by reserving an extra
    ``'Unknown'`` class: ``fit`` always adds it, and ``transform`` maps every
    value that was not seen during ``fit`` to that class instead of raising.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, data_list, y=None):
        """
        Fit the encoder on all unique values plus the extra 'Unknown' class.

        :param data_list: iterable of labels (strings)
        :param y: ignored; accepted so the method can be called with the
            ``fit(X, y)`` convention
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode ``data_list``, assigning unseen values to the 'Unknown' class.

        :param data_list: iterable of labels (strings)
        :return: array of integer class ids, one per input label
        """
        # Build the lookup set once so unseen values are replaced in a single
        # pass, instead of rebuilding the whole list for each unseen value.
        known = set(self.label_encoder.classes_.tolist())
        mapped = [item if item in known else 'Unknown' for item in data_list]

        return self.label_encoder.transform(mapped)

    def fit_transform(self, data_list, y=None):
        """
        Fit on ``data_list`` and return its encoding.

        :param data_list: iterable of labels (strings)
        :param y: ignored
        :return: array of integer class ids, one per input label
        """
        return self.fit(data_list, y).transform(data_list)

In [None]:
#determine numerical and categorical columns
cat_cols=[col for col in X_train.columns if X_train[col].dtype=='object']
shared_cat_cols=[col for col in cat_cols if ((set(X_train[col]))==set(X_valid[col])
                                          and (set(X_train)==set(X_test)))] #shared amongst training,validation and test set


num_cols=[col for col in X_train.columns if X_train[col].dtype in ['int64','float64']]
print ('categorical columns:', cat_cols,'\n\n')
print ('shared categorical columns:', shared_cat_cols,'\n\n')

print('numerical columns:', num_cols)


In [None]:
# Display the column labels of the training features.
X_train.columns

In [None]:
# The categorical columns are not encoded by the preprocessing pipeline yet,
# so drop them from every split (validation included, to keep the splits
# consistent) to avoid problems with the encoder.
# TODO: encode them instead of dropping them.
X_train = X_train.drop(columns=cat_cols)
X_valid = X_valid.drop(columns=cat_cols)
X_test = X_test.drop(columns=cat_cols)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Numerical features: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('mean_imputer', SimpleImputer(strategy='mean')),
    ('standard_scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value, then
# label-encode. Defined for later use; not wired into the preprocessor yet.
categorical_transformer = Pipeline(steps=[
    ('mode_imputer', SimpleImputer(strategy='most_frequent')),
    ('label_encoder', LabelEncoderExt()),
])


def _make_preprocessor():
    """Return a new, unfitted ColumnTransformer for the numerical columns.

    Every model pipeline gets its own instance: a Pipeline fits the step
    objects it is given, so sharing one preprocessor between pipelines would
    make fitting one pipeline refit the preprocessor inside the others.
    The categorical branch ('cat_preprocessing', categorical_transformer,
    cat_cols) is left out for now.
    """
    return ColumnTransformer([
        ('num_preprocessing', numerical_transformer, num_cols),
    ])


# Kept as a standalone name for code that refers to `preprocessor` directly.
preprocessor = _make_preprocessor()

XGB_pipe = Pipeline(steps=[
    ('preprocessing', _make_preprocessor()),
    ('model', XGBClassifier()),
])

RF_pipe = Pipeline(steps=[
    ('preprocessing', _make_preprocessor()),
    ('model', RandomForestClassifier(max_depth=15)),
])

LR_pipe = Pipeline(steps=[
    ('preprocessing', _make_preprocessor()),
    ('model', LogisticRegression()),
])





# Training and evaluating models


## 1. XGB

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.model_selection import cross_val_score

# Fit on the training split and report metrics on the validation split.
# The prediction must come from the pipeline fitted here (XGB_pipe).
XGB_pipe.fit(X_train, y_train)
pred = XGB_pipe.predict(X_valid)
print(classification_report(y_valid, pred))

# 5-fold cross-validated F1 score on the non-test data.
# NOTE(review): cv=5 does not respect chronological order - consider a
# time-ordered splitter if the folds should never train on later fights.
scores = cross_val_score(XGB_pipe, X, Y,
                         cv=5,
                         scoring='f1')

# The scores are F1 values (scoring='f1'), so label them as such.
print("F1 scores:\n", scores)

# ROC curve and confusion matrix on the validation split. The Display
# classes replace plot_roc_curve / plot_confusion_matrix, which were removed
# from scikit-learn.
RocCurveDisplay.from_estimator(XGB_pipe, X_valid, y_valid);
ConfusionMatrixDisplay.from_estimator(XGB_pipe, X_valid, y_valid);

## 2. Random Forest

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.model_selection import cross_val_score

# Fit on the training split and report metrics on the validation split.
RF_pipe.fit(X_train, y_train)
pred = RF_pipe.predict(X_valid)
print(classification_report(y_valid, pred))

# 5-fold cross-validated F1 score on the non-test data.
# NOTE(review): cv=5 does not respect chronological order - consider a
# time-ordered splitter if the folds should never train on later fights.
scores = cross_val_score(RF_pipe, X, Y,
                         cv=5,
                         scoring='f1')

# The scores are F1 values (scoring='f1'), so label them as such.
print("F1 scores:\n", scores)

# ROC curve and confusion matrix on the validation split. The Display
# classes replace plot_roc_curve / plot_confusion_matrix, which were removed
# from scikit-learn.
RocCurveDisplay.from_estimator(RF_pipe, X_valid, y_valid);
ConfusionMatrixDisplay.from_estimator(RF_pipe, X_valid, y_valid);

## 3. Logistic Regression

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report
from sklearn.model_selection import cross_val_score

# Fit on the training split and report metrics on the validation split.
LR_pipe.fit(X_train, y_train)
pred = LR_pipe.predict(X_valid)
print(classification_report(y_valid, pred))

# 5-fold cross-validated F1 score on the non-test data.
# NOTE(review): cv=5 does not respect chronological order - consider a
# time-ordered splitter if the folds should never train on later fights.
scores = cross_val_score(LR_pipe, X, Y,
                         cv=5,
                         scoring='f1')

# The scores are F1 values (scoring='f1'), so label them as such.
print("F1 scores:\n", scores)

# ROC curve and confusion matrix on the validation split. The Display
# classes replace plot_roc_curve / plot_confusion_matrix, which were removed
# from scikit-learn.
RocCurveDisplay.from_estimator(LR_pipe, X_valid, y_valid);
ConfusionMatrixDisplay.from_estimator(LR_pipe, X_valid, y_valid);