# Machine Learning Models and Ensemble Method
---


In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # disable GPU
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping

SEED = 111 # constant seed for reproducibility

# Seed every source of randomness the notebook touches, so that
# Restart Kernel -> Run All reproduces the same numbers.
# NOTE: PYTHONHASHSEED is read at interpreter start-up; setting it here does
# not change hashing in the running kernel, only in processes spawned from it.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)       # NumPy global RNG
tf.random.set_seed(SEED)   # TensorFlow global seed (weight init, dropout)


## Train/Test Split:

In [2]:
fights_raw = pd.read_csv("UFC_TRAIN.csv")

# Tackle class imbalance by undersampling: keep the same number of rows for
# each outcome. Rows are drawn at random (seeded) rather than taken from the
# top of the file, so the balanced set does not depend on the CSV row order.
n_per_class = fights_raw["Winner"].value_counts().min()
winner_1 = fights_raw[fights_raw["Winner"] == 1].sample(n=n_per_class, random_state=SEED)
winner_0 = fights_raw[fights_raw["Winner"] == 0].sample(n=n_per_class, random_state=SEED)
df = pd.concat([winner_1, winner_0], axis=0)

# Sanity check: both classes must now have the same number of rows.
assert df["Winner"].value_counts().nunique() == 1, "classes are not balanced"

# Train/test split. The fighter-name columns are dropped together with the
# label; stratifying on y keeps the 50/50 class balance in both splits.
X = df.drop(["Winner","B_fighter","R_fighter"], axis=1).values
y = df["Winner"].values
X_TRAIN, X_TEST, y_TRAIN, y_TEST = train_test_split(
    X, y, test_size=0.1, random_state=SEED, stratify=y
)



#### Baseline: Always predict red (i.e., 0)

In [3]:
# Baseline: predict label 0 (red) for every row of the balanced frame and
# score that constant guess against the true winners.
baseline_preds = np.zeros(len(df.index))
metrics.accuracy_score(df["Winner"], baseline_preds)

0.5

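Baseline accuracy: 67.96 % (presumably measured on the original, imbalanced data — TODO confirm; on the balanced, undersampled set above the same baseline scores 50 %).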

## ML Models
### 1- DNN:

In [None]:
# Scaling: the MinMaxScaler is fit on the training split only and then applied
# to both splits, so no statistics from the test split reach the model.
scaler = MinMaxScaler()
scaler.fit(X_TRAIN)
X_train = scaler.transform(X_TRAIN)
X_test = scaler.transform(X_TEST)
y_train = y_TRAIN
y_test = y_TEST
print(f"X_train shape: {X_train.shape} | X_test shape: {X_test.shape} | y_train shape: {y_train.shape} | y_test shape: {y_test.shape}")

# Model: five hidden Dense(42, relu) layers; the first three are each followed
# by dropout. No explicit input layer is declared - the input width is
# inferred the first time the model sees data.
dnnClf = Sequential()

# Hidden layers 1-3: Dense + Dropout.
for _ in range(3):
    dnnClf.add(Dense(units=42, activation='relu'))
    dnnClf.add(Dropout(0.25)) # randomly drops 25% of the layer's outputs during training

# Hidden layers 4-5: Dense only.
for _ in range(2):
    dnnClf.add(Dense(units=42, activation='relu'))

# Output layer: a single sigmoid unit giving a score in [0, 1] for label 1.
dnnClf.add(Dense(units=1, activation='sigmoid'))

dnnClf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once val_loss has not improved for 16 consecutive epochs,
# to prevent loss and val_loss from diverging.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=16)

In [None]:
# Train the DNN. Early stopping is driven by a validation slice carved out of
# the TRAINING data (validation_split), not by the held-out test set: using
# X_test / y_test here would let the test set pick the stopping epoch and make
# the test accuracy reported afterwards optimistically biased.
# Keras takes the validation slice from the end of the arrays it is given;
# X_train is already shuffled by train_test_split.
history = dnnClf.fit(x=X_train,
                     y=y_train,
                     epochs=400,
                     validation_split=0.1,
                     verbose=1,
                     callbacks=[early_stop]
                     )

# Per-epoch loss/accuracy curves for the training and validation slices.
model_loss = pd.DataFrame(history.history)
model_loss.plot();

In [None]:
# Score the DNN on the held-out split: scale the raw test features, take the
# single sigmoid output per row and round it to a hard 0/1 label
# (np.rint rounds half to even, like the built-in round()).
dnnScores = dnnClf.predict(scaler.transform(X_TEST))
dnnPreds = np.rint(dnnScores[:, 0]).astype(int).tolist()
dnnAcc = metrics.accuracy_score(dnnPreds, y_TEST)
print(dnnAcc)

### 2- SVM:

In [None]:
# Hyper-parameter grid searched exhaustively with 2-fold cross-validation.
svm_param = {"kernel":("linear","poly","rbf", "sigmoid"),
            "C":[1,52,10],
            "degree":[3,8],
            "gamma":("auto","scale"),
            "coef0":[0.001,10,0.5]}
svmGrid = GridSearchCV(SVC(), svm_param,cv=2)
svmGrid.fit(X_TRAIN, y_TRAIN)

# GridSearchCV fits clones of the estimator it is given, so that estimator
# itself is never fitted. Bind svmClf to the best model, which the search has
# refit on the full training split, so later cells that call svmClf.predict
# get a fitted classifier.
svmClf = svmGrid.best_estimator_
print(svmGrid.best_params_)

svmPreds = svmClf.predict(X_TEST)
svmAcc = metrics.accuracy_score(y_TEST, svmPreds)
print(svmAcc)

### 3- RF:

In [None]:
# Random forest for the binary Winner label. A classifier is used (instead of
# a regressor whose continuous outputs had to be rounded by hand), so
# predict() returns class labels directly.
rfClf = RandomForestClassifier(n_estimators = 2000, random_state = SEED)
rfClf.fit(X_TRAIN, y_TRAIN)
rfPreds = rfClf.predict(X_TEST)
rfAcc = metrics.accuracy_score(y_TEST, rfPreds)
print(rfAcc)


### 4- Ensemble Method:

In [None]:
def _ensemble_batch(samples):
    """Majority vote of the DNN, SVM and RF models for a batch of samples.

    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
        Raw (unscaled) feature rows; a single 1-D row is also accepted.
        Only the DNN input is scaled, matching how each model was trained.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        1 where at least two of the three models vote 1, otherwise 0.
    """
    samples = np.atleast_2d(samples)
    # DNN emits one sigmoid score per row; round it to a hard 0/1 vote.
    dnn_votes = np.rint(dnnClf.predict(scaler.transform(samples), verbose=0).ravel())
    # Use the fitted grid search (it delegates to its refit best estimator);
    # the bare SVC() handed to GridSearchCV is never fitted itself.
    svm_votes = np.asarray(svmGrid.predict(samples))
    # np.rint is a no-op for class labels and rounds regressor-style outputs.
    rf_votes = np.rint(rfClf.predict(samples))

    votes = np.column_stack([dnn_votes, svm_votes, rf_votes]).astype(int)
    # With three binary (0/1) voters the majority label is 1 iff >= 2 vote 1.
    return (votes.sum(axis=1) >= 2).astype(int)


def ensemble(sample):
    """Return the majority-vote label (0 or 1) of the three models for one sample.

    `sample` is a single 1-D raw (unscaled) feature vector.
    """
    return int(_ensemble_batch(sample.reshape(1, -1))[0])


# Score the ensemble on the held-out split with one batched call per model
# instead of three predict calls per test row.
final_preds = _ensemble_batch(X_TEST)

metrics.accuracy_score(y_TEST, final_preds)