## **First Unit Disband:** Target Fortress (Node 4) vs Random Action

#### Predict the winner of a game based on the following features:
* Player 0 score (target Node 4 "Fortress")
* Player 1 score (Random Action)
* Win Type
* Turn Unit Loss Occurred
* Unit Lost Type
* First player to disband a tank

#### Classifiers used:
* Logistic Regression
* KNN
* Random Forest
* SVM
* XG Boost

In [0]:
# import Logistic Regression, KNN, Random forest, and SVM
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# import xgboost (trying another classifier)
import xgboost as xgb

# Data Manipulation
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [5]:
# Load the first-unit-lost results for the "target Node 4 vs. random action"
# matchup directly from the project's GitHub repository (needs network access).
url = 'https://raw.githubusercontent.com/shaunhyp57/LMCO-Everglades-Robot-Behavior-Analytics/master/analytics/ml_models/First%20Unit%20Disband/datasets/firstUnitLost_target4_randact.csv'
data = pd.read_csv(url)

# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,numberOfTurns,winType,player_0,player_1,unitLossTurn,unitLostPlayer,unitLostType,combinedStat,winner
0,150.0,1,1354,1443,39.0,1,3,3,1
1,150.0,1,1468,2278,33.0,1,1,1,1
2,150.0,1,1341,1124,22.0,1,1,1,0
3,150.0,1,781,1989,25.0,1,1,1,1
4,150.0,1,557,1544,26.0,1,3,3,1


In [6]:
# Remove the 'combinedStat' column so it is not used as a feature.
# drop(..., errors='ignore') returns a new frame and does nothing when the
# column is already gone, so this cell can be re-run safely (the previous
# `del data['combinedStat']` raised KeyError on a second run).
# NOTE(review): in the preview rows 'combinedStat' equals 'unitLostType' --
# presumably that redundancy is why it is dropped; confirm.
data = data.drop(columns=['combinedStat'], errors='ignore')

# Used to categorize the winner of the game based on the score of the player column
players={'player_0':0, 'player_1':1}

data.head()

Unnamed: 0,numberOfTurns,winType,player_0,player_1,unitLossTurn,unitLostPlayer,unitLostType,winner
0,150.0,1,1354,1443,39.0,1,3,1
1,150.0,1,1468,2278,33.0,1,1,1
2,150.0,1,1341,1124,22.0,1,1,0
3,150.0,1,781,1989,25.0,1,1,1
4,150.0,1,557,1544,26.0,1,3,1


In [7]:
# One row per match, so the row count is the number of matches.
n_matches = len(data)

# Every column except the 'winner' target is counted as a feature.
n_features = len(data.columns) - 1

# Matches in which Player 0 is recorded as the winner.
n_player0wins = int((data.winner == 0).sum())

# Share of matches won by Player 0, expressed as a percentage.
win_rate = (n_player0wins / n_matches) * 100

# Report the summary, one statistic per line.
print(
    "Total number of matches: {}".format(n_matches),
    "Number of features: {}".format(n_features),
    "Number of matches won by Player 0: {}".format(n_player0wins),
    "Win rate of Player 0: {:.2f}%".format(win_rate),
    sep="\n",
)

Total number of matches: 1000
Number of features: 7
Number of matches won by Player 0: 120
Win rate of Player 0: 12.00%


In [8]:
# Feature matrix X: every column except the 'winner' target; y is the target.
# NOTE(review): 'player_0' and 'player_1' look like the players' final scores,
# which may largely determine 'winner' -- confirm that using them as features
# is intended.
feature_cols = ['numberOfTurns', 'winType', 'player_0','player_1','unitLossTurn','unitLostPlayer','unitLostType']
X = data[feature_cols]
y = data['winner']

# Standardising the data.
from sklearn.preprocessing import scale

# NOTE(review): scale() is applied to the full dataset here, before the
# train/test split performed in a later cell, so the test rows contribute to
# the scaling statistics. Consider fitting the scaler on the training split only.
# The result is wrapped back into a DataFrame to keep column names and index.
X = pd.DataFrame(scale(X), columns=feature_cols, index=data.index)

X.head()

Unnamed: 0,numberOfTurns,winType,player_0,player_1,unitLossTurn,unitLostPlayer,unitLostType
0,0.088485,-0.100504,0.935204,-0.891782,0.827458,0.501562,2.202128
1,0.088485,-0.100504,1.20986,0.626939,0.178217,0.501562,-0.504852
2,0.088485,-0.100504,0.903883,-1.471988,-1.012059,0.501562,-0.504852
3,0.088485,-0.100504,-0.445308,0.101298,-0.687438,0.501562,-0.504852
4,0.088485,-0.100504,-0.984984,-0.70808,-0.579231,0.501562,2.202128


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the matches for testing. The split is stratified on the
# label (the classes are imbalanced) and uses a fixed random_state so it is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=4, stratify = y)

# Disabled quick check: accuracy of a linear-kernel SVC on the held-out split.
#svc_model = SVC(kernel='linear')
#svc_model.fit(X_train, y_train)
#y_pred = svc_model.predict(X_test)
#print(metrics.accuracy_score(y_test, y_pred))

In [0]:
#for measuring training time
from time import time 
#for measuring accuracy. Considers both precision and recall to compute score
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fit ``clf`` on the training data and print how long fitting took.

    Args:
        clf: Estimator exposing ``fit(X, y)``; it is fitted in place.
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        None. The elapsed wall-clock time is printed.
    '''
    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    print("Trained model in {:.4f} seconds".format(elapsed))
    
    
def predict_labels(clf, features, target):
    ''' Predict with a fitted classifier and score the predictions.

    Args:
        clf: Fitted estimator exposing ``predict(X)``.
        features: Feature matrix to predict on.
        target: True labels aligned with ``features``.

    Returns:
        tuple: ``(f1, accuracy)`` where ``f1`` is the F1 score computed with
        label 0 as the positive class and ``accuracy`` is the fraction of
        predictions equal to ``target``.
    '''
    # Time only the prediction call.
    tic = time()
    predictions = clf.predict(features)
    toc = time()

    print("Made predictions in {:.4f} seconds.".format(toc - tic))

    f1 = f1_score(target, predictions, pos_label=0)
    accuracy = sum(target == predictions) / float(len(predictions))
    return f1, accuracy


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train a classifier and print its F1 and accuracy scores.

    The classifier is fitted on the training split, then scored on both the
    training and the test split. All results are printed; nothing is returned.

    Args:
        clf: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
    '''

    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))

    # Train the classifier
    train_classifier(clf, X_train, y_train)

    # Print the results of prediction for both training and testing.
    # (A leftover debug print of the raw, unformatted scores was removed; the
    # formatted line below reports the same two values.)
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

In [11]:
# Baseline (untuned) classifiers, each with a fixed seed where one is accepted.
clf_A = LogisticRegression(random_state=42)
clf_B = KNeighborsClassifier(n_neighbors=10)
clf_C = RandomForestClassifier(max_depth=2, random_state=0)
clf_D = SVC(kernel='rbf', random_state=912)

# Boosting combines many rough, moderately inaccurate rules of thumb into a
# single, much more accurate prediction rule.
clf_E = xgb.XGBClassifier(seed=2)

# Train and score every baseline in turn, with a blank line after each report.
for baseline_clf in (clf_A, clf_B, clf_C, clf_D, clf_E):
    train_predict(baseline_clf, X_train, y_train, X_test, y_test)
    print('')

Training a LogisticRegression using a training set size of 800. . .
Trained model in 0.0162 seconds
Made predictions in 0.0009 seconds.
0.9841269841269841 0.99625
F1 score and accuracy score for training set: 0.9841 , 0.9962.
Made predictions in 0.0006 seconds.
F1 score and accuracy score for test set: 0.9787 , 0.9950.

Training a KNeighborsClassifier using a training set size of 800. . .
Trained model in 0.0026 seconds
Made predictions in 0.0256 seconds.
0.9347826086956522 0.985
F1 score and accuracy score for training set: 0.9348 , 0.9850.
Made predictions in 0.0072 seconds.
F1 score and accuracy score for test set: 0.8837 , 0.9750.

Training a RandomForestClassifier using a training set size of 800. . .
Trained model in 0.1177 seconds
Made predictions in 0.0106 seconds.
0.43902439024390244 0.91375
F1 score and accuracy score for training set: 0.4390 , 0.9137.
Made predictions in 0.0079 seconds.
F1 score and accuracy score for test set: 0.4516 , 0.9150.

Training a SVC using a traini

### **Grid Search Cross Validation**

We perform Grid Search CV, a hyperparameter-tuning procedure that evaluates every combination in a parameter grid with cross-validation to determine the optimal values for a given model.

In [0]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def log_reg_gridsearch(X_train, y_train, X_test, y_test, nfolds):
    """Tune a LogisticRegression with an exhaustive grid search.

    Candidates are ranked by F1 score with label 0 as the positive class,
    using ``nfolds``-fold cross-validation on the training split. The train
    and test scores of the best estimator are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        nfolds: Value passed to GridSearchCV as ``cv``.

    Returns:
        The best estimator found by the search.
    """
    search = GridSearchCV(
        LogisticRegression(),
        param_grid={
            'penalty' : ['l1', 'l2'],
            'C': [0.001,.009,0.01,.09,1,5,10,25],
            'solver' : ['liblinear'],
            'max_iter' : [100, 1000,2500, 5000]
        },
        scoring=make_scorer(f1_score, pos_label=0),
        cv=nfolds,
    )
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # Score the tuned model on both splits.
    train_f1, train_acc = predict_labels(tuned_model, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    test_f1, test_acc = predict_labels(tuned_model, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))

    return tuned_model



def knn_gridsearch(X_train, y_train, X_test, y_test, nfolds):
    """Tune a KNeighborsClassifier with an exhaustive grid search.

    Candidates are ranked by F1 score with label 0 as the positive class,
    using ``nfolds``-fold cross-validation on the training split. The train
    and test scores of the best estimator are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        nfolds: Value passed to GridSearchCV as ``cv``.

    Returns:
        The best estimator found by the search.
    """
    search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid={
            'n_neighbors' : [3,5,11,19],
            'weights' : ['uniform','distance'],
            'metric' : ['euclidean', 'manhattan']
        },
        scoring=make_scorer(f1_score, pos_label=0),
        cv=nfolds,
    )
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # Score the tuned model on both splits.
    train_f1, train_acc = predict_labels(tuned_model, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    test_f1, test_acc = predict_labels(tuned_model, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))

    return tuned_model


def random_forest_gridsearch(X_train, y_train, X_test, y_test, nfolds):
    """Tune a RandomForestClassifier with an exhaustive grid search.

    Candidates are ranked by F1 score with label 0 as the positive class,
    using ``nfolds``-fold cross-validation on the training split. The train
    and test scores of the best estimator are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        nfolds: Value passed to GridSearchCV as ``cv``.

    Returns:
        The best estimator found by the search.
    """
    # Only these three hyper-parameters are searched. Earlier candidate lists
    # (max_depth, min_samples_leaf, bootstrap, ...) were built but never passed
    # to the search, so they have been removed.
    paramsRandForest = {
      'n_estimators': [50, 150, 250],
      'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
      'min_samples_split': [2, 4, 6]
    }

    # Initialize the classifier (fixed seed so the forests are repeatable)
    clf = RandomForestClassifier(random_state=1)

    # Make an f1 scoring function using 'make_scorer'
    f1_scorer = make_scorer(f1_score,pos_label=0)

    # Perform grid search on the classifier using the f1_scorer as the scoring method
    grid_search = GridSearchCV(
                          clf,
                          scoring = f1_scorer,
                          param_grid=paramsRandForest,
                          cv = nfolds
                          )
    # Fit the grid search object to the training data and find the optimal parameters
    grid_search = grid_search.fit(X_train,y_train)

    # Get the estimator
    rand_forest_best = grid_search.best_estimator_

    # Report the final F1 score for training and testing after parameter tuning
    f1, acc = predict_labels(rand_forest_best, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

    f1, acc = predict_labels(rand_forest_best, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

    return rand_forest_best


def svm_gridsearch(X_train, y_train, X_test, y_test, nfolds):
    """Tune an SVC with an exhaustive grid search.

    Candidates are ranked by F1 score with label 0 as the positive class,
    using ``nfolds``-fold cross-validation on the training split. The train
    and test scores of the best estimator are printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        nfolds: Value passed to GridSearchCV as ``cv``.

    Returns:
        The best estimator found by the search.
    """
    search = GridSearchCV(
        SVC(),
        param_grid={
            'C': [0.1,1, 10, 100],
            'gamma': [1,0.1,0.01,0.001],
            'kernel': ['rbf', 'poly', 'sigmoid']
        },
        scoring=make_scorer(f1_score, pos_label=0),
        cv=nfolds,
    )
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # Score the tuned model on both splits.
    train_f1, train_acc = predict_labels(tuned_model, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(train_f1, train_acc))

    test_f1, test_acc = predict_labels(tuned_model, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(test_f1, test_acc))

    return tuned_model


def xg_boost_gridsearch(X_train, y_train, X_test, y_test, nfolds, n_iter=50):
    """Tune an XGBClassifier over a large hyper-parameter space.

    The parameter space below contains 7,761,600 combinations, which is not
    tractable for an exhaustive search (each combination is fitted ``nfolds``
    times). By default a seeded randomized search therefore samples ``n_iter``
    combinations. Candidates are ranked by F1 score with label 0 as the
    positive class. The train and test scores of the best estimator are
    printed.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        nfolds: Value passed to the search as ``cv``.
        n_iter: Number of parameter combinations to sample. Pass ``None`` to
            run the original exhaustive GridSearchCV over the full grid.

    Returns:
        The best estimator found by the search.
    """
    # Imported locally so this function does not depend on edits to the
    # notebook's import cells.
    from sklearn.model_selection import RandomizedSearchCV

    paramsXGB = {
      'n_estimators': [50,100,150,250,300,400,500,1000],
      'max_depth': [5,6,7,8,9,10],
      'max_delta_step': [0,1,2,3,4,5,6,7,8,9,10],
      'min_child_weight': [1,2,3,4,5],
      'subsample': [0.5,0.6,0.7,0.8,0.9,1],
      'colsample_bytree': [0.2,0.3,0.4,0.5,0.6,0.7,0.8],
      'colsample_bylevel': [0.2,0.3,0.4,0.5,0.6,0.7,0.8],
      'learning_rate': [0.002,0.005,0.007,0.008,0.01,0.05,0.07,0.1,0.25,0.5]
    }

    # Initialize the classifier
    clf = xgb.XGBClassifier(seed=2)

    # Make an f1 scoring function using 'make_scorer'
    f1_scorer = make_scorer(f1_score,pos_label=0)

    if n_iter is None:
        # Exhaustive search over every combination (original behaviour).
        search = GridSearchCV(
            clf,
            scoring=f1_scorer,
            param_grid=paramsXGB,
            cv=nfolds,
        )
    else:
        # Sample a fixed number of combinations; the seed keeps the sampled
        # candidates the same from run to run.
        search = RandomizedSearchCV(
            clf,
            param_distributions=paramsXGB,
            n_iter=n_iter,
            scoring=f1_scorer,
            cv=nfolds,
            random_state=2,
        )

    # Fit the search object to the training data and find the best parameters
    search.fit(X_train, y_train)

    # Get the estimator
    xgb_best = search.best_estimator_

    # Report the final F1 score for training and testing after parameter tuning
    f1, acc = predict_labels(xgb_best, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

    f1, acc = predict_labels(xgb_best, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

    return xgb_best

We perform Grid Search CV on the model that produced the highest accuracy to see whether tuned parameters yield a better solution.

In [55]:
# Tune logistic regression with 10-fold cross-validation; the returned best
# estimator is displayed as the cell output.
# NOTE(review): the same search is run again in the cell that tunes all
# models, so this standalone call may be redundant -- confirm.
log_reg_gridsearch(X_train, y_train, X_test, y_test, 10)

Made predictions in 0.0006 seconds.
F1 score and accuracy score for training set: 0.9948 , 0.9988.
Made predictions in 0.0006 seconds.
F1 score and accuracy score for test set: 1.0000 , 1.0000.


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
from sklearn.ensemble import VotingClassifier

# Number of cross-validation folds shared by every search below.
nfolds = 10

# Tune each model family in turn. Every call prints the train/test scores of
# its best estimator and returns that estimator.
log_reg_best = log_reg_gridsearch(X_train, y_train, X_test, y_test, nfolds)
print('')
knn_best = knn_gridsearch(X_train, y_train, X_test, y_test, nfolds)
print('')
rf_best = random_forest_gridsearch(X_train, y_train, X_test, y_test, nfolds)
print('')
svm_best = svm_gridsearch(X_train, y_train, X_test, y_test, nfolds)
print('')
xgb_best = xg_boost_gridsearch(X_train, y_train, X_test, y_test, nfolds)

LogisticRegression(C=5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Made predictions in 0.0005 seconds.
F1 score and accuracy score for training set: 0.9948 , 0.9988.
Made predictions in 0.0005 seconds.
F1 score and accuracy score for test set: 1.0000 , 1.0000.

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')
Made predictions in 0.0090 seconds.
F1 score and accuracy score for training set: 1.0000 , 1.0000.
Made predictions in 0.0030 seconds.
F1 score and accuracy score for test set: 0.8182 , 0.9600.

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       cr

In [0]:
#create a dictionary of our models
estimators = [('log_reg', log_reg_best),('knn', knn_best), ('rf', rf_best), ('svm', svm_best)]

#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')

In [45]:
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
ensemble.score(X_test, y_test)

0.995