## **Predict the winner of a game based on the following features:**
* Player 0 score
* Player 1 score
* Win Type
* Turn Unit Loss Occurred
* Unit Lost Type
* First player to disband a tank

### **Classifiers used:**
* Logistic Regression
* KNN
* Random Forest
* SVM
* XGBoost

In [0]:
# import Logistic Regression, KNN, Random forest, and SVM
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# import xgboost (trying another classifier)
import xgboost as xgb

# Data Manipulation
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [25]:
# Raw CSV hosted on GitHub. The URL points at the `master` branch, so the
# file contents may change between runs.
url = 'https://raw.githubusercontent.com/shaunhyp57/LMCO-Everglades-Robot-Behavior-Analytics/master/analytics/data/First%20Unit%20Lost/firstUnitLost_randact_allcyc.csv'
data = pd.read_csv(url)

# Preview the first five rows as the cell output.
data.head()

Unnamed: 0,numberOfTurns,winType,player_0,player_1,unitLossTurn,unitLostPlayer,unitLostType,combinedStat,winner
0,150.0,1,1188,2452,49.0,1,1,1,1
1,150.0,1,1268,2680,32.0,1,1,1,1
2,150.0,1,1147,2648,37.0,0,1,-1,1
3,150.0,1,1072,1943,31.0,1,2,2,1
4,150.0,1,1268,2446,35.0,0,1,-1,1


In [26]:
# Remove the `combinedStat` column.
# `drop(..., errors='ignore')` is used instead of `del data['combinedStat']`
# so the cell can be re-run safely: `del` raises KeyError once the column
# is already gone.
data = data.drop(columns=['combinedStat'], errors='ignore')

# Used to categorize the winner of the game based on the score of the player column
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
players = {'player_0': 0, 'player_1': 1}

data.head()

Unnamed: 0,numberOfTurns,winType,player_0,player_1,unitLossTurn,unitLostPlayer,unitLostType,winner
0,150.0,1,1188,2452,49.0,1,1,1
1,150.0,1,1268,2680,32.0,1,1,1
2,150.0,1,1147,2648,37.0,0,1,1
3,150.0,1,1072,1943,31.0,1,2,1
4,150.0,1,1268,2446,35.0,0,1,1


In [27]:
# One row per match
n_matches = len(data)

# Every column except the target counts as a feature
n_features = len(data.columns) - 1

# Matches where the `winner` column is 0, i.e. Player 0 won
n_player0wins = int((data['winner'] == 0).sum())

# Player 0 win rate as a percentage of all matches
win_rate = n_player0wins / float(n_matches) * 100

# Report the summary
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by Player 0: {}".format(n_player0wins))
print("Win rate of Player 0: {:.2f}%".format(win_rate))

Total number of matches: 493
Number of features: 7
Number of matches won by Player 0: 30
Win rate of Player 0: 6.09%


In [28]:
from sklearn.preprocessing import scale

# Columns used as model inputs; `winner` is the prediction target.
feature_cols = [
    'numberOfTurns',
    'winType',
    'player_0',
    'player_1',
    'unitLossTurn',
    'unitLostPlayer',
    'unitLostType',
]
y = data['winner']

# Standardise each feature column. The statistics are computed over every
# row in `data`; the original row index and column names are re-attached
# because `scale` returns a bare array.
X = pd.DataFrame(
    scale(data[feature_cols]),
    index=data.index,
    columns=feature_cols,
)

X.head()

Unnamed: 0,numberOfTurns,winType,player_0,player_1,unitLossTurn,unitLostPlayer,unitLostType
0,0.198517,-0.216123,-0.184907,-0.130899,2.525999,0.861423,-0.475624
1,0.198517,-0.216123,0.000296,0.490975,0.453518,0.861423,-0.475624
2,0.198517,-0.216123,-0.279824,0.403694,1.063071,-1.16087,-0.475624
3,0.198517,-0.216123,-0.453452,-1.519203,0.331607,0.861423,0.98079
4,0.198517,-0.216123,0.000296,-0.147264,0.81925,-1.16087,-0.475624


In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the matches. Stratifying on `y` keeps the class ratio of
# the winner column the same in both splits. The split is taken from the
# *unscaled* feature columns so that the scaler below never sees test rows;
# with the same `random_state` and `y`, the row assignment is the same as
# splitting an already-scaled frame.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data[feature_cols], y, test_size=0.2, random_state=4, stratify=y
)

# Fit the standardisation on the training rows only, then apply the same
# transform to both splits. Scaling the full dataset before splitting lets
# test-set means/variances leak into the training features.
scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=feature_cols, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=feature_cols, index=X_test_raw.index
)

In [0]:
#for measuring training time
from time import time 
#for measuring accuracy. Considers both precision and recall to compute score
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fit `clf` on the training data and print how long fitting took. '''

    # Timestamps are taken immediately around the fit call
    fit_started = time()
    clf.fit(X_train, y_train)
    fit_seconds = time() - fit_started

    # Report the elapsed time
    print("Trained model in {:.4f} seconds".format(fit_seconds))
    
    
def predict_labels(clf, features, target):
    ''' Predict with a fitted classifier and score the predictions.

    Prints how long the predict call took and returns a tuple
    `(f1, accuracy)`: the F1 score computed with label 0 as the positive
    class, and the fraction of predictions equal to `target`.
    '''

    # Time only the predict call
    predict_started = time()
    y_pred = clf.predict(features)
    predict_seconds = time() - predict_started

    print("Made predictions in {:.4f} seconds.".format(predict_seconds))

    f1 = f1_score(target, y_pred, pos_label=0)

    # Accuracy = number of matching labels / number of predictions
    n_correct = sum(target == y_pred)
    accuracy = n_correct / float(len(y_pred))

    return f1, accuracy


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train a classifier and print its F1 and accuracy scores.

    Fits `clf` on `(X_train, y_train)` via `train_classifier`, then scores
    it with `predict_labels` on the training split and on the test split,
    printing one formatted line per split. Returns nothing.
    '''

    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))

    # Train the classifier
    train_classifier(clf, X_train, y_train)

    # Print the results of prediction for both training and testing.
    # (A leftover raw `print(f1, acc)` was removed here: it duplicated the
    # formatted training-set line below.)
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

In [31]:
# The four scikit-learn models
clf_A = LogisticRegression(random_state=42)
clf_B = KNeighborsClassifier(n_neighbors=10)
clf_C = RandomForestClassifier(max_depth=2, random_state=0)
clf_D = SVC(random_state=912, kernel='rbf')

# XGBoost: boosting builds an accurate prediction rule by combining many
# rough, moderately inaccurate rules-of-thumb
clf_E = xgb.XGBClassifier(seed=82)

# Train and score every model on the same split, with a blank line
# separating each model's report
for candidate in (clf_A, clf_B, clf_C, clf_D, clf_E):
    train_predict(candidate, X_train, y_train, X_test, y_test)
    print('')

Training a LogisticRegression using a training set size of 394. . .
Trained model in 0.0115 seconds
Made predictions in 0.0009 seconds.
0.9787234042553191 0.9974619289340102
F1 score and accuracy score for training set: 0.9787 , 0.9975.
Made predictions in 0.0008 seconds.
F1 score and accuracy score for test set: 0.9091 , 0.9899.

Training a KNeighborsClassifier using a training set size of 394. . .
Trained model in 0.0019 seconds
Made predictions in 0.0195 seconds.
0.8292682926829268 0.9822335025380711
F1 score and accuracy score for training set: 0.8293 , 0.9822.
Made predictions in 0.0063 seconds.
F1 score and accuracy score for test set: 0.9091 , 0.9899.

Training a RandomForestClassifier using a training set size of 394. . .
Trained model in 0.1501 seconds
Made predictions in 0.0155 seconds.
1.0 1.0
F1 score and accuracy score for training set: 1.0000 , 1.0000.
Made predictions in 0.0125 seconds.
F1 score and accuracy score for test set: 0.9091 , 0.9899.

Training a SVC using a tr

## **Grid Search Cross Validation**

In [32]:
# Grid-search utilities
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Candidate hyper-parameters. Every list holds a single value, so the
# search evaluates exactly one combination.
params = {
    'learning_rate': [0.1],
    'n_estimators': [40],
    'max_depth': [3],
    'min_child_weight': [3],
    'gamma': [0.4],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'scale_pos_weight': [1],
    'reg_alpha': [1e-5],
}

# Base estimator to tune
clf = xgb.XGBClassifier(seed=2)

# Score candidates by F1 with label 0 treated as the positive class
f1_scorer = make_scorer(f1_score, pos_label=0)

# Build the 5-fold grid search and fit it on the training split in one step
# (`fit` returns the fitted search object itself)
grid_obj = GridSearchCV(
    estimator=clf,
    param_grid=params,
    scoring=f1_scorer,
    cv=5,
).fit(X_train, y_train)

# Keep the best estimator found by the search and show its configuration
clf = grid_obj.best_estimator_
print(clf)

# Final F1 / accuracy on the training split after tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

# Final F1 / accuracy on the held-out test split
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.4,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=3, missing=None, n_estimators=40, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1, seed=2,
              silent=None, subsample=0.8, verbosity=1)
Made predictions in 0.0012 seconds.
F1 score and accuracy score for training set: 1.0000 , 1.0000.
Made predictions in 0.0010 seconds.
F1 score and accuracy score for test set: 0.9091 , 0.9899.
