In [0]:
# import Logistic Regression, KNN, Random forest, and SVM
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# import xgboost (trying another classifier)
import xgboost as xgb

# Data Manipulation
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [117]:
url = 'https://raw.githubusercontent.com/shaunhyp57/LMCO-Everglades-Robot-Behavior-Analytics/master/analytics/data/first_tank_disband.csv?token=AIPEXHCRNII6OJVBSCDCRMS6N6724'
cols = ['Game_ID','Player_0','Player_1','Player_1st_Tank_Disband','Turn_Count','Win_State']
data = pd.read_csv(url, names=cols, header=None)

del data['Game_ID']

players={'Player_0':0, 'Player_1':1}

def returncolname(row, colnames):
  return colnames[np.argmax(row.values)]

data['Winner'] = data.apply(lambda x: players.get(returncolname(x, data.columns),1), axis=1)

data.head()

Unnamed: 0,Player_0,Player_1,Player_1st_Tank_Disband,Turn_Count,Win_State,Winner
0,917,1088,1,150,1,1
1,1540,1749,1,150,1,1
2,1371,808,1,150,1,0
3,853,631,1,150,1,0
4,1279,1085,1,150,1,0


In [118]:
# Total number of matches
n_matches = data.shape[0]

# Calculate number of features. '-1' because one is saved as target
n_features = data.shape[1] - 1

# Calculate matches won by Player 0
n_player0wins = len(data[data.Winner == 0])

# Calculate win rate for Player 0
win_rate = (float(n_player0wins) / (n_matches)) * 100

# Print the results
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by Player 0: {}".format(n_player0wins))
print("Win rate of home team: {:.2f}%".format(win_rate))

#data.tail(30)

Total number of matches: 5823
Number of features: 5
Number of matches won by Player 0: 2900
Win rate of home team: 49.80%


In [0]:
#from pandas.plotting import scatter_matrix

#scatter_matrix(data[['Player_1st_Tank_Disband','Player_0','Player_1']], figsize=(10,10));

In [0]:
#feature_cols = ['Player_1st_Tank_Disband', 'Player_0', 'Player_1']
#X = data[feature_cols]

#action = np.zeros((7,2))
#a = np.array([0,1])
#action = np.tile(a, (7, 1))
#action = [[[0,1]] for i in range(7)]
#action.shape

In [119]:
feature_cols = ['Player_1st_Tank_Disband', 'Player_0', 'Player_1']
X = data[feature_cols]
y = data['Winner']

# Standardising the data.
from sklearn.preprocessing import scale

#Center to the mean and component wise scale to unit variance.
#cols = [['Player_0','Player_1']]
#for col in cols:
#X = scale(X)
X = pd.DataFrame(scale(X), columns=feature_cols, index=data.index)

X.head()

Unnamed: 0,Player_1st_Tank_Disband,Player_0,Player_1
0,1.002235,-0.678536,-0.334157
1,1.002235,0.577474,0.982629
2,1.002235,0.236759,-0.891948
3,1.002235,-0.807565,-1.244551
4,1.002235,0.05128,-0.340133


In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=4, stratify = y)

#svc_model = SVC(kernel='linear')
#svc_model.fit(X_train, y_train)
#y_pred = svc_model.predict(X_test)
#print(metrics.accuracy_score(y_test, y_pred))

In [0]:
#for measuring training time
from time import time 
#for measuring accuracy. Considers both precision and recall to compute score
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    
    end = time()
    # Print and return results
    print("Made predictions in {:.4f} seconds.".format(end - start))
    
    return f1_score(target, y_pred, pos_label=0), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print(f1, acc)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

In [122]:
# Initialize the three models (XGBoost is initialized later)
clf_A = LogisticRegression(random_state = 42)
clf_B = KNeighborsClassifier(n_neighbors=10)
clf_C = RandomForestClassifier (max_depth = 2, random_state=0)
clf_D = SVC(random_state = 912, kernel='rbf')
#Boosting refers to this general problem of producing a very accurate prediction rule 
#by combining rough and moderately inaccurate rules-of-thumb
clf_E = xgb.XGBClassifier(seed = 82)

train_predict(clf_A, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_B, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_C, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_D, X_train, y_train, X_test, y_test)
print('')
train_predict(clf_E, X_train, y_train, X_test, y_test)
print('')

Training a LogisticRegression using a training set size of 4658. . .
Trained model in 0.0232 seconds
Made predictions in 0.0016 seconds.
0.9993530299762777 0.9993559467582653
F1 score and accuracy score for training set: 0.9994 , 0.9994.
Made predictions in 0.0015 seconds.
F1 score and accuracy score for test set: 1.0000 , 1.0000.

Training a KNeighborsClassifier using a training set size of 4658. . .
Trained model in 0.0063 seconds
Made predictions in 0.2230 seconds.
0.9948409286328461 0.9948475740661228
F1 score and accuracy score for training set: 0.9948 , 0.9948.
Made predictions in 0.0557 seconds.
F1 score and accuracy score for test set: 0.9966 , 0.9966.

Training a RandomForestClassifier using a training set size of 4658. . .
Trained model in 0.2538 seconds
Made predictions in 0.0370 seconds.
0.938950988822012 0.9390296264491198
F1 score and accuracy score for training set: 0.9390 , 0.9390.
Made predictions in 0.0191 seconds.
F1 score and accuracy score for test set: 0.9338 , 0.

In [0]:
#svc_model = SVC(kernel='linear')
#scores = cross_val_score(svc_model, X, y, cv=10, scoring='accuracy')
#print(scores)

In [115]:
# Import 'GridSearchCV' and 'make_scorer'
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Create the parameters list you wish to tune
#parameters = { 'learning_rate' : [0.1],
#               'n_estimators' : [40],
#               'max_depth': [3],
#               'min_child_weight': [3],
#               'gamma':[0.4],
#               'subsample' : [0.8],
#               'colsample_bytree' : [0.8],
#               'scale_pos_weight' : [1],
#               'reg_alpha':[1e-5]
#             } 
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        } 

# Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label=0)

# Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_
print(clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.4,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=3, missing=None, n_estimators=40, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1, seed=2,
              silent=None, subsample=0.8, verbosity=1)
Made predictions in 0.0063 seconds.
F1 score and accuracy score for training set: 0.9879 , 0.9880.
Made predictions in 0.0022 seconds.
F1 score and accuracy score for test set: 0.9801 , 0.9803.
