In [1]:
#Load games

import nflgame
# Season and week whose games feed the rest of the notebook; change them here
# instead of editing the call below.
SEASON = 2015
WEEK = 1
games = nflgame.games(SEASON, week=WEEK)

In [83]:
# Extract features
import re
from collections import defaultdict
import random
import numpy as np


# Upper bound on the number of plays pulled from the combined games.
MAX_PLAYS = 50000

# Only plays whose note is one of these values are modelled.
_KEPT_NOTES = (None, 'TD', 'INT')
# A description starting with one of these prefixes is excluded.
_END_PREFIXES = ('END ', 'End ')
# A description containing any of these phrases is excluded.
_EXCLUDED_PHRASES = (
    ' punt',
    'Two-Minute Warning',
    'spiked the ball to stop the clock',
    'kneels to ',
    'Delay of Game',
)


def _is_modelled_play(p):
    """Return True when play `p` passes the note and description filters.

    A play is kept when its note is None, 'TD' or 'INT', its description does
    not start with 'END '/'End ', and it contains none of _EXCLUDED_PHRASES.
    """
    if p.note not in _KEPT_NOTES:
        return False
    if p.desc.startswith(_END_PREFIXES):
        return False
    return not any(phrase in p.desc for phrase in _EXCLUDED_PHRASES)


def _extract_play(p):
    """Convert one play into `(features, success, yards)`.

    Returns None when the play must be skipped: sacks, passes without a
    length/side, and any description the regular expressions cannot parse.
    Previously only the "incomplete" branch checked for a failed match; the
    other branches raised AttributeError on an unexpected description.

    features -- defaultdict(float) with team, time, quarter, position, down,
                togo and, depending on the play, shotgun, pass, passlen, side
                and qbrun.
    success  -- 1 for a touchdown without ' fumble' in the description or for
                a gain of at least `p.yards_togo`; 0 otherwise (always 0 for
                interceptions and incomplete passes).
    yards    -- yardage parsed from the description (0 when not parsed).
    """
    desc = p.desc
    success = 0
    yards = 0

    features = defaultdict(float)
    features['team'] = p.team
    clock = p.time.clock.split(':')
    # "MM:SS" clock string converted to seconds.
    features['time'] = float(clock[0]) * 60 + float(clock[1])
    features['quarter'] = p.time.qtr
    features['position'] = 50 - p.yardline.offset
    features['down'] = p.down
    features['togo'] = p.yards_togo

    if 'sacked at' in desc:
        return None

    if 'Shotgun' in desc:
        features['shotgun'] = 1

    if 'incomplete' in desc:
        features['pass'] = 1
        target = re.search(r'incomplete \S+ \S+ to ', desc)
        if target is None:
            # ball just thrown away, no intended target -> ignore
            return None
        words = target.group(0).split()
        features['passlen'] = words[1]
        features['side'] = words[2]
        # An incomplete pass keeps success = 0 and yards = 0.
        return features, success, yards

    if 'no gain' not in desc and p.note != 'INT':
        gain = re.search(r'[-]?[0-9]+ yard\s?', desc)
        if gain is None:
            # No "<n> yard" text to read the gain from -> ignore the play.
            return None
        yards = float(gain.group(0).split()[0])

    if ' pass ' in desc:
        features['pass'] = 1
        thrown = re.search(r'pass \S+ \S+', desc)
        if thrown is None:
            return None
        words = thrown.group(0).split()
        if words[1] == 'to':
            # "pass to <player>": neither length nor side is given -> ignore.
            return None
        features['passlen'] = words[1]
        features['side'] = words[2]
    else:
        features['pass'] = 0
        if 'up the middle' in desc:
            features['side'] = 'middle'
        else:
            text = desc
            if 'reported in as eligible' in desc:
                # Parse only what follows the first '.  ' separator; fall back
                # to the whole description when the separator is missing
                # (find() returning -1 used to slice from index 2).
                cut = desc.find('.  ')
                if cut != -1:
                    text = desc[cut + 3:]
            run = re.search(r'\S+\.\S+ (scrambles )?\S+ \S+', text)
            if run is None:
                return None
            words = run.group(0).split()

            offset = 0
            if words[1] == 'scrambles':
                offset = 1
                features['qbrun'] = 1
            if len(words) < 3 + offset:
                # Too few words after the runner's name to form a side.
                return None
            features['side'] = words[1 + offset] + ' ' + words[2 + offset]

    if p.note == 'INT':
        success = 0
    elif (p.touchdown == True) and (' fumble' not in desc):
        success = 1
    elif yards >= p.yards_togo:
        success = 1

    return features, success, yards


play_features = []
success_labels = []
yard_labels = []

# Re-seed the global RNG from system entropy, as the original cell did
# (nothing in this cell draws random numbers).
random.seed()
for p in nflgame.combine_plays(games).limit(MAX_PLAYS):
    if not _is_modelled_play(p):
        continue
    parsed = _extract_play(p)
    if parsed is None:
        continue
    features, success, yards = parsed
    play_features.append(features)
    success_labels.append(success)
    yard_labels.append(yards)

# The three containers are parallel: one entry per kept play.
assert len(play_features) == len(success_labels) == len(yard_labels)

success_labels = np.array(success_labels)
print(len(play_features))
        

1872


In [84]:
# Encode categorical features
from sklearn.feature_extraction import DictVectorizer
import numpy as np

# Learn the feature-name -> column mapping from the extracted plays, then
# encode those same plays as a sparse matrix for the classifier.
enc = DictVectorizer().fit(play_features)
svm_features = enc.transform(play_features)


In [85]:
# Echo the encoded feature matrix; as the last expression of the cell its
# repr (shape, dtype, number of stored elements) is displayed.
svm_features

<1872x55 sparse matrix of type '<type 'numpy.float64'>'
	with 17204 stored elements in Compressed Sparse Row format>

In [86]:
#Train classifier
from sklearn import svm
# Fit a support vector classifier with default settings on every encoded play.
clf = svm.SVC().fit(svm_features, success_labels)
# Echo the fitted estimator so the cell displays its configuration.
clf


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [76]:
#Predict result for play
# One hand-written play, using the same feature names the extraction cell
# produces, encoded with the fitted vectorizer and passed to the classifier.
features = defaultdict(float)
features.update({
    'team': "GB",
    'time': 425,
    'quarter': 4,
    'position': 10,
    'down': 3,
    'togo': 10,
    'shotgun': 0,
    'pass': 1,
    'passlen': 'short',
    'side': 'left',
    'qbrun': 0,
})
svm_feats = enc.transform(features)

clf.predict(svm_feats)

array([0])

In [88]:
#Simple K-Fold cross validation. 10 folds
from __future__ import division  # must be the first statement of the cell

from sklearn.base import clone
from sklearn.cross_validation import KFold

N_FOLDS = 10
cv = KFold(len(success_labels), N_FOLDS)

# For each train/test split, fit an unfitted copy of the classifier on the
# training rows and count how many test rows it labels correctly. Fitting a
# clone leaves the global `clf` untouched; refitting `clf` itself would leave
# it trained on the last fold only, silently changing later predictions.
success_cnt = 0
total_cnt = 0
for traincv, testcv in cv:
    fold_clf = clone(clf)
    fold_clf.fit(svm_features[traincv], success_labels[traincv])

    Y_test = success_labels[testcv]
    Y_pred = fold_clf.predict(svm_features[testcv])

    success_cnt += int((Y_pred == Y_test).sum())
    total_cnt += len(Y_test)

# Accuracy pooled over all folds, as a percentage. Written as a single
# formatted string so the line prints the same text under Python 2 and 3.
print("Accuray: %s %%" % ((success_cnt / total_cnt) * 100))

Accuray: 68.2692307692 %
