In [22]:
#Load games

import nflgame

# Season to analyse; every later cell works on the games loaded here.
SEASON = 2014
games = nflgame.games(SEASON)

In [26]:
# Extract features
#
# Walks every play of the loaded games, keeps the plays that pass the filters
# below and builds three parallel collections:
#   play_features  - one feature dict per kept play
#   success_labels - 1 when the play is a touchdown (without a fumble) or gains
#                    at least the yards to go, otherwise 0 (numpy array at the end)
#   yard_labels    - yards gained as parsed from the play description
# Plays whose description cannot be parsed are skipped.
import re
from collections import defaultdict
import random
import numpy as np

# Upper bound passed to .limit(); presumably chosen high enough to never
# truncate a season - TODO confirm.
MAX_PLAYS = 5000000

# A play is dropped when its description contains any of these phrases ...
EXCLUDED_PHRASES = (
    ' punt',
    'Two-Minute Warning',
    'spiked the ball to stop the clock',
    'kneels to ',
    'Delay of Game',
    'Penalty on',
    'sacked at',
    'Punt formation',
    'Direct snap to',
    'Aborted',
)
# ... or starts with one of these prefixes.
EXCLUDED_PREFIXES = ('END ', 'End ')

play_features = []
success_labels = []
yard_labels = []
success_cnt = 0

# No argument: the global RNG is seeded from a system source, not a fixed value.
random.seed()
for p in nflgame.combine_plays(games).limit(MAX_PLAYS):
    features = defaultdict(float)
    success = 0
    yards = 0
    desc = ''

    # ---- filter: only un-annotated plays, touchdowns and interceptions ----
    if not (p.note is None or p.note in ('TD', 'INT')):
        continue
    if p.time is None:
        continue
    if p.desc.startswith(EXCLUDED_PREFIXES):
        continue
    if any(phrase in p.desc for phrase in EXCLUDED_PHRASES):
        continue

    # ---- situational features ----
    features['team'] = p.team
    if p.drive.game.away == p.team:
        features['opponent'] = p.drive.game.home
    else:
        features['opponent'] = p.drive.game.away
    # Clock string "MM:SS" converted to seconds.
    timeclock = p.time.clock.split(':')
    features['time'] = float(timeclock[0])*60 + float(timeclock[1])
    features['quarter'] = p.time.qtr
    # NOTE(review): assumes yardline.offset is measured relative to midfield - confirm.
    features['position'] = 50-p.yardline.offset
    features['down'] = p.down
    features['togo'] = p.yards_togo

    if 'Shotgun' in p.desc:
        features['shotgun'] = 1

    # ---- locate the sentence that describes the actual play ----
    # The description is split on '. '; the loop stops at the first cleaned
    # sentence that starts with a "X.Name " style token.
    sentences = p.desc.split('. ')
    for i in range(len(sentences)):
        if 'reported in as eligible' in sentences[i]:
            continue

        # NOTE(review): this tests `desc` (the value left over from the previous
        # iteration), not sentences[i] - confirm that this is intended.
        if (re.search(r'in at QB$', desc) is not None):
            continue

        # Drop a leading "<player> in at QB" announcement from the sentence.
        if ' in at QB' in sentences[i]:
            sentences[i] = re.sub(r"^.+ in at QB", "", sentences[i]).strip()

        # Remove parenthesised parts such as "(Shotgun)".
        desc = sentences[i]
        desc = re.sub(r"\(.+?\)", "", desc).strip()

        # A sentence ending in "to <token>" or consisting of a single token was
        # presumably cut inside an abbreviation, so the next sentence is glued back on.
        if ((re.search(r'to \S+$', desc) is not None) or (re.search(r'^\S+$', desc) is not None)) and (i<len(sentences)-1):
            desc = desc + '.' + re.sub(r"\(.+?\)", "", sentences[i+1]).strip()

        if ((i<len(sentences)-1) and (sentences[i+1][:3] == 'to ')):
            desc = desc + '.' + re.sub(r"\(.+?\)", "", sentences[i+1]).strip()

        if (re.search(r'^\S+\.\S+ ', desc) is not None):
            break

    # ---- play-type features and labels ----
    if 'incomplete' in desc:
        # Incomplete pass: yards and success keep their defaults of 0.
        features['pass'] = 1
        rematch = re.search(r'incomplete \S+ \S+ to ', desc)

        if rematch is None:
            # ball just thrown away, no intended target -> ignore
            continue

        match = rematch.group(0).split()
        features['passlen'] = match[1]
        features['side'] = match[2]
    else:
        if 'no gain' in desc:
            yards = 0
        elif (p.note != 'INT') and ('INTERCEPTED' not in desc):
            rematch = re.search(r'(-?[0-9]+) yard', desc)
            if rematch is None:
                # No yardage in the description -> cannot label the play, ignore it
                # (previously this crashed on rematch.group).
                continue
            yards = float(rematch.group(1))

        if ' pass ' in desc:
            features['pass'] = 1
            rematch = re.search(r'pass \S+ \S+', desc)
            if rematch is None:
                # "pass" is not followed by two tokens -> unparseable, ignore
                continue
            match = rematch.group(0).split()
            if match[1] == 'to':
                # no pass length / side given -> ignore
                continue
            features['passlen'] = match[1]
            features['side'] = match[2]
        else:
            features['pass'] = 0
            if 'up the middle' in desc:
                features['side'] = 'middle'
            else:
                rematch = re.search(r'^\S+ (scrambles )?\S+ \S+', desc)
                if rematch is None:
                    # Show the unparseable play and skip it instead of crashing
                    # on rematch.group below.
                    print(desc)
                    print(p.desc)
                    continue
                offset = 0
                match = rematch.group(0).split()
                if match[1] == 'scrambles':
                    features['qbrun'] = 1
                    offset = 1
                if len(match) < 3 + offset:
                    # "scrambles" followed by a single token -> no direction, ignore
                    continue

                features['side'] = match[1+offset] + ' ' + match[2+offset]

        if (p.note == 'INT') or ('INTERCEPTED' in desc):
            success = 0
        elif (p.touchdown == True) and (' fumble' not in p.desc):
            success = 1
            success_cnt += 1
        elif (yards >= p.yards_togo):
            success = 1
            success_cnt += 1

    play_features.append(features)
    success_labels.append(success)
    yard_labels.append(yards)

    # Debug information
    #if random.randint(0,1000) < 2:
    #    print desc
    #    print p.desc
    #    print features
    #    print 'SUCCESS:',success,'| YARDS:',yards
    #    print "############################################################"

# Array form allows the index-list selection used when balancing the classes.
success_labels = np.array(success_labels)
print(len(play_features))
        

29630


In [36]:
# Share of successful plays in the full, unbalanced data set.
positive = sum(success_labels)
# float() forces true division: without `from __future__ import division`
# (which is only imported in a later cell) integer division would print 0 %.
positive_pct = (float(positive) / len(success_labels)) * 100
print("Positive: %s / %s -> %s %%" % (positive, len(success_labels), positive_pct))

Positive: 8840 / 29630 -> 29.8346270672 %


In [27]:
# Encode categorical features
from sklearn.feature_extraction import DictVectorizer
import numpy as np

# Learn the feature-name -> column mapping from the play dicts and build the
# feature matrix in the same pass; `enc` is kept to encode new plays later.
enc = DictVectorizer()
svm_features = enc.fit_transform(play_features)


In [97]:
#There are too many negative examples in the dataset
# -> keep every positive play and only a random subset of the negative ones.

# Fixed seed and a dedicated generator so the balanced set is identical on
# every run and does not depend on the state of the global RNG.
SAMPLING_SEED = 42
# A negative play is kept when randint(0, 100) draws a value below this
# threshold; randint includes both bounds, so the keep rate is 45/101.
NEGATIVE_KEEP_THRESHOLD = 45
sampler = random.Random(SAMPLING_SEED)

indices = [
    i for i, label in enumerate(success_labels)
    if (label == 1) or (sampler.randint(0, 100) < NEGATIVE_KEEP_THRESHOLD)
]

# Select the same rows from the label array and the feature matrix.
YY = success_labels[indices]
XX = svm_features[indices]

print(len(YY))
print(sum(YY))

18121
8840


In [83]:
#Simple K-Fold cross validation
# __future__ imports have to be the first statement, otherwise this cell is a
# SyntaxError as soon as it is run outside IPython's per-statement compilation.
from __future__ import division
import sys

from sklearn import svm
from sklearn.cross_validation import KFold

k = 6
# Fixed seed so the shuffled folds are the same on every run.
CV_SEED = 0

clf = svm.SVC()
cv = KFold(len(YY), k, shuffle=True, random_state=CV_SEED)

#iterate through the training and test cross validation segments and run the classifier on each one
# The counters accumulate over all folds, so the percentage printed after each
# fold is the running accuracy so far; the last line is the overall accuracy.
success_cnt = 0
total_cnt = 0
iteration = 0
for traincv, testcv in cv:
    iteration = iteration+1
    # Written before the fit so the cell shows progress while the SVM trains.
    sys.stdout.write("Iteration #" + str(iteration) + "... ")
    sys.stdout.flush()
    X_train = XX[traincv]
    Y_train = YY[traincv]
    X_test = XX[testcv]
    Y_test = YY[testcv]

    Y_pred = clf.fit(X_train, Y_train).predict(X_test)

    # Count the correct predictions of this fold in one vectorised comparison.
    total_cnt += len(Y_pred)
    success_cnt += int((Y_pred == Y_test).sum())
    print("%s %%" % ((success_cnt / total_cnt)*100))


print("[Done]")

Iteration #1... 62.6826029216 %
Iteration #2... 61.8027888446 %
Iteration #3... 61.1332447986 %
Iteration #4... 61.5505935088 %
Iteration #5... 61.5553194315 %
Iteration #6... 61.5418672865 %
Total Accuray: 61.5418672865 %


In [96]:
#Example

#Train classifier on the complete balanced data set
from sklearn import svm
clf = svm.SVC().fit(XX, YY)

#Predict result for play
# Hand-built feature dict using the same keys as the extracted plays.
features = defaultdict(float, {
    'team': "GB",
    'opponent': "NYG",
    'time': 425,
    'quarter': 4,
    'position': 10,
    'down': 3,
    'togo': 10,
    'shotgun': 0,
    'pass': 1,
    'passlen': 'short',
    'side': 'left',
    'qbrun': 0,
})
# Encode with the vectorizer fitted on the training plays, then classify.
svm_feats = enc.transform(features)

prediction = clf.predict(svm_feats)
print(prediction)

KeyboardInterrupt: 