In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the complete CSV export into a DataFrame; filtering happens in a later cell.
cdata_all = pd.read_csv(
    filepath_or_buffer="Data/Football-Scenarios-DFE-832307.csv",
)

In [3]:
# Keep only the rows whose '_golden' value equals False.
# The original index labels of cdata_all are retained on the result.
cdata = cdata_all[cdata_all['_golden'].eq(False)]

 **down | yards to 1st down | field position | quarter | time left on the clock | score differential**

In [4]:
def convert_ordinal(value):
    """Map the words 'first' through 'fourth' to the integers 1 through 4.

    Any other value is handed back to the caller unchanged.
    """
    ordinal_table = (
        ('first', 1),
        ('second', 2),
        ('third', 3),
        ('fourth', 4),
    )
    for word, number in ordinal_table:
        if value == word:
            return number
    # Not one of the recognised ordinal words: pass it through untouched.
    return value
    

In [5]:
def convert_clock(value):
    """Convert a textual clock reading into a number of seconds.

    Parameters
    ----------
    value : str
        Text such as '3 minutes', '30 seconds' or
        '2 minutes and 30 seconds'.

    Returns
    -------
    int or str
        Total seconds (minutes * 60 + seconds) when at least one numeric
        minute or second component is found; otherwise ``value`` is
        returned unchanged.
    """
    # Capture digits only, so surrounding words ("about 3 minutes") cannot
    # end up inside the group and break int().
    minutes_match = re.search(r'(\d+)\s*minute', value)
    seconds_match = re.search(r'(\d+)\s*second', value)

    if not minutes_match and not seconds_match:
        return value

    # Add both components so a minutes-and-seconds reading is not truncated
    # to its minutes part.
    total_seconds = 0
    if minutes_match:
        total_seconds += int(minutes_match.group(1)) * 60
    if seconds_match:
        total_seconds += int(seconds_match.group(1))
    return total_seconds

In [6]:
def convert_scoredelta(value):
    """Convert a 'down by N' / 'up by N' phrase into a signed integer.

    Parameters
    ----------
    value : str
        Text such as 'down by 3' or 'up by 14'.

    Returns
    -------
    int or str
        ``-N`` for 'down by N', ``+N`` for 'up by N'; ``value`` unchanged
        when neither phrase with a numeric amount is present.
    """
    # Use (\d+) rather than a trailing lazy (.+?): a lazy group at the end of
    # a pattern matches exactly one character, which turned "down by 10"
    # into -1 instead of -10.
    down_by = re.search(r'down by (\d+)', value)
    up_by = re.search(r'up by (\d+)', value)

    if down_by:
        return -int(down_by.group(1))
    if up_by:
        return int(up_by.group(1))
    return value

In [7]:
def convert_fieldpos(value):
    """Turn a field-position phrase into an integer.

    The phrase is split on single spaces:

    * more than one token  -> the second token converted to int
    * exactly one token    -> 100 minus that token converted to int

    ``str.split(' ')`` always yields at least one token, so those two cases
    cover every string input.
    """
    tokens = value.split(' ')
    if len(tokens) > 1:
        return int(tokens[1])
    return 100 - int(tokens[0])

In [8]:
def extract(line):
    """Parse one scenario sentence into a tuple of structured fields.

    Parameters
    ----------
    line : str
        The scenario text to parse.

    Returns
    -------
    tuple
        ``(line, down, ytg, fieldpos, quarter, clock, scoredelta)``.
        Each field after ``line`` is the converted value of the matching
        phrase, or ``np.nan`` when that phrase is not found in ``line``.
    """
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and would raise AttributeError whenever a field is missing.
    missing = np.nan

    down = re.search(r'It is (.+?) down', line)
    # The period is escaped so it matches only the literal '.' that ends the
    # sentence, not an arbitrary character.
    ytg = re.search(r'down and (.+?)\. ', line)
    fieldpos = re.search(r'your (.+?) yardline', line)
    quarter = re.search(r'the (.+?) quarter', line)
    clock = re.search(r'There is (.+?) left', line)
    # 'points?' also accepts the singular form ("down by 1 point").
    scoredelta = re.search(r'You are (.+?) points?', line)

    extraction = (
        line,
        convert_ordinal(down.group(1)) if down else missing,
        # "inches" is recorded as 0 yards to go.
        int(ytg.group(1).replace('inches', '0')) if ytg else missing,
        convert_fieldpos(fieldpos.group(1)) if fieldpos else missing,
        convert_ordinal(quarter.group(1)) if quarter else missing,
        convert_clock(clock.group(1)) if clock else missing,
        convert_scoredelta(scoredelta.group(1)) if scoredelta else missing,
    )
    return extraction

In [9]:
# Run extract() over every scenario sentence in cdata, one result tuple per row.
data = list(map(extract, cdata['orig_antecedent']))

In [10]:
# Build the parsed-features frame: one row per extracted tuple, columns in the
# same order as the fields returned by extract().
df = pd.DataFrame.from_records(
    data,
    columns=[
        'scenario',
        'down',
        'ytg',
        'fieldpos',
        'quarter',
        'clock',
        'scoredelta',
    ],
)


Unnamed: 0,scenario,down,ytg,fieldpos,quarter,clock,scoredelta
0,It is first down and 10. The ball is on your o...,1.0,10.0,20.0,2.0,3.0,-3.0
1,It is second down and inches. The ball is on y...,2.0,0.0,5.0,2.0,3.0,-3.0
2,It is second down and inches. The ball is on y...,2.0,0.0,20.0,2.0,3.0,-3.0
3,It is second down and inches. The ball is on y...,2.0,0.0,5.0,4.0,3.0,-3.0
4,It is second down and inches. The ball is on y...,2.0,0.0,20.0,4.0,3.0,-3.0
5,It is second down and inches. The ball is on y...,2.0,0.0,55.0,4.0,3.0,-7.0
6,It is second down and inches. The ball is on y...,2.0,0.0,80.0,4.0,3.0,-7.0
7,It is second down and 3. The ball is on your o...,2.0,3.0,5.0,4.0,3.0,-3.0
8,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,4.0,3.0,-3.0
9,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,2.0,3.0,-7.0


In [11]:
# cdata is a filtered slice of cdata_all and keeps its original index labels,
# whereas df was built from a plain list and has a fresh 0..n-1 index.
# Assigning a Series would align on those labels and put guesses on the wrong
# rows (or NaN) whenever a filtered-out row precedes a kept one. Assign the
# raw values instead so the rows are matched by position.
df['guess'] = cdata['antecedent'].to_numpy()

In [12]:
# Show the frame, including the 'guess' column, as this cell's output.
df

Unnamed: 0,scenario,down,ytg,fieldpos,quarter,clock,scoredelta,guess
0,It is first down and 10. The ball is on your o...,1.0,10.0,20.0,2.0,3.0,-3.0,kick a field goal
1,It is second down and inches. The ball is on y...,2.0,0.0,5.0,2.0,3.0,-3.0,kick a field goal
2,It is second down and inches. The ball is on y...,2.0,0.0,20.0,2.0,3.0,-3.0,kick a field goal
3,It is second down and inches. The ball is on y...,2.0,0.0,5.0,4.0,3.0,-3.0,kick a field goal
4,It is second down and inches. The ball is on y...,2.0,0.0,20.0,4.0,3.0,-3.0,kick a field goal
5,It is second down and inches. The ball is on y...,2.0,0.0,55.0,4.0,3.0,-7.0,pass
6,It is second down and inches. The ball is on y...,2.0,0.0,80.0,4.0,3.0,-7.0,pass
7,It is second down and 3. The ball is on your o...,2.0,3.0,5.0,4.0,3.0,-3.0,kick a field goal
8,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,4.0,3.0,-3.0,kick a field goal
9,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,2.0,3.0,-7.0,pass
