In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the football-scenarios CSV into a DataFrame.
cdata_all = pd.read_csv(filepath_or_buffer="Data/Football-Scenarios-DFE-832307.csv")

In [3]:
# Preview the first five rows of the loaded frame.
cdata_all.head(n=5)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,antecedent,antecedent:confidence,orig_antecedent,antecedent_gold,option1,option2,option3,option4,option5
0,831005673,False,finalized,5,11/20/15 20:20,kick a field goal,0.8092,It is first down and 10. The ball is on your o...,,punt,kick a field goal,run,pass,kneel down
1,831005674,False,finalized,5,11/18/15 21:59,kick a field goal,1.0,It is second down and inches. The ball is on y...,,punt,kick a field goal,run,pass,kneel down
2,831005675,False,finalized,5,11/20/15 22:43,kick a field goal,0.6211,It is second down and inches. The ball is on y...,,punt,kick a field goal,run,pass,kneel down
3,831005676,False,finalized,5,11/19/15 7:41,kick a field goal,0.8073,It is second down and inches. The ball is on y...,,punt,kick a field goal,run,pass,kneel down
4,831005677,False,finalized,5,11/21/15 8:01,kick a field goal,1.0,It is second down and inches. The ball is on y...,,punt,kick a field goal,run,pass,kneel down


In [4]:
# Number of rows in the full frame.
cdata_all.shape[0]

3730

In [5]:
# Keep only the rows whose `_golden` flag is False.
# `.copy()` makes `cdata` an independent frame, so adding columns to it later
# does not write into a slice of `cdata_all` (SettingWithCopyWarning).
cdata = cdata_all.loc[cdata_all['_golden'].eq(False)].copy()

In [6]:
# Number of rows left after dropping the golden rows.
cdata.shape[0]

3706

In [7]:
# Preview how the first five scenarios break apart on '.'.
cdata['orig_antecedent'].head().str.split('.')


0    [It is first down and 10,  The ball is on your...
1    [It is second down and inches,  The ball is on...
2    [It is second down and inches,  The ball is on...
3    [It is second down and inches,  The ball is on...
4    [It is second down and inches,  The ball is on...
Name: orig_antecedent, dtype: object

In [8]:
# Split every scenario on '.' and drop the final piece of each split.
split_scenarios = [
    pieces[:-1]
    for pieces in cdata['orig_antecedent'].str.split('.')
]

 Features to extract from each scenario: **down | yards to 1st down | field position | quarter | time left on the clock | score differential**

In [9]:
def convert_ordinal(value):
    """Map an ordinal word ('first' through 'fourth') to its integer rank.

    Any value that is not one of those four words is returned unchanged.
    """
    ordinal_words = ('first', 'second', 'third', 'fourth')
    for rank, word in enumerate(ordinal_words, start=1):
        if value == word:
            return rank
    return value
    

In [10]:
def convert_clock(value):
    """Convert a clock phrase such as '3 minutes' or '30 seconds' to seconds.

    Parameters
    ----------
    value : str
        Text describing the time remaining.

    Returns
    -------
    int or str
        Total number of seconds when a minute and/or second count is found.
        When neither is present, ``value`` is returned unchanged.
    """
    # Descriptive local names avoid shadowing the built-in ``min``.
    # ``\d+`` captures the whole number and never hands non-numeric text to int().
    minutes = re.search(r'(\d+)\s*minute', value)
    seconds = re.search(r'(\d+)\s*second', value)

    if not (minutes or seconds):
        return value

    # Add both parts so a phrase with minutes and seconds keeps its seconds.
    total = 0
    if minutes:
        total += int(minutes.group(1)) * 60
    if seconds:
        total += int(seconds.group(1))
    return total

In [11]:
def convert_scoredelta(value):
    """Convert a score phrase ('down by N' / 'up by N') to a signed integer.

    Parameters
    ----------
    value : str
        Text describing the score situation.

    Returns
    -------
    int or str
        ``-N`` for 'down by N', ``N`` for 'up by N'. When neither phrase is
        present, ``value`` is returned unchanged.
    """
    # A lazy ``(.+?)`` at the very end of a pattern matches a single character,
    # which truncated multi-digit margins ('down by 14' -> -1). ``\d+`` takes
    # the whole number.
    down_by = re.search(r'down by (\d+)', value)
    up_by = re.search(r'up by (\d+)', value)

    if down_by:
        return -int(down_by.group(1))
    if up_by:
        return int(up_by.group(1))
    return value

In [12]:
def convert_fieldpos(value):
    """Turn the captured yard-line text into an integer field position.

    The text is split on single spaces. With more than one token, the second
    token is parsed as the yard number and returned as-is. With a single
    token N, the result is ``100 - N``.
    (Presumably this flips own-side yard lines to a distance from the
    opponent's goal line -- confirm against the scenario wording.)
    """
    tokens = value.split(' ')
    # str.split(' ') always yields at least one token, so only two cases exist.
    if len(tokens) == 1:
        return 100 - int(tokens[0])
    return int(tokens[1])

In [13]:
def extract(line):
    """Parse one scenario sentence into a tuple of game-state features.

    Each feature is located by a fixed phrase pattern. When a phrase is not
    found in ``line`` the corresponding field is NaN.

    Parameters
    ----------
    line : str
        Full scenario text.

    Returns
    -------
    tuple
        ``(line, down, ytg, fieldpos, quarter, clock, scoredelta)`` where
        ``down`` and ``quarter`` go through ``convert_ordinal``, ``ytg`` is the
        integer after 'down and' (with 'inches' treated as 0), and the rest go
        through ``convert_fieldpos``, ``convert_clock`` and
        ``convert_scoredelta``.
    """
    def _field(pattern, convert):
        # Apply ``convert`` to the first captured group, or give NaN if absent.
        found = re.search(pattern, line)
        # ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
        return convert(found.group(1)) if found else np.nan

    def _yards(text):
        # 'inches' to go is recorded as 0 yards.
        return int(text.replace('inches', '0'))

    return (
        line,
        _field(r'It is (.+?) down', convert_ordinal),
        # The sentence-ending period is escaped so it cannot match a digit of
        # the distance itself.
        _field(r'down and (.+?)\. ', _yards),
        _field(r'your (.+?) yardline', convert_fieldpos),
        _field(r'the (.+?) quarter', convert_ordinal),
        _field(r'There is (.+?) left', convert_clock),
        _field(r'You are (.+?) points', convert_scoredelta),
    )

In [14]:
# Run the feature extractor over every scenario sentence, in row order.
data = list(map(extract, cdata['orig_antecedent']))

In [15]:
# Build the feature frame on cdata's own index. `data` was produced row by row
# from `cdata`, so sharing the index keeps later index-aligned assignments
# between `df` and `cdata` matched to the right rows even when the filtered
# index has gaps.
df = pd.DataFrame(
    data,
    columns=['scenario', 'down', 'ytg', 'fieldpos', 'quarter', 'clock', 'scoredelta'],
    index=cdata.index,
)
# Show a bounded preview rather than the whole frame.
df.head(10)

Unnamed: 0,scenario,down,ytg,fieldpos,quarter,clock,scoredelta
0,It is first down and 10. The ball is on your o...,1.0,10.0,20.0,2.0,3.0,-3.0
1,It is second down and inches. The ball is on y...,2.0,0.0,5.0,2.0,3.0,-3.0
2,It is second down and inches. The ball is on y...,2.0,0.0,20.0,2.0,3.0,-3.0
3,It is second down and inches. The ball is on y...,2.0,0.0,5.0,4.0,3.0,-3.0
4,It is second down and inches. The ball is on y...,2.0,0.0,20.0,4.0,3.0,-3.0
5,It is second down and inches. The ball is on y...,2.0,0.0,55.0,4.0,3.0,-7.0
6,It is second down and inches. The ball is on y...,2.0,0.0,80.0,4.0,3.0,-7.0
7,It is second down and 3. The ball is on your o...,2.0,3.0,5.0,4.0,3.0,-3.0
8,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,4.0,3.0,-3.0
9,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,2.0,3.0,-7.0


In [None]:
# Bounded preview of the parsed features instead of dumping the whole frame.
df.head(10)

In [None]:
# Attach the parsed features to cdata by position. `df` was built row by row
# from `cdata`, so row order matches even if the two indexes do not.
# `.assign` returns a new frame, which avoids setting missing columns through
# `.loc` and makes the cell safe to re-run.
cdata = cdata.assign(**{
    name: df[name].to_numpy()
    for name in ['scenario', 'down', 'ytg', 'fieldpos', 'quarter', 'clock', 'scoredelta']
})

In [None]:
# Copy the scenario text onto cdata by position (row order matches because
# `df` was built from `cdata`). `.assign` avoids writing into a possible slice.
cdata = cdata.assign(scenario=df['scenario'].to_numpy())

In [None]:
# First-down rows with at most 10 on the clock.
# NOTE(review): the original third term was the bare `cdata.fieldpos` column,
# which is not a boolean mask and fails under `&`. It is read here as "field
# position is present" -- TODO confirm whether a threshold was intended.
late_first_down = (
    (cdata['clock'] <= 10)
    & (cdata['down'] == 1)
    & cdata['fieldpos'].notna()
)
cdata.loc[late_first_down]

In [None]:
# Display the scenario column.
cdata['scenario']

In [19]:
# Display the chosen answer (antecedent) for every row.
cdata['antecedent']

0             kick a field goal
1             kick a field goal
2             kick a field goal
3             kick a field goal
4             kick a field goal
5                          pass
6                          pass
7             kick a field goal
8             kick a field goal
9                          pass
10                         pass
11            kick a field goal
12                         pass
13                   kneel down
14                   kneel down
15                   kneel down
16                   kneel down
17            kick a field goal
18                          run
19      Don't know / it depends
20                         pass
21                         pass
22                         pass
23                         pass
24            kick a field goal
25                         pass
26                         pass
27                          run
28      Don't know / it depends
29                         pass
                 ...           
3676    

In [21]:
# Attach each row's chosen answer as `guess`, matched by position. `df` was
# built row by row from `cdata`, so row order is the reliable link; index
# alignment would mismatch or produce NaN wherever the two indexes differ.
df = df.assign(guess=cdata['antecedent'].to_numpy())

In [22]:
# Bounded preview of the features plus the `guess` column.
df.head(10)

Unnamed: 0,scenario,down,ytg,fieldpos,quarter,clock,scoredelta,guess
0,It is first down and 10. The ball is on your o...,1.0,10.0,20.0,2.0,3.0,-3.0,kick a field goal
1,It is second down and inches. The ball is on y...,2.0,0.0,5.0,2.0,3.0,-3.0,kick a field goal
2,It is second down and inches. The ball is on y...,2.0,0.0,20.0,2.0,3.0,-3.0,kick a field goal
3,It is second down and inches. The ball is on y...,2.0,0.0,5.0,4.0,3.0,-3.0,kick a field goal
4,It is second down and inches. The ball is on y...,2.0,0.0,20.0,4.0,3.0,-3.0,kick a field goal
5,It is second down and inches. The ball is on y...,2.0,0.0,55.0,4.0,3.0,-7.0,pass
6,It is second down and inches. The ball is on y...,2.0,0.0,80.0,4.0,3.0,-7.0,pass
7,It is second down and 3. The ball is on your o...,2.0,3.0,5.0,4.0,3.0,-3.0,kick a field goal
8,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,4.0,3.0,-3.0,kick a field goal
9,It is second down and 3. The ball is on your o...,2.0,3.0,20.0,2.0,3.0,-7.0,pass
