In [2]:
import ast
import re

import pandas as pd
import requests
from lxml import html

In [203]:
# Exclusive upper bound of the weeks to scrape: range() below stops before it.
CURR_WEEK = 13
root_url = "http://www.nfl.com/injuries?week={}"
# One (week number, page URL) pair for each week from 1 up to CURR_WEEK - 1.
urls = [(week, root_url.format(str(week))) for week in range(1, CURR_WEEK)]

In [93]:
def parse_page(url, timeout=30):
    """Download one injury-report page and return its embedded player JavaScript.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without it a stalled connection blocks forever.

    Returns
    -------
    str
        Concatenated text of every ``<script>`` located under a
        ``div.player-expanded`` inside the ``<li>`` items of the
        ``injuries-players-list`` list whose class contains
        ``schedules-list-matchup``. Empty string when nothing matches.

    Raises
    ------
    requests.HTTPError
        If the server replies with an error status code.
    """
    page = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of silently parsing an error page.
    page.raise_for_status()
    items = html.fromstring(page.content).xpath('//ul[@id="injuries-players-list"]/li')
    class_name = 'schedules-list-matchup'
    js_to_parse = []
    for elem in items:
        # elem.get() is the public attribute accessor and tolerates an <li>
        # without a class attribute (the private _attributes mapping raised KeyError).
        if class_name not in elem.get('class', ''):
            continue
        # The leading "." keeps the search inside this <li>. A bare "//" restarts
        # from the document root, so every matchup returned every script on the
        # page and the combined text repeated each player once per matchup.
        # NOTE(review): assumes div.player-expanded is nested inside the matchup
        # <li> -- confirm against the live page markup.
        js_text = elem.xpath('.//div[@class="player-expanded"]/script/text()')
        js_to_parse.append("".join(js_text))
    return "".join(js_to_parse)

def extract_data(text, week):
    """Turn the JavaScript player records found in ``text`` into a DataFrame.

    Parameters
    ----------
    text : str
        Raw script text containing object literals that start with
        ``{player: ``.
    week : int
        Value stored in the ``week`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``esbId`` (first occurrence kept) with one column
        per record key plus ``week``. When ``text`` holds no records, an empty
        frame with ``esbId`` and ``week`` columns is returned.

    Raises
    ------
    ValueError, SyntaxError
        If a matched record is not a plain literal after its keys are quoted.
    """
    # Raw strings keep the pattern byte-identical while avoiding the invalid
    # "\s" / "\-" escape warnings of a normal string literal.
    # NOTE(review): the character class has no "." or "'", so a record whose
    # values contain those characters is not matched at all -- confirm whether
    # that exclusion is intended.
    record_pattern = r'{player: [0-9A-Za-z\s"\-,:]*[\s]*}'
    # Quote a bare key only where a key can appear (right after "{" or ","),
    # so a "word:" sequence inside a quoted value is left untouched.
    key_pattern = r'([{,]\s*)([A-Za-z_]\w*):'

    records = []
    for item in re.findall(record_pattern, text):
        quoted = re.sub(key_pattern, r'\g<1>"\g<2>":', item)
        # SECURITY: this text comes from a scraped web page. literal_eval only
        # accepts literals, whereas eval() would execute whatever it is given.
        records.append(ast.literal_eval(quoted))

    if not records:
        # Without this guard drop_duplicates fails with KeyError: 'esbId'.
        return pd.DataFrame({
            'esbId': pd.Series(dtype=object),
            'week': pd.Series(dtype='int64'),
        })

    return pd.DataFrame(records).drop_duplicates(subset=['esbId']).assign(week=week)

In [205]:
# Fetch and parse every queued weekly page; one DataFrame per week, in URL order.
injuries = [extract_data(parse_page(url), week) for week, url in urls]

In [206]:
# Stack the per-week frames into one table. Each frame keeps its own row
# labels, so index values repeat across weeks in the combined result.
injuries_weekly = pd.concat(injuries)

In [213]:
# Save the combined table to 'injuries_weekly.csv' in the working directory.
# No index option is passed, so pandas' default index handling applies.
injuries_weekly.to_csv('injuries_weekly.csv')

In [207]:
# Print the column names, non-null counts and dtypes of the combined table.
injuries_weekly.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3241 entries, 0 to 321
Data columns (total 9 columns):
esbId             3241 non-null object
firstName         3241 non-null object
gameStatus        3241 non-null object
injury            3241 non-null object
lastName          3241 non-null object
player            3241 non-null object
position          3241 non-null object
practiceStatus    3241 non-null object
week              3241 non-null int64
dtypes: int64(1), object(8)
memory usage: 151.9+ KB


In [208]:
# Number of rows per week; missing values would get their own bucket.
injuries_weekly['week'].value_counts(dropna=False)

12    322
4     307
5     289
7     284
3     282
11    276
6     274
10    270
9     268
8     263
2     239
1     167
Name: week, dtype: int64

In [209]:
# Number of rows per game status; missing values would get their own bucket.
injuries_weekly['gameStatus'].value_counts(dropna=False)

--              1557
Questionable     966
Out              603
Doubtful         115
Name: gameStatus, dtype: int64

In [210]:
# Number of rows per position; missing values would get their own bucket.
injuries_weekly['position'].value_counts(dropna=False)

LB    509
WR    389
CB    360
S     281
T     278
DE    269
RB    252
DT    244
G     209
TE    175
C     128
QB    110
K      24
LS      9
P       4
Name: position, dtype: int64

In [211]:
# Number of rows per practice status; missing values would get their own bucket.
injuries_weekly['practiceStatus'].value_counts(dropna=False)

Full Participation in Practice       1585
Did Not Participate In Practice       907
Limited Participation in Practice     749
Name: practiceStatus, dtype: int64

In [212]:
# Number of rows per injury label; missing values would get their own bucket.
injuries_weekly['injury'].value_counts(dropna=False)

--                    1557
Knee                   321
Ankle                  254
Hamstring              213
Shoulder               115
Concussion             109
Back                    81
Groin                   80
Foot                    64
Calf                    52
Neck                    44
Quadricep               36
Illness                 34
Hip                     30
Thigh                   27
Ribs                    23
Hand                    23
right Shoulder          21
Chest                   19
Elbow                   12
Toe                     12
Achilles                12
Abdomen                 10
Thumb                   10
Wrist                    9
Rib                      9
Shin                     9
Core Muscle              8
Tricep                   5
Finger                   5
Eye                      5
right Groin              5
Biceps                   4
Stinger                  3
Lower Leg                2
Triceps                  2
right Knee               2
O

## ESB ID match: look up the ESB ID for each player profile URL

In [32]:
# Load the saved profile-URL table and discard its 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv -- confirm).
profiles = pd.read_csv('profile_urls.csv').drop(labels='Unnamed: 0', axis='columns')

In [4]:
# Distinct profile URLs, in first-seen order, as a plain Python list.
profile_urls = profiles['profile_url'].drop_duplicates().tolist()

In [186]:
# Visit each profile page and pull the ESB ID out of its HTML comments.
# The pattern captures everything between "ESB ID: " and the end of that line.
esb_id_pattern = re.compile('ESB ID: (.*?)\n')
esbids_urls = []
# Profile URLs whose page had no "ESB ID:" comment. Collected so that one odd
# page does not abort the whole scrape with a bare IndexError; inspect this
# list after the loop.
esbids_missing = []
for url in profile_urls:
    # Bound the wait and stop on HTTP errors rather than parsing an error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    comments = html.fromstring(page.content).xpath('//html//comment()')
    # Guard against a comment node with no text so the join cannot fail.
    comments_text = "".join(c.text or "" for c in comments)
    match = esb_id_pattern.search(comments_text)
    if match is None:
        esbids_missing.append(url)
        continue
    # First occurrence wins, as with findall(...)[0].
    esbids_urls.append({"esbId": match.group(1).strip(), "profile_url": url})

In [190]:
# Build a two-column table (esbId, profile_url) from the collected records.
esbids_df = pd.DataFrame(esbids_urls)

In [191]:
# Save the ESB ID / profile URL table to 'esbids_urls.csv' in the working
# directory. No index option is passed, so pandas' default index handling applies.
esbids_df.to_csv('esbids_urls.csv')

In [215]:
# Print the column names, non-null counts and dtypes of the ESB ID table.
esbids_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 2 columns):
esbId          436 non-null object
profile_url    436 non-null object
dtypes: object(2)
memory usage: 3.4+ KB
