# Web Scraping Positions

## Setup Environment

In [1]:
import requests
import lxml.html as lh
import pandas as pd
# Using as guideline: https://towardsdatascience.com/web-scraping-html-tables-with-python-c9baba21059

## Create Function to Extract Table

In [177]:
def scrape_table(url, team, year=2019):

    #Create a handle, page, to handle the contents of the website
    page = requests.get(url)
    
    #Store the contents of the website under doc
    doc = lh.fromstring(page.content)
    
    #Parse data that are stored between <tr>..</tr> of HTML
    tr_elements = doc.xpath('//tr')
    
    ls = [len(T) for T in tr_elements]
    mode = max(set(ls), key=ls.count)
    
    #For each row, store each first element (header) and an empty list
    col = [(t.text_content().strip(),[]) for t in tr_elements[0]]

    #Since out first row is the header, data is stored on the second row onwards
    for j in range(1,len(tr_elements)):

        #T is our j'th row
        T=tr_elements[j]

        #If row is not of size mode, the //tr data is not from our table 
        if len(T)!=mode:
            break

        #i is the index of our column
        i=0

        #Iterate through each element of the row
        for t in T.iterchildren():
            data=str(t.text_content()).strip()
            #Append the data to the empty list of the i'th column
            col[i][1].append(data)
            #Increment i for the next column
            i+=1

    # [len(C) for (title,C) in col]
    df=pd.DataFrame({title:column for (title,column) in col})
    df['Name'] = df['Name'].str.split('\n').str[0].str.strip()
    df['Merge Name'] = df['Name'].str.split(' ',1).str[0].str[0] + '.' + df['Name'].str.split(' ').str[1]
    df['Team'] = team.replace('-',' ').title()
    df['Year'] = year
    
    return df

## Define Team List

In [174]:
team_ls = ['Arizona Cardinals','Atlanta Falcons','Baltimore Ravens','Buffalo Bills','Carolina Panthers','Chicago Bears','Cincinnati Bengals',
           'Cleveland Browns','Dallas Cowboys','Denver Broncos','Detroit Lions','Green Bay Packers','Houston Texans','Indianapolis Colts',
           'Jacksonville Jaguars','Kansas City Chiefs','Las Vegas Raiders','Los Angeles Chargers','Los Angeles Rams','Miami Dolphins','Minnesota Vikings','New England Patriots',
           'New Orleans Saints','New York Giants','New York Jets','Philadelphia Eagles','Pittsburgh Steelers','San Francisco 49ers','Seattle Seahawks',
           'Tampa Bay Buccaneers','Tennessee Titans','Washington Redskins']

team_ls = [team.lower().replace(' ','-') for team in team_ls]

## Iterate through List

In [175]:
url='https://www.lineups.com/nfl/roster/'
df = pd.DataFrame()

for team in team_ls:
    print("Starting {}".format(team))
    df = pd.concat([df, scrape_table(url+team, team)])
    
df.head()

Unnamed: 0,Pos,Name,Number,Rating,Ranking,Height,Weight,Age,Birthday,Exp.,Drafted,Draft Round,Draft Pick,College,Merge Name,Team,Year
0,QB,Kyler Murray,1,77,#27 QB,"5'10""",207,22,8/7/97,2,2019,1.0,1.0,Oklahoma,K.Murray,Arizona Cardinals,2019
1,QB,Drew Anderson,3,70,#62 QB,"6'4""",223,24,10/18/95,1,2019,,,Murray State,D.Anderson,Arizona Cardinals,2019
2,QB,Brett Hundley,7,67,#120 QB,"6'3""",226,27,6/15/93,6,2015,5.0,147.0,UCLA,B.Hundley,Arizona Cardinals,2019
3,RB,Kenyan Drake,41,82,#36 RB,"6'1""",210,26,1/26/94,5,2016,3.0,73.0,Alabama,K.Drake,Arizona Cardinals,2019
4,RB,Chase Edmonds,29,74,#128 RB,"5'9""",205,24,4/13/96,3,2018,4.0,134.0,Fordham,C.Edmonds,Arizona Cardinals,2019


## Using Selenium

In [104]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [165]:
driver = webdriver.Chrome()
driver.get(url+team)

xpath_row = "/html/body/app-root/app-nfl/app-roster/div/div/div[2]/div[2]/div/div[2]/div/div/table/tbody/tr"
xpath_col = "/html/body/app-root/app-nfl/app-roster/div/div/div[2]/div[2]/div/div[2]/div/div/table/tbody/tr/td"

xpath_hrow = "/html/body/app-root/app-nfl/app-roster/div/div/div[2]/div[2]/div/div[2]/div/div/table/thead/tr"
xpath_hcol = "/html/body/app-root/app-nfl/app-roster/div/div/div[2]/div[2]/div/div[2]/div/div/table/thead/tr/th"

row = len(driver.find_elements_by_xpath(xpath_row))
col = int(len(driver.find_elements_by_xpath(xpath_col))/row)

hrow = len(driver.find_elements_by_xpath(xpath_hrow))
hcol = int(len(driver.find_elements_by_xpath(xpath_hcol))/hrow)


In [166]:
header = [driver.find_element_by_xpath("/html/body/app-root/app-nfl/app-roster/div/div/div[2]/div[2]/div/div[2]/div/div/table/thead/tr[1]/th[{}]".format(str(c))).text
         for c in range(1,hcol+1)]

In [170]:
table_ls=[]
for r in range(1,row+1):
    row_ls = []
    for c in range (1,col+1):
        row_ls.append(driver.find_element_by_xpath("/html/body/app-root/app-nfl/app-roster/div/div/div[2]/div[2]/div/div[2]/div/div/table/tbody/tr[{}]/td[{}]".format(str(r),str(c))).text)
    table_ls.append(row_ls)


1, 1
1, 2
1, 3
1, 4
1, 5
1, 6
1, 7
1, 8
1, 9
1, 10
1, 11
1, 12
1, 13
1, 14
2, 1
2, 2
2, 3
2, 4
2, 5
2, 6
2, 7
2, 8
2, 9
2, 10
2, 11
2, 12
2, 13
2, 14
3, 1
3, 2
3, 3
3, 4
3, 5
3, 6
3, 7
3, 8
3, 9
3, 10
3, 11
3, 12
3, 13
3, 14
4, 1
4, 2
4, 3
4, 4
4, 5
4, 6
4, 7
4, 8
4, 9
4, 10
4, 11
4, 12
4, 13
4, 14
5, 1
5, 2
5, 3
5, 4
5, 5
5, 6
5, 7
5, 8
5, 9
5, 10
5, 11
5, 12
5, 13
5, 14
6, 1
6, 2
6, 3
6, 4
6, 5
6, 6
6, 7
6, 8
6, 9
6, 10
6, 11
6, 12
6, 13
6, 14
7, 1
7, 2
7, 3
7, 4
7, 5
7, 6
7, 7
7, 8
7, 9
7, 10
7, 11
7, 12
7, 13
7, 14
8, 1
8, 2
8, 3
8, 4
8, 5
8, 6
8, 7
8, 8
8, 9
8, 10
8, 11
8, 12
8, 13
8, 14
9, 1
9, 2
9, 3
9, 4
9, 5
9, 6
9, 7
9, 8
9, 9
9, 10
9, 11
9, 12
9, 13
9, 14
10, 1
10, 2
10, 3
10, 4
10, 5
10, 6
10, 7
10, 8
10, 9
10, 10
10, 11
10, 12
10, 13
10, 14
11, 1
11, 2
11, 3
11, 4
11, 5
11, 6
11, 7
11, 8
11, 9
11, 10
11, 11
11, 12
11, 13
11, 14
12, 1
12, 2
12, 3
12, 4
12, 5
12, 6
12, 7
12, 8
12, 9
12, 10
12, 11
12, 12
12, 13
12, 14
13, 1
13, 2
13, 3
13, 4
13, 5
13, 6
13, 7
13, 8
13, 9
13, 

In [172]:
df=pd.DataFrame(simple_list,columns=['col1','col2'])
df['Name'] = df['Name'].str.split('\n').str[0].str.strip()
df['Merge Name'] = df['Name'].str.split(' ',1).str[0].str[0] + '.' + df['Name'].str.split(' ').str[1]
df['Team'] = team.replace('-',' ').title()
    df['Year'] = year

TypeError: unhashable type: 'list'