In [1]:
import requests #imports
from bs4 import BeautifulSoup #https://realpython.com/beautiful-soup-web-scraper-python/
import pandas as pd
import re #https://stackoverflow.com/questions/17336943/removing-non-numeric-characters-from-a-string
import numpy as np 
#https://medium.com/analytics-vidhya/how-to-scrape-a-table-from-website-using-python-ce90d0cfb607

In [2]:
#make the soup of HTML
def initialize(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="fittPageContainer")
    tables = results.find_all("div", class_="ResponsiveTable ResponsiveTable--fixed-left pt4")
    return results, tables

#Find the name of the player
def nameandposition(results):
    name_and_position = results.find_all("div", class_="PlayerHeader__Main_Aside min-w-0 flex-grow flex-basis-0")
    for element in name_and_position:
        name = element.find("h1", class_="PlayerHeader__Name")
        name = name.text.strip()
        
    for element in name_and_position:
        pos = element.find("ul", class_="PlayerHeader__Team_Info list flex pt1 pr4 min-w-0 flex-basis-0 flex-shrink flex-grow nowrap")
    numrole = pos.text.strip().split('#')[1]
    role = ''.join(i for i in numrole if not i.isdigit())
    return name, role

def basic_data(results):
    immutable_elements = results.find_all("div", class_="fw-medium clr-black")
    HTWT = immutable_elements[0].text.strip()
    DOB = immutable_elements[1].text.strip()
    COLLEGE = immutable_elements[2].text.strip()
    Draftpick = immutable_elements[3].text.strip()
    activity_status = immutable_elements[4].text.strip()
    
    #Seperate Height and Weight
    x = HTWT.split(', ')
    ht = x[0]
    wt = x[1]
    #process height
    y = ht.split(' ')
    ft = y[0]
    inch = y[1]
    ft = int(re.sub('[^0-9]','', ft))
    inch = int(re.sub('[^0-9]','', inch))
    height = ((12*ft)+inch)
    #process weight
    weight = int(re.sub('[^0-9]','', wt))
    
    #process DOB
    x = DOB.split(' (')
    DOB = x[0]
    y = DOB.split('/')
    DOB = int(re.sub('[^0-9]','', y[2]))
    
    #process Draft pick data
    x = Draftpick.split(': ')
    draftyear = int(re.sub('[^0-9]','', x[0]))
    y = x[1].split(', Pk ')
    draftround = int(re.sub('[^0-9]','', y[0]))
    z = y[1].split(' ')
    draftpick = int(re.sub('[^0-9]','', z[0]))
    draftteam = z[1]
    
    return height, weight, DOB, COLLEGE, draftyear, draftround, draftpick, draftteam, activity_status

def build_basedata(results):
    name, position = nameandposition(results)
    height, weight, DOB, COLLEGE, draftyear, draftround, draftpick, draftteam, activity_status = basic_data(results)
    data = {
        'name':[name],
        'position':[position],
        'height (inches)':[height],
        'weight (pounds)':[weight],
        'DOB':[DOB],
        'COLLEGE':[COLLEGE],
        'draftyear':[draftyear],
        'draftround':[draftround],
        'draftpick':[draftpick],
        'draftteam':[draftteam],
        'activity_status':[activity_status]}
    dataframe = pd.DataFrame(data)
    return dataframe

def maketable(data, statname):
    #Find the first part of the table from the html
    data1 = data.find("table", class_="Table Table--align-right Table--fixed Table--fixed-left")
    
    # Obtain every title of columns with tag <th>
    headers = []
    for i in data1.find_all("th"):
        title = i.text
        headers.append(title)
    
    # Create a dataframe
    data1data = pd.DataFrame(columns = headers)

    # Create a for loop to fill table
    for j in data1.find_all("tr")[1:]:
        row_data = j.find_all("td")
        row = [i.text for i in row_data]
        length = len(data1data)
        data1data.loc[length] = row
    
    #find the second part of the table from the html
    data2 = data.find("table", class_="Table Table--align-right")
    
    # Obtain every title of columns with tag <th>
    headers = []
    for i in data2.find_all("th"):
        title = i.text
        headers.append(title)
    
    #Add name tag to all headers
    append_str = statname
    headers = [append_str + sub for sub in headers]
    
    # Create a dataframe
    data2data = pd.DataFrame(columns = headers)
    
    # Create a for loop to fill mydata
    for j in data2.find_all("tr")[1:]:
        row_data = j.find_all("td")
        row = [i.text for i in row_data]
        length = len(data2data)
        data2data.loc[length] = row
        
    datadata = pd.concat([data1data,data2data], axis = 1)
    datadata.drop(datadata.tail(1).index,inplace=True)
    return datadata

def makereturningtable(data):
    
    data1headers = data.find("table", class_="Table Table--align-right Table--fixed Table--fixed-left")
    
    headers = []
    for i in data1headers.find_all("th"):
        title = i.text
        headers.append(title)
        
    data1data = pd.DataFrame(columns = headers[1:])
    
    data1 = data.find("tbody", class_="Table__TBODY")
    
    for j in data1.find_all("tr")[0:]:
        row_data = j.find_all("td")
        row = [i.text for i in row_data]
        length = len(data1data)+2
        data1data.loc[length] = row
        
        
    data1data = data1data.reset_index()
    del data1data['index']

    data2headers = data.find("table", class_="Table Table--align-right")
    headers = []
    for i in data2headers.find_all("th"):
        title = i.text
        headers.append(title)
    append_str = "returning: "
    
    for i in range(0, 8):
        headers[i] = "PUNTS: " + headers[i]
    for i in range(8, 13):
        headers[i] = "KICKOFFS: " + headers[i]
    
    headers = [append_str + sub for sub in headers]
    
    data2data = pd.DataFrame(columns = headers[2:])
    
    data2 = data.find_all("tbody", class_="Table__TBODY")[1]
    for j in data2.find_all("tr")[0:]:
        row_data = j.find_all("td")
        row = [i.text for i in row_data]
        length = len(data2data)
        data2data.loc[length] = row

    
    datadata = pd.concat([data1data,data2data], axis = 1)
    datadata.drop(datadata.tail(1).index,inplace=True)
    return datadata

def bulkdata(results, tables):
    returnlist = []
    for i in range(0,len(tables)):
        if "Passing" in tables[i].text:
            passingdat = maketable(tables[i], "Passing: ")
            returnlist.append(passingdat)
        elif "Rushing" in tables[i].text:
            rushingdat = maketable(tables[i], "Rushing: ")
            returnlist.append(rushingdat)
        elif "Receiving" in tables[i].text:
            receivingdat = maketable(tables[i], "Receiving: ")
            returnlist.append(receivingdat)
        elif "Returning" in tables[i].text:
            returningdat = makereturningtable(tables[i])
            returnlist.append(returningdat)
        elif "Defensive" in tables[i].text:
            defensivedat = maketable(tables[i], "Defensive: ")
            returnlist.append(defensivedat)
        elif "Scoring" in tables[i].text:
            scoringdat = maketable(tables[i], "Scoring: ")
            returnlist.append(scoringdat)
    return returnlist

def combinedata(a, basedata):
    length = len(a)
    finaldata = pd.merge(a[0], a[1], on=['season','Team'], how = 'outer')
    for i in range(2,length):
        finaldata = pd.merge(finaldata, a[i], on=['season','Team'], how = 'outer')
        
    rows = len(finaldata.index)
    
    #source: https://stackoverflow.com/questions/50788508/how-can-i-replicate-rows-in-pandas
    addition = basedata
    addition = pd.DataFrame(np.repeat(addition.values, rows, axis=0))
    addition.columns = basedata.columns
    
    finaldata = pd.concat([addition,finaldata], axis = 1)
    
    finaldata = finaldata.apply(pd.to_numeric, errors='ignore')
    #add age, source:https://towardsdatascience.com/create-new-column-based-on-other-columns-pandas-5586d87de73d
    finaldata['age'] = finaldata.apply(lambda row: row.season - row.DOB, axis=1)
    
    #convert all numerical cells to int or float
    
    return finaldata

def fantasy_add(dataframe1, url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find('div', class_="Page-shell")
    table = results.find("table", class_="TableBase-table")
    
    
    dataheaders = table.find("tr", class_="TableBase-headTr")
    #headers = []
    headers = ['season','team','games','fantasy points', 'fantasy points per game', 'rushing attempts', 'rushing yards', 'avg yards per rush', 'rushing touchdowns', 'longest rush', 'rushing first downs', 'recieving targets', 'receptions','recieving yards', 'average yards per reception', 'recieving touchdowns', 'longest reception','recieving first downs','total fumbles','fumbles lost']
    #for i in dataheaders.find_all("th"):
        #title = i.text
        #headers.append(title)

    tabledata = pd.DataFrame(columns = headers)
    
    data = table.find("tbody")
    
    for j in data.find_all("tr")[0:]:
        row_data = j.find_all("td")
        row = [i.text for i in row_data]
        length = len(tabledata)
        tabledata.loc[length] = row

    tabledata = tabledata.replace(r'\n',' ', regex=True) 
    
    fantasydata = tabledata[['season','fantasy points','fantasy points per game']]
    fantasydata = fantasydata.drop([0, 1, 2])
    fantasydata.drop(fantasydata.tail(1).index,inplace=True)
    
    fantasydata = fantasydata.apply(pd.to_numeric, errors='ignore')
    dataframe1 = dataframe1.apply(pd.to_numeric, errors='ignore')
    
    finaldata = pd.merge(dataframe1, fantasydata, on=['season'], how = 'outer')
    
    return finaldata

In [3]:
def fullscrape(espn, cbs):
    results, tables = initialize(espn)
    name, position = nameandposition(results)
    height, weight, DOB, COLLEGE, draftyear, draftround, draftpick, draftteam, activity_status = basic_data(results)
    basedata = build_basedata(results)
    a = bulkdata(results,tables)
    df = combinedata(a, basedata)
    df1 = fantasy_add(df, cbs)
    return df1

In [4]:
df = fullscrape("https://www.espn.com/nfl/player/stats/_/id/4040761/devin-singletary", "https://www.cbssports.com/nfl/players/2241251/devin-singletary/career-stats/")

In [5]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # https://stackoverflow.com/questions/19124601/pretty-print-an-entire-pandas-series-dataframe
    print(df)

              name      position  height (inches)  weight (pounds)   DOB  \
0  DevinSingletary  Running Back               67              203  1997   
1  DevinSingletary  Running Back               67              203  1997   
2  DevinSingletary  Running Back               67              203  1997   

  COLLEGE  draftyear  draftround  draftpick draftteam activity_status  season  \
0     FAU       2019           3         74     (BUF)          Active    2019   
1     FAU       2019           3         74     (BUF)          Active    2020   
2     FAU       2019           3         74     (BUF)          Active    2021   

  Team  Rushing: GP  Rushing: ATT  Rushing: YDS  Rushing: AVG  Rushing: TD  \
0  BUF           12           151           775           5.1            2   
1  BUF           16           156           687           4.4            2   
2  BUF           17           188           870           4.6            7   

   Rushing: LNG  Rushing: FD  Rushing: FUM  Rushing: LST 