### Web-scrape for horse results

In [336]:
import os
import sys
import requests
import urllib
import urllib.request
from bs4 import BeautifulSoup
from csv import writer
import re

### Setting path and primary URL

In [337]:
PATH = "usr/local/bin/chromedriver"
URL = "https://tnetwork.trakus.com/tnet/t_NYRA.aspx?EventID=205193&Date=7/10/2022&Type=TBRED&Venue=23&DisplayType=1"

### Importing URLs

In [338]:
urls = pd.read_csv("urllist.csv")
urls_list = list(urls)
urls_series = pd.Series(urls_list)

### Scraping tables from URLS

In [339]:
def scrape_table(URL):
    #Scrapes the table from the trakus website. 
    #Returns pandas df - will still require some cleaning. 
    dfs = pd.read_html(URL)
    df = pd.DataFrame(dfs[4]) # all the required data is in the fourth level of the output.
    df = df.iloc[:,[2,3,4,5,7,8]]
    df.columns = ["Horse name", "Start", "1/4", "1/2", "3/4", "Finish"]
    
    return df

In [340]:
df = scrape_table(urls_list[0])
df.head()

Unnamed: 0,Horse name,Start,1/4,1/2,3/4,Finish
0,Jc's Shooting Star,5,42 23.77,43 47.92,25 3/4 1:12.89,1Neck 1:19.52
1,Sounds Delicious,11/4,13/4 23.23,11 1/2 47.35,11 1/4 1:12.69,26 3/4 1:19.56
2,Crimson Frost,41/4,5 24.06,5 48.42,41 1/4 1:14.05,31 1/2 1:20.74
3,Friend of Liberty,2Head,21/4 23.33,23/4 47.60,31/2 1:13.95,41 1/4 1:21.05
4,Bobby's Song,31/4,32 3/4 23.36,31 47.75,5 1:14.27,5 1:21.31


### Cleaning tables

In [341]:
def clean_table(df):
    df["Finish_position"] = df.index + 1
    df["Finishing time"] = df.Finish.str.split()
    times = []
    for i in df["Finishing time"]:
        times.append(i[-1])
    df["Finishing time"] = times
    
    return df

In [342]:
output = clean_table(df)
output.head()

Unnamed: 0,Horse name,Start,1/4,1/2,3/4,Finish,Finish_position,Finishing time
0,Jc's Shooting Star,5,42 23.77,43 47.92,25 3/4 1:12.89,1Neck 1:19.52,1,1:19.52
1,Sounds Delicious,11/4,13/4 23.23,11 1/2 47.35,11 1/4 1:12.69,26 3/4 1:19.56,2,1:19.56
2,Crimson Frost,41/4,5 24.06,5 48.42,41 1/4 1:14.05,31 1/2 1:20.74,3,1:20.74
3,Friend of Liberty,2Head,21/4 23.33,23/4 47.60,31/2 1:13.95,41 1/4 1:21.05,4,1:21.05
4,Bobby's Song,31/4,32 3/4 23.36,31 47.75,5 1:14.27,5 1:21.31,5,1:21.31


### Getting race numbers

Race numbers are helf in the gif names. Srape all gif names. 

- Example gif name: src="images/HorseRacingTiles/NYRA/4.gif" - where 4 is the horse number. 
- The images will be scraped in order, so we can append the dataframes with the ordered list. 

In [343]:
def extract_race_number(url):
    img_tags = []
    race_numbers = []
    baseUrl = requests.get(url)
    soup = BeautifulSoup(baseUrl.text, 'html.parser')
    allImgs = soup.findAll('img')
    
    for i in allImgs:
        if "images/HorseRacingTiles/NYRA" in str(i):
            
            img_tags.append(str(i))
    
    for i in img_tags:
        m = re.search('src="images/HorseRacingTiles/NYRA/(.+?).gif', i)
        if m:
            found = m.group(1)
        race_numbers.append(str(found))
        
    return race_numbers
    
    

In [344]:
extract_race_number(urls_list[0])

['5', '1', '2', '3', '4']

### Extracting race date from url

In [345]:
def get_date(url):
    m = re.search('Date=(.+?)&Type', url)
    if m:
        found = m.group(1)
    
    return found

get_date(urls_list[1])


'1/1/2019'

In [346]:
urls_list[1]

'https://tnetwork.trakus.com/tnet/t_NYRA.aspx?EventID=149757&Date=1/1/2019&Type=TBRED&Venue=24&DisplayType=1'

### Extracting Event ID

In [347]:
def get_event_id(url):
    m = re.search('EventID=(.+?)&Date', url)
    if m:
        found = m.group(1)
    
    return found

get_event_id(urls_list[1])

'149757'

### Full scrape

In [348]:
def full_scrape(url):
    df = scrape_table(url)
    clean_df = clean_table(df)
    race_numbers = extract_race_number(url)
    clean_df["program_number"] = race_numbers
    clean_df["race_date"] = get_date(url)
    clean_df["event_id"] = get_event_id(url)

    
    return clean_df

df = full_scrape(urls_list[1])
df.head()

Unnamed: 0,Horse name,Start,1/4,1/2,3/4,Finish,Finish_position,Finishing time,program_number,race_date,event_id
0,Bourbon N Rye,1Head,13/4 23.02,11 1/2 47.99,13 1/4 1:00.42,16 1:13.44,1,1:13.44,6,1/1/2019,149757
1,Bourbon Did It,3Head,21 3/4 23.16,23 3/4 48.24,25 1/4 1:00.97,25 1/4 1:14.59,2,1:14.59,1,1/1/2019,149757
2,Tenency,5Head,33/4 23.48,32 48.90,32 1/4 1:01.89,31/4 1:15.57,3,1:15.57,5,1/1/2019,149757
3,Legion Storm,4Head,53/4 23.61,4Head 49.27,42 1/2 1:02.25,45 3/4 1:15.64,4,1:15.64,3,1/1/2019,149757
4,Frost's Song,2Head,63 1/4 23.73,62 3/4 49.64,53/4 1:02.71,53 1/2 1:16.74,5,1:16.74,4,1/1/2019,149757


### Collating all datasets

In [358]:
urls_list = urls_list[:20]

def collate_datasets(list_of_urls):
    df = full_scrape(list_of_urls[0])
    
    for i in range(1, len(list_of_urls)):
        df2 = full_scrape(list_of_urls[i])
        #df = pd.concat(df, df2)
    
    return df

collate_datasets(urls_list)

TypeError: 'float' object is not subscriptable

In [355]:
len(urls_list)

2031

In [362]:

for i in range(20):
    full_scrape(urls_list[10])

In [364]:
len(urls_list)

20