Skip to content

Latest commit

 

History

History
138 lines (112 loc) · 3.74 KB

Corbeau Entry Scrape.md

File metadata and controls

138 lines (112 loc) · 3.74 KB
jupyter
jupytext kernelspec
text_representation
extension format_name format_version jupytext_version
.md
markdown
1.0
0.8.6
display_name language name
Python 3
python
python3
url='https://mtc1.uk/Entry/CSRTC19/EntryList.php'
import unicodedata

def cleanString(s):
    s = unicodedata.normalize("NFKD", str(s))
    #replace multiple whitespace with single space
    s = ' '.join(s.split())
    
    return s
    
import requests
import pandas as pd
import lxml.html as LH
from bs4 import BeautifulSoup

html = requests.get(url).text
soup = BeautifulSoup(html, 'lxml') # Parse the HTML as a string
    
tables = soup.find_all('table')
#tables = LH.fromstring(html).xpath('//table')
entries = pd.read_html('<html><body>{}</body></html>'.format(tables[0]))[0]
#entries['badge'] = [img.find('img')['src'] for img in tables[0].findAll("td", {"class": "final-results-icon"}) ]
#entries.dropna(how='all', axis=1, inplace=True)
entries.head()
classes = [div.text for div in tables[0].findAll("div", {"class": "entry-class"}) ]
print(classes)
row = tables[0].findAll("tr")[1]
cells= row.findAll('td')

entry = cells[1].text

divs = cells[2].findAll('div')
driver = cleanString( divs[0].text )
driver_nationality = divs[1].find('img')['alt']
driver_location = divs[1].text.strip()
driver_club = cleanString( divs[2].text )
#entry, driver, driver_nationality, driver_location, driver_club

divs = cells[3].findAll('div')
navigator = cleanString( divs[0].text )
navigator_nationality = divs[1].find('img')['alt']
navigator_location = divs[1].text.strip()
navigator_club = cleanString( divs[2].text )
#navigator, navigator_nationality, navigator_location, navigator_club

divs = cells[4].findAll('div')
entry_name = cleanString( divs[0].text )
entry_size = 
entry_colur
entry_type = cleanString( divs[1].text )
entry_sponsor = cleanString( divs[2].text )
entry_name, entry_type, entry_sponsor
df = pd.DataFrame()
for row in tables[0].findAll("tr"):
    cells= row.findAll('td')
    if not cells:
        continue

    entry = cells[1].text

    divs = cells[2].findAll('div')
    driver = cleanString( divs[0].text )
    driver_nationality = divs[1].find('img')['alt'] if divs[1].find('img') else ''
    driver_location = divs[1].text.strip()
    driver_club = cleanString( divs[2].text )
    #entry, driver, driver_nationality, driver_location, driver_club

    divs = cells[3].findAll('div')
    navigator =  divs[0].text
    navigator_nationality = divs[1].find('img')['alt'] if divs[1].find('img') else ''
    navigator_location = divs[1].text.strip()
    navigator_club = divs[2].text 
    #navigator, navigator_nationality, navigator_location, navigator_club

    divs = cells[4].findAll('div')
    entry_name =  divs[0].text
    entry_type = divs[1].text
    entry_sponsor = divs[2].text
    entry_name, entry_type, entry_sponsor
    
    entry_bits = entry_type.split('~')
    entry_cc = entry_bits[0]
    entry_colour = entry_bits[1]
    entry_reg = entry_bits[2]
    entry_year = entry_bits[3]
    
    df = pd.concat([df, pd.DataFrame([[entry, driver, driver_nationality, driver_location, driver_club,navigator,
                    navigator_nationality, navigator_location, navigator_club,
                   entry_name, entry_type, entry_sponsor, entry_cc, entry_colour, entry_reg,entry_year ]] )])
    
df.columns = ['Class', 'Driver','Driver_Nationality','Driver_Town','Driver_Clubs',
              'Co-driver','Co-driver_Nationality','Co-driver_Town','Co-driver_Clubs',
              'Car','Details', 'Sponsor','CC','Colour','Registration','Year']
df=df[1:]
df.head()
df.to_csv('corbeau19_entries.csv',index=False)
!ls *.csv