# Importing the Python libs

In [1]:
from selenium import webdriver  
from selenium.common.exceptions import NoSuchElementException  
from selenium.webdriver.common.keys import Keys  
from bs4 import BeautifulSoup
from bs4.element import Tag

import os
import pandas as pd
import datetime

# Configuration: defining urls and other stuff

In [2]:
# Results page of the French LNB basketball league on oddsportal
url_basket_france = "http://www.oddsportal.com/basketball/france/lnb/results/"

# Absolute location of the csv file that receives the scraped match history
_history_csv_path_parts = (
    "/", "home", "remy", "workspace",
    "Sport_result_scrapping",
    "scrapped_data",
    "basketball_history.csv",
)
basketball_results_history_csv = os.path.join(*_history_csv_path_parts)

# Gathering the html soup
A real web browser driven by Selenium is needed here because the website builds its content with JavaScript.

In [3]:
# Load the page in a real Firefox instance and keep the resulting html.
browser = webdriver.Firefox()
try:
    browser.get(url_basket_france)
    html_source = browser.page_source
finally:
    # Always close the browser, even when loading the page fails,
    # so that no Firefox process is left running.
    browser.quit()

# Parse the downloaded html with the lxml parser
soup = BeautifulSoup(html_source, 'lxml')

In [4]:
#print(soup.prettify())

# Extracting the information of interest

In [5]:
# The match results are read from the <table> whose id is "tournamentTable"
tournament_table = soup.find("table", id="tournamentTable")

# find() returns None when nothing matches: stop here with an explicit message
# instead of failing later with an AttributeError on None.
if tournament_table is None:
    raise ValueError('No <table id="tournamentTable"> found in the downloaded page')

In [6]:
#print(tournament_table.prettify())

In [7]:
def date_reformat(current_date_of_match, hour):
    """Convert the scraped date and hour texts to a clean datetime object

    Full dates are expected as "05 May 2018". In particular, handles the
    first rows which contain a relative label ("Yesterday, 05 May") instead
    of the full date: the label is removed and the missing year is taken
    from the calendar day the label refers to, so that a "Yesterday" read on
    the 1st of January gets the previous year.
    NOTE(review): "Today, " is presumably used by the website the same way
    as "Yesterday, " -- confirm on the live page.

    Args:
        current_date_of_match: text of the date header, e.g. "05 May 2018"
            or "Yesterday, 05 May".
        hour: text of the match hour, e.g. "18:00".

    Returns:
        datetime.datetime built from the date and the hour.

    Raises:
        ValueError: if the cleaned text does not match "%d %b %Y %H:%M".
    """

    # Relative label -> number of days to go back from now
    relative_labels = (("Today, ", 0), ("Yesterday, ", 1))

    date_text = str(current_date_of_match).strip()

    for label, days_back in relative_labels:
        if label in date_text:
            reference_day = datetime.datetime.now() - datetime.timedelta(days=days_back)
            date_text = date_text.replace(label, "").strip()
            # The separating space is required: "05 May2018" does not match "%d %b %Y"
            date_text = date_text + " " + str(reference_day.year)
            break

    full_date_str = date_text + " " + str(hour).strip()
    full_date_dt = datetime.datetime.strptime(full_date_str, "%d %b %Y %H:%M")

    return full_date_dt


def reformat_team_name(team_content):
    """Turn one piece of the scraped match name into a clean team name

    A Tag is first replaced by its first child, which drops the
    surrounding markup; the result is converted to str and the " - "
    separator between local and visitor is removed.
    """

    # Only Tag objects are unwrapped; any other input is used as it is
    raw_name = team_content.contents[0] if isinstance(team_content, Tag) else team_content

    return str(raw_name).replace(" - ", "")

In [8]:
# Text of the last date header met; the match rows below a header share it
current_date_of_match = ""
# One list per match:
# [date, local, visitor, score, odd_local, odd_visitor, nb_bookmakers]
final_table = []

for row in tournament_table.find_all("tr"):

    # getting the date: it is the text of a <span> whose markup contains "datet"
    th_elt = row.find_all("th")
    for elt in th_elt:
        span_elt = elt.find("span")
        if span_elt and "datet" in str(span_elt):
            current_date_of_match = span_elt.find(text=True)

    # getting the info per match: only the rows with exactly 6 cells are matches
    cells = row.find_all("td")
    if len(cells) != 6:
        continue

    hour = cells[0].find(text=True)
    date = date_reformat(current_date_of_match, hour)

    # The first two children of the link are used as local and visitor;
    # reformat_team_name already returns clean str values.
    match_name_contents = cells[1].find("a").contents
    local = reformat_team_name(match_name_contents[0])
    visitor = reformat_team_name(match_name_contents[1])

    score = cells[2].find(text=True)
    odd_local = cells[3].find(text=True)
    odd_visitor = cells[4].find(text=True)
    nb_bookmakers = cells[5].find(text=True)

    final_table.append([date, local, visitor, score, odd_local, odd_visitor, nb_bookmakers])

In [9]:
# Column labels, in the same order as the values stored in each row of final_table
columns = [
    "date",
    "local",
    "visitor",
    "score",
    "odd_local",
    "odd_visitor",
    "nb_bookmakers",
]
basketball_history_df = pd.DataFrame(data=final_table, columns=columns)

# Preview of the first rows (head() shows 5 rows by default)
basketball_history_df.head()

Unnamed: 0,date,local,visitor,score,odd_local,odd_visitor,nb_bookmakers
0,2018-05-05 18:00:00,Boulazac,Hyeres-Toulon,71:62,1.3,3.59,13
1,2018-05-05 18:00:00,Cholet,JL Bourg,69:73,2.45,1.58,13
2,2018-05-05 18:00:00,Strasbourg,Antibes,93:77,1.06,9.0,13
3,2018-05-05 16:30:00,Levallois,Chalon/Saone,83:72,1.54,2.53,13
4,2018-05-04 18:00:00,Chalons-Reims,Dijon,79:92,2.0,1.82,13


In [10]:
# to_csv does not create missing folders: make sure the destination exists first
output_dir = os.path.dirname(basketball_results_history_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the match history as a ";"-separated file, without the index column
basketball_history_df.to_csv(basketball_results_history_csv, sep=";", index=False)