# Importing the Python libs

In [1]:
from selenium import webdriver  
from selenium.common.exceptions import NoSuchElementException  
from selenium.webdriver.common.keys import Keys  
from bs4 import BeautifulSoup
from bs4.element import Tag

import os
import pandas as pd
import datetime

# Configuration: source URL and output file path

In [2]:
# Results page that is scraped by this notebook.
url_basket_france = "http://www.oddsportal.com/basketball/france/lnb/results/"

# Folder receiving the scraped data, then the CSV file written inside it.
scrapped_data_dir = os.path.join(
    "/", "home", "remy", "workspace", "Sport_result_scrapping", "scrapped_data"
)
basketball_results_history_csv = os.path.join(scrapped_data_dir, "basketball_history.csv")

# Gathering the html soup
Selenium drives a real web browser here because the website relies on JavaScript to produce its content.

In [3]:
# Load the page in a real Firefox instance and grab the resulting HTML.
browser = webdriver.Firefox()
try:
    browser.get(url_basket_france)
    html_source = browser.page_source
finally:
    # Always close the browser, even when loading the page fails, so that
    # no Firefox process is left running in the background.
    browser.quit()

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(html_source, 'lxml')

In [4]:
#print(soup.prettify())

# Extracting the information of interest

In [5]:
# Locate the table holding the results. `find` returns None when no such
# table exists, so fail here with an explicit message rather than later with
# an AttributeError on None.
tournament_table = soup.find("table", id="tournamentTable")
if tournament_table is None:
    raise ValueError('No <table id="tournamentTable"> found in the page source')

In [6]:
#print(tournament_table.prettify())

In [7]:
def reformat_date(current_date_of_match, hour):
    """Convert the scraped date and hour texts to a datetime object.

    In particular, handles the rows whose date text starts with "Today, " or
    "Yesterday, " and carries no year: the prefix is removed and the missing
    year is taken from the day the label refers to, so that a "Yesterday"
    match scraped on January 1st is dated in the previous year.

    Parameters
    ----------
    current_date_of_match : str
        Date text, either complete ("01 Jun 2018") or relative
        ("Yesterday, 02 Jun").
    hour : str
        Time of the match formatted as "HH:MM".

    Returns
    -------
    datetime.datetime
        Date and time of the match.

    Raises
    ------
    ValueError
        If the assembled text does not match the "%d %b %Y %H:%M" format.
    """
    # (prefix, number of days between the labelled day and today)
    relative_prefixes = (("Yesterday, ", 1), ("Today, ", 0))

    date_text = str(current_date_of_match)
    for prefix, days_ago in relative_prefixes:
        if prefix in date_text:
            labelled_day = datetime.datetime.now() - datetime.timedelta(days=days_ago)
            # Strip before appending the year so that day/month and year are
            # always separated by a single space, as the format requires.
            day_and_month = date_text.replace(prefix, "").strip()
            date_text = day_and_month + " " + str(labelled_day.year)
            break

    full_date_str = date_text.strip() + " " + str(hour).strip()
    return datetime.datetime.strptime(full_date_str, "%d %b %Y %H:%M")


def reformat_team_name(team_content):
    """Turn a scraped piece of the match label into a plain team name.

    When the piece is a bs4 ``Tag`` (for instance a "<span>" markup), its
    first child is used instead. The value is then converted to ``str`` and
    every " - " separator between local and visitor is removed.
    """
    raw_name = team_content.contents[0] if isinstance(team_content, Tag) else team_content
    return str(raw_name).replace(" - ", "")

In [14]:
# Walk through every row of the results table and collect one record per match.
# Each record holds: date, local team, visitor team, local score, visitor
# score, local odd, visitor odd, number of bookmakers.
current_date_of_match = ""
final_table = []

for row in tournament_table.find_all("tr"):

    # Getting the date: a header cell whose <span> markup contains "datet"
    # carries the date text; it is kept for the match rows that follow.
    for header_cell in row.find_all("th"):
        span_elt = header_cell.find("span")
        if span_elt and "datet" in str(span_elt):
            current_date_of_match = span_elt.find(text=True)

    # Getting the info per match: only rows with exactly 6 cells are matches.
    cells = row.find_all("td")
    if len(cells) != 6:
        continue

    hour = cells[0].find(text=True)
    date = reformat_date(current_date_of_match, hour)

    # The link holds the local team first, then the visitor team.
    # Names avoid shadowing the `locals` builtin.
    match_name_contents = cells[1].find("a").contents
    local_team = str(reformat_team_name(match_name_contents[0]))
    visitor_team = str(reformat_team_name(match_name_contents[1]))

    # Score text is "<local>:<visitor>"
    score_parts = cells[2].find(text=True).split(":")
    score_locals = score_parts[0]
    score_visitors = score_parts[1]

    odd_locals = cells[3].find(text=True)
    odd_visitors = cells[4].find(text=True)
    nb_bookmakers = cells[5].find(text=True)

    final_table.append([date, local_team, visitor_team, score_locals,
                        score_visitors, odd_locals, odd_visitors, nb_bookmakers])

In [9]:
# Column labels, listed in the same order as the values of each record
# stored in final_table.
columns = [
    "date",
    "locals",
    "visitors",
    "score_locals",
    "score_visitors",
    "odd_local",
    "odd_visitor",
    "nb_bookmakers",
]
basketball_history_df = pd.DataFrame(data=final_table, columns=columns)

# Preview of the first five matches.
basketball_history_df.head(n=5)

Unnamed: 0,date,local,visitor,score,odd_local,odd_visitor,nb_bookmakers
0,2018-06-02 16:30:00,Strasbourg,Le Mans,71:80,1.28,3.75,13
1,2018-06-01 18:45:00,Monaco,Limoges,88:71,1.15,5.75,13
2,2018-05-31 18:45:00,Strasbourg,Le Mans,76:66,1.36,3.23,13
3,2018-05-27 16:30:00,Le Mans,Lyon-Villeurbanne,79:68,1.53,2.56,13
4,2018-05-26 18:30:00,Dijon,Limoges,64:81,1.43,2.86,13


In [10]:
# Create the destination folder when it is missing: to_csv does not create
# intermediate directories and would fail otherwise.
output_dir = os.path.dirname(basketball_results_history_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Semicolon-separated file, written without the DataFrame index.
basketball_history_df.to_csv(basketball_results_history_csv, sep=";", index=False)