In [1]:
import io
import json
import os
import sys
import pandas as pd
from bs4 import BeautifulSoup as bs

import requests
from requests.sessions import Session

import numpy as np
from scipy import stats
import datetime
import time
import statistics as st
import requests_cache
import concurrent
from threading import Thread,local
import asyncio
import aiohttp
import tqdm

from sklearn.linear_model import LinearRegression

from collections.abc import MutableMapping

In [2]:
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}

hippo_letrot = ["LE MONT-SAINT-MICHEL-PONTORSON", "BORDEAUX", "LE CROISE-LAROCHE"]
hippo_pmu =  ["LE MONT SAINT MICHEL", "LE BOUSCAT", "LE CROISE LAROCHE"]

requests_cache.install_cache('turf_cache')

# Prediction

In [3]:
import nest_asyncio
nest_asyncio.apply()

thread_local = local()

def get_session() -> Session:
    if not hasattr(thread_local, 'session'):
        thread_local.session =  requests.Session()
    return thread_local.session

def get_request_with_session(url:str):
    session = get_session()
    with session.get(url, headers=headers) as response:
        return response

def gen_rows(df):
    for row in df.itertuples(index=False):
        yield row._asdict()

### OOP

In [9]:
class Programme():
    def __init__(self, debut, fin):
        self.date_debut = datetime.date.fromisoformat("-".join(debut.split("-")[::-1]))
        self.date_fin = datetime.date.fromisoformat("-".join(fin.split("-")[::-1]))
        
        n_days = self.date_fin - self.date_debut
        cur = max(self.date_fin - datetime.timedelta(days=90), self.date_debut)
        
        intervalle_date = [self.date_fin, cur]
        
        while cur > self.date_debut:
            cur = max(self.date_debut, cur - datetime.timedelta(days=90))
            
            intervalle_date.append(cur)        
        self.intervalles = [(i,j + datetime.timedelta(days=1)) for i,j in zip(intervalle_date, intervalle_date[1:])]
        
        loop = asyncio.get_event_loop()
        programme = loop.run_until_complete(asyncio.gather(*[self._get_programme_from_letrot(inter) for inter in self.intervalles]))
        
        programme = [item for sublist in programme for item in sublist]
        
        self.programme = pd.DataFrame(programme)

    async def combined_prog(self):
        return await asyncio.gather(*[self._get_programme_from_letrot(inter) for inter in self.intervalles])


    async def _get_pmu_program(self, session, date):
        date_pmu = date.strftime("%d%m%Y")
        async with session.get(f"https://online.turfinfo.api.pmu.fr/rest/client/65/programme/{date_pmu}/") as res:
            try:
                return await res.json()
            except:
                return None

    async def _get_programme_from_letrot(self, date):
        
        debut = date[1].strftime("%d-%m-%Y")
        fin = date[0].strftime("%d-%m-%Y")
#         print(debut)
        programme = []
        
        url = f"https://www.letrot.com/fr/courses/calendrier-resultats?publish_up={debut}&publish_down={fin}"
        r = get_request_with_session(url)
        soup = bs(r.text, "html.parser")
        reunion_raw = soup.find_all("a", {"class": "racesHippodrome"})
        current_date_reunion = "0"
        current_programme = {}
        
        num_days = (date[0] - date[1]).days + 1
        date_list = [date[0] - datetime.timedelta(days=x) for x in range(num_days)]
        prog_pmu = {}
        
        tasks = []
        async with aiohttp.ClientSession() as session:
            for d in date_list:
                tasks.append(self._get_pmu_program(session, d))
            res_prog_pmu = await asyncio.gather(*tasks)    
        for i in range(len(res_prog_pmu)):
            prog_pmu.update({date_list[i].strftime("%Y-%m-%d"): res_prog_pmu[i]})

        
        
        for i in range(len(reunion_raw)):
            reunion = reunion_raw[i]
            date = reunion.get("href").split("/")[-2]
            hippodrome = reunion.text[2:].strip()
            for i in range(len(hippo_letrot)):
                hippodrome = hippodrome.replace(hippo_letrot[i], hippo_pmu[i])
            
            hippodrome = hippodrome.replace(" (A ", " ").replace(")", "")
            date_pmu = "".join(date.split("-")[::-1])
            
            if date in prog_pmu:
                current_programme = prog_pmu[date]
            else:
                continue
                
            numReunion = 0
            for reunion_pmu in current_programme["programme"]["reunions"]:
                if hippodrome in reunion_pmu["hippodrome"]["libelleCourt"]:
                    numReunion = reunion_pmu["numOfficiel"]
            
            if numReunion == 0:
                continue
            course = {"date": date, "idHippo": reunion.get("href").split("/")[-1], "Hippodrome": hippodrome, "lien": reunion.get("href")}
            course["numReunion"] = numReunion
            programme.append(course)
        return programme


In [10]:
P = Programme("01-01-2022", "25-11-2022")
P.programme

Unnamed: 0,date,idHippo,Hippodrome,lien,numReunion
0,2022-08-28,8201,BEAUMONT DE LOMAGNE,/stats/courses/programme/2022-08-28/8201,3
1,2022-08-28,4301,JULLIANGES,/stats/courses/programme/2022-08-28/4301,7
2,2022-08-28,5305,LAVAL,/stats/courses/programme/2022-08-28/5305,8
3,2022-08-29,7500,VINCENNES,/stats/courses/programme/2022-08-29/7500,1
4,2022-08-30,1475,CABOURG,/stats/courses/programme/2022-08-30/1475,5
...,...,...,...,...,...
669,2022-02-27,5008,GRAIGNES,/stats/courses/programme/2022-02-27/5008,10
670,2022-02-27,3507,MAURE DE BRETAGNE,/stats/courses/programme/2022-02-27/3507,11
671,2022-02-27,3001,NIMES,/stats/courses/programme/2022-02-27/3001,9
672,2022-02-27,5102,REIMS,/stats/courses/programme/2022-02-27/5102,12


In [11]:
P2 = Programme("01-01-2021", "30-12-2021")
P2.programme

Unnamed: 0,date,idHippo,Hippodrome,lien,numReunion
0,2021-10-02,4413,NANTES,/stats/courses/programme/2021-10-02/4413,3
1,2021-10-02,8002,AMIENS,/stats/courses/programme/2021-10-02/8002,7
2,2021-10-02,7101,CLUNY,/stats/courses/programme/2021-10-02/7101,8
3,2021-10-03,4405,CORDEMAIS,/stats/courses/programme/2021-10-03/4405,8
4,2021-10-03,5008,GRAIGNES,/stats/courses/programme/2021-10-03/5008,9
...,...,...,...,...,...
684,2021-01-02,7500,VINCENNES,/stats/courses/programme/2021-01-02/7500,1
685,2021-01-03,7616,MAUQUENCHY,/stats/courses/programme/2021-01-03/7616,4
686,2021-01-03,7500,VINCENNES,/stats/courses/programme/2021-01-03/7500,1
687,2021-01-03,3302,LE BOUSCAT,/stats/courses/programme/2021-01-03/3302,7


In [16]:
P3 = pd.concat([P2.programme,P.programme], axis=0)

In [25]:
P3 = P3[P3["idHippo"] =="7500"]
P3

Unnamed: 0,date,idHippo,Hippodrome,lien,numReunion
11,2021-10-05,7500,VINCENNES,/stats/courses/programme/2021-10-05/7500,4
16,2021-10-08,7500,VINCENNES,/stats/courses/programme/2021-10-08/7500,1
29,2021-10-12,7500,VINCENNES,/stats/courses/programme/2021-10-12/7500,4
33,2021-10-15,7500,VINCENNES,/stats/courses/programme/2021-10-15/7500,1
46,2021-10-19,7500,VINCENNES,/stats/courses/programme/2021-10-19/7500,1
...,...,...,...,...,...
660,2022-02-21,7500,VINCENNES,/stats/courses/programme/2022-02-21/7500,1
662,2022-02-22,7500,VINCENNES,/stats/courses/programme/2022-02-22/7500,1
664,2022-02-24,7500,VINCENNES,/stats/courses/programme/2022-02-24/7500,1
665,2022-02-25,7500,VINCENNES,/stats/courses/programme/2022-02-25/7500,1


In [30]:
class Courses():
    def __init__(self, programme) -> None:
        if isinstance(programme, Programme):
            self.programme = programme.programme
        elif isinstance(programme, pd.DataFrame):
            self.programme = programme
        self.courses = self._get_all_course_in_programme()

    def _get_all_course_in_programme(self):
        courses = []  

        def _request_race(row):
            courses_list = []
            try:
                url = f"https://www.letrot.com/{row['lien']}/json"
                date_pmu = "".join(row["date"].split("-")[::-1])    
                r = requests.get(url, headers=headers)
                courses = r.json()
                for c in courses["course"]:
                    if c["discipline"] == "Attelé":
                        course_id = row["date"].replace("-", "") + str(row["idHippo"]) + str(c["numCourse"])
                        courses_list.append({"date": row["date"], "id": course_id, "numReunion": row["numReunion"], "hippodrome": courses["nomHippodrome"], "idHippo": row["idHippo"],**c})
                return courses_list
            except:
                pass
            
        def gen_rows(df):
            for row in df.itertuples(index=False):
                yield row._asdict()

        with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
            res = executor.map(_request_race, gen_rows(self.programme))
            
            for i in res:
                courses.extend(i)
            
        return pd.DataFrame(courses)

In [31]:
courses = Courses(P3)

In [32]:
courses.courses

Unnamed: 0,date,id,numReunion,hippodrome,idHippo,heureCourse,discipline,numCourse,prix,allocation,distance,categorie,typePiste,conditionsEngagement,hasTracking,hasVideoHeat,statut,classement,linkPrix,replay
0,2021-10-05,2021100575001,4,VINCENNES,7500,16:27,Attelé,1,PRIX MYRRHA,18 000,2 100,Course R,,"Course ""a reclamer""<br />Départ à l'autostart<...",True,True,16,2 - 3 - 14 - 4 - 12,https://www.letrot.com/stats/fiche-course/2021...,"<a href=""https://www.letrot.com/fr/replay-cour..."
1,2021-10-05,2021100575002,4,VINCENNES,7500,17:02,Attelé,2,PRIX DE LA CORREZE,33 000,2 700,Course D,,"Pour poulains entiers et hongres de 4 ans, n'a...",True,False,16,5 - 3 - 1 - 7 - 9,https://www.letrot.com/stats/fiche-course/2021...,"<a href=""https://www.letrot.com/fr/replay-cour..."
2,2021-10-05,2021100575004,4,VINCENNES,7500,18:12,Attelé,4,PRIX KURSE,66 000,2 100,Groupe III,,Course Européenne<br />Départ à l'autostart<br...,True,True,16,6 - 10 - 4 - 8 - 1,https://www.letrot.com/stats/fiche-course/2021...,"<a href=""https://www.letrot.com/fr/replay-cour..."
3,2021-10-05,2021100575005,4,VINCENNES,7500,18:45,Attelé,5,PRIX DE FORMERIE,38 000,2 100,Course C,,Départ à l'autostart<br />Pour pouliches de 3 ...,True,True,16,3 - 8 - 10 - 4 - 2,https://www.letrot.com/stats/fiche-course/2021...,"<a href=""https://www.letrot.com/fr/replay-cour..."
4,2021-10-05,2021100575006,4,VINCENNES,7500,19:15,Attelé,6,PRIX CONSTANTIA,11 000,2 850,Course G,,"Amateurs<br />Pour 6, 7 et 8 ans, n'ayant pas ...",True,True,16,3 - 10 - 9 - 12 - 6,https://www.letrot.com/stats/fiche-course/2021...,"<a href=""https://www.letrot.com/fr/replay-cour..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1654,2022-02-27,2022022775004,1,VINCENNES,7500,15:15,Attelé,4,PRIX AMERIQUE RACES ZETURF PX DE PARIS,400 000,4 150,Groupe I,,Groupe I<br />Course Internationale<br />Pour ...,True,False,16,9 - 12 - 7 - 1 - 11,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."
1655,2022-02-27,2022022775005,1,VINCENNES,7500,15:50,Attelé,5,PRIX AGRIFOURNITURES.FR,65 000,2 850,Course C,,"Pour 7 à 11 ans inclus (F à B), n'ayant pas ga...",True,False,16,9 - 16 - 10 - 7 - 13,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."
1656,2022-02-27,2022022775007,1,VINCENNES,7500,17:00,Attelé,7,PRIX DE LA MAYENNE,90 000,2 700,Groupe III,,Course Européenne<br />Pour 5 et 6 ans (H et G...,True,False,16,5 - 9 - 10 - 15 - 6,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."
1657,2022-02-27,2022022775008,1,VINCENNES,7500,17:35,Attelé,8,PRIX DE BOIS LE ROI,39 000,2 100,Course D,,Départ à l'autostart<br />Pour pouliches de 4 ...,True,False,16,6 - 2 - 13 - 10 - 14,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."


In [103]:
courses_groupes = courses.courses[courses.courses["categorie"].isin(['Groupe I','Groupe II','Groupe III'])]

In [33]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler,StandardScaler

class DataError(ValueError): pass

class Partants():
    def __init__(self, course, training=True):
        
        to_scale = ['nombreCourses',
                    'nombreVictoires',
                    'nombrePlaces',
                    'nombrePlacesSecond',
                    'nombrePlacesTroisieme',
                    'gainsParticipant_gainsCarriere',
                    'gainsParticipant_gainsVictoires',
                    'gainsParticipant_gainsPlace',
                    'gainsParticipant_gainsAnneeEnCours',
                    'gainsParticipant_gainsAnneePrecedente',
                    'nbDiscalifieMusic',
                    'nbVictoireMusic',
                    'nbPlaceMusic',
                    'prefered_dist',
                    'distToPreferedDist',
                    'meanReduction',
                    'medianReduction',
                    'maxReduction',
                    'minReduction',
                    'timeSinceRecord',
                    'tpsLastRace',
                    'nbArriveMusic',
                    'recordAbs',
                    'nbCourseCouple',
                    'nbVictoiresCouple',
                    'nb2emeCouple',
                    'nb3emeCouple',
                    'txReussiteCouple',
                    'nbCourseHippo',
                    'nbVictoiresHippo',
                    'nb2emeHippo',
                    'nb3emeHippo',
                    'txReussiteHippo']
        
        self.course = course
        self.courseId = course["id"]
        self.date = course["date"]
        self.idHippo = course["idHippo"]
        self.numCourse = course["numCourse"]
        self.numReunion = course["numReunion"]
        
        
        self.distance = int(course["distance"].replace(" ", ""))
        self.categorie = course["categorie"].split(" ")[1]
        
        self.training = training
        self.classement = None
        
        scaler = StandardScaler()
        
        try:
            self.info_partants = self._info_tableau_partant()
            df = pd.DataFrame(self.info_partants)
            df.loc[:, df.columns.isin(to_scale)] = scaler.fit_transform(df.loc[:, df.columns.isin(to_scale)].to_numpy())
            
            self.info_partants = df.to_dict('records')
            
        except:
            self.info_partants = None
        
        
    def _request_tableau_partants(self):
        r = get_request_with_session(f"https://www.letrot.com/stats/fiche-course/{self.date}/{self.idHippo}/{self.numCourse}/partants/tableau")
        soup = bs(r.text, "html.parser")
        headers_table = soup.find("table", {"id": "result_table"}).find("thead").find("tr").find_all("th")
        table = soup.find("table", {"id": "result_table"}).find("tbody")
        rows = table.find_all("tr")
        return rows, headers_table
    
    def _request_tableau_arrive(self):
        r = get_request_with_session(f"https://www.letrot.com/stats/fiche-course/{self.date}/{self.idHippo}/{self.numCourse}/resultats/arrivee-definitive")
        soup = bs(r.text, "html.parser")
        headers_table = soup.find("table", {"id": "result_table"}).find("thead").find("tr").find_all("th")
        table = soup.find("table", {"id": "result_table"}).find("tbody")
        rows = table.find_all("tr")
        
        classement = {row.select("td")[1].text : row.select("td")[0].find("span", {"class": "bold"}).text for row in rows}       
        self.classement = classement
        return rows,classement
    
    def _request_partant_pmu(self):
        date_pmu = "".join(self.date.split("-")[::-1])  
        participants_pmu = get_request_with_session(f"https://online.turfinfo.api.pmu.fr/rest/client/65/programme/{date_pmu}/R{self.numReunion}/C{self.numCourse}/participants")
        try:
            pmu_jsoned = participants_pmu.json()["participants"]
            participants = pd.json_normalize(pmu_jsoned, sep="_").to_dict(orient="records")
            participants_with_id = [dict(item, **{"id": self.courseId, "numReunion": self.numReunion}) for item in participants]  
            return participants_with_id
        except:
            raise Exception("Erreur API PMU")
            
        
        
    def _info_tableau_partant(self):
        chevaux = []
        
        try:
            tableau_partants, headers_table = self._request_tableau_partants()
            tableau_arrivee,classement = self._request_tableau_arrive()
            tableau_pmu = self._request_partant_pmu()
        except:
            return None

        info_couple = self.get_info_couple()
        info_chevaux_hippo = self.get_info_cheval_hippo()
    
        chevaux.extend(tableau_pmu)
        
        for i,row in enumerate(tableau_partants):
                num = row.select("td")[0].find("span", {"class": "bold"}).text
                col = row.select("td")
                cheval = {}
                cheval["num"] = num
                cheval["nom"] = col[1].text
                
                
                
                cheval["numCoursePMU"] = f"R{self.numReunion}C{self.numCourse}"

                if self.training:
                    if num == "NP":
                        cheval["classement"] = "NP"
                    else:
                        cheval["classement"] = classement[num]
                cheval["id"] = self.courseId
                cheval["date"] = self.date
                cheval["url"] = col[1].find("a").get("href")


                cheval["fer"] = int(col[3].text) if col[3].text else 0
                cheval["firstTimeFer"] = 1 if col[3].find("div", {"class", "fer-first-time"}) else 0
                cheval["sex"] = 0 if col[4].text == "M" else 1
                cheval["age"] = int(col[5].text)
                cheval["dist"] = int(col[6].text.replace(" ", "").replace("\n", ""))
                cheval["driver"] = col[7].find("a").get("href")
                cheval["trainer"] = col[8].find("a").get("href")

                if "Avis" in headers_table[9].text:
                    cheval["avisTrainer"] = int(col[9].get("data-order"))
                    avis = col.pop(9)
                    col.insert(-1, avis)
                else:
                    cheval["avisTrainer"] = 2

                cheval["music"] = list(filter(lambda x: "a" in x, col[9].text.replace("D", "0").replace("Ret", "0").replace("T", "0").split()))
                cheval["music"] = list(map(lambda x: x[0], cheval["music"]))

                cheval["music"] = list(filter(lambda x: x.isnumeric(), cheval["music"]))

                cheval["music"] = list(map(int, cheval["music"]))

                cheval["nbDiscalifieMusic"] = cheval["music"].count(0)
                cheval["nbVictoireMusic"] = cheval["music"].count(1)
                cheval["nbPlaceMusic"] = sum(map(lambda x : x <=3 and x > 0,cheval["music"]))
                
                
                if len(cheval["music"]) < 4:
                    raise DataError("not enough data")
                    
                cheval.update(self.get_info_cheval(cheval["url"], self.date,cheval["driver"]))
                try:
                    cheval.update(self.get_tracking(cheval["url"]))
                except:
                    pass
                cheval.update(info_couple[i])
                cheval.update(info_chevaux_hippo[i])

                cheval["formeVictoire"] = 1 if cheval["nbVictoireMusic"]/len(cheval["music"]) > 0.33 else 0
                cheval["formePlace"] = 1 if cheval["nbPlaceMusic"]/len(cheval["music"]) > 0.33 else 0
                    
                cheval["nbArriveMusic"] = len(cheval["music"]) - cheval["music"].count(0)
                cheval["lastPerf"] = cheval["music"][0] if cheval["nbArriveMusic"] else 0

                arriveOnly = list(filter(None, cheval["music"]))
                if len(arriveOnly) > 0:
                    try:
                        cheval["meanPerf"] = np.mean(arriveOnly)
                        cheval["medianPerf"] = np.median(arriveOnly)
                        cheval["modePerf"] = st.mode(cheval["music"])
                    except:
                        cheval["meanPerf"] = 0
                        cheval["medianPerf"] = 0
                        cheval["modePerf"] = 0
                else:
                    cheval["meanPerf"] = 0
                    cheval["medianPerf"] = 0
                    cheval["modePerf"] = 0

                try:
                    cheval["recordAbs"] = list(map(int, col[10].text.replace(col[10].span.text, "").replace("\'", '"').split('"')))
                    cheval["recordAbs"] = cheval["recordAbs"][0] * 10 * 60 + cheval["recordAbs"][1] * 10 + cheval["recordAbs"][2]
                except:
                    cheval["recordAbs"] = None

                cheval["gain"] = int(col[11].find("div", class_="gains").text.replace(" ", "")[:-1])
                
                chevaux[i].update(cheval)
        return chevaux
    
    def get_info_cheval(self, url, date, driver):
        r = requests.get(url + "-paginate-2", headers=headers)
        date_debut = datetime.date.fromisoformat(date)
        jsoned = r.json()["data"]

        info_dict = {}

        for c in jsoned:
            c["dateCourse"] = datetime.date.fromisoformat(c["dateCourseRaw"])
            c["categorie"] = bs(c["categorie"], "html.parser").find("span").text
            try:
            
                c["driver"] = bs(c["nomDriver"], "html.parser").find("a").get("href").split("/")[-3] == driver.split("/")[-3]
            except:
                c["driver"] = 0
            reduction = bs(c["reduction"], "html.parser").span.text
            reduction = reduction.replace("'", "").replace('\"', "")
            try:
                c["allocation"] = int(bs(c["reduction"], "html.parser").span.text.lstrip("0"))
            except:
                c["allocation"] = 0
            

            c["distance"] = int(c["distance"].replace(" ", "")) if c["distance"] != None else None

            reduction_min = int(str(reduction)[0])
            reduction_sec = int(str(reduction)[1:3])
            reduction_ssec = int(str(reduction)[3])

            c["reduction"] = reduction_min*60*10 + reduction_sec*10 + reduction_ssec

            # c["recordAbs"] = list(map(int, reduction.text.replace(reduction.span.text, "").replace("\'", '"').split('"')))
            # c["recordAbs"] = c["recordAbs"][0] * 10 * 60 + c["recordAbs"][1] * 10 + c["recordAbs"][2]

        filtered = list(filter(lambda x: x["dateCourse"] < date_debut and x["specialite"] == "A", jsoned))

        if len(filtered) == 0:
            raise DataError("Not enough Data")
        
        perc_jockey = sum([x["driver"] for x in filtered])/len(filtered)
        info_dict["jockeyHabitude"] = 1 if perc_jockey > 0.5 else 0
        
        last_30_days = list(filter(lambda x: x["dateCourse"] > date_debut - datetime.timedelta(days=30) and x["specialite"] == "A", jsoned))
        
        weights = [ max(x["allocation"], 1) for x in filtered if x["distance"] != None  ]
        if sum(weights) == 0:
            raise DataError("Not enough Data")
#         print(weights)
        prefered_dist = int(np.average([ x["distance"] for x in filtered if x["distance"] != None ], weights=weights))
        
        info_dict["prefered_dist"] = prefered_dist
        info_dict["distToPreferedDist"] = abs(self.distance - prefered_dist)
        
        info_dict["changementCategorie"] = 1 if filtered[0]["categorie"] != self.categorie else 0
        
        if len(last_30_days) > 0:
            dist_30_days = np.array([x["distance"] for x in last_30_days])
            info_dict["newDist"] = 1 if np.max(dist_30_days - self.distance) > 200 else 0
        else:
            info_dict["newDist"] = 1
            
        
        filtered_tps = list(filter(lambda x: x["reduction"] < 1200, filtered))
        
        dps_race = [(date_debut - x["dateCourse"]).days for x in filtered_tps]
        tps = [x["reduction"] for x in filtered_tps]
        
        if len(tps) > 0:
            info_dict["meanReduction"] = np.mean(tps)
            info_dict["medianReduction"] = np.median(tps)
            info_dict["maxReduction"] = max(tps)
            info_dict["minReduction"] = min(tps)
            
            lin_reg = LinearRegression().fit(np.array(dps_race).reshape(-1, 1), np.array(tps).reshape(-1, 1))
            info_dict["progressTps"] = np.exp(lin_reg.coef_[0][0])
            
        else:
            info_dict["meanReduction"] = 0
            info_dict["medianReduction"] = 0
            info_dict["maxReduction"] = 0
            info_dict["minReduction"] = 0
            info_dict["progressTps"] = 0

        info_dict["timeSinceRecord"] = next(((date_debut - item["dateCourse"]).days for item in filtered_tps if item["reduction"] == info_dict["minReduction"]), 365)

        info_dict["tpsLastRace"] = (date_debut - filtered[0]["dateCourse"]).days

        info_dict["last_race_dist"] = filtered[0]["distance"]
        info_dict["rentree"] = 1 if info_dict["tpsLastRace"] > 30 else 0

        return info_dict
    
    
    def get_info_couple(self):
        couple_info = []

        date = datetime.date.fromisoformat(self.date)
        
        d = datetime.timedelta(days=1)
        d2 = datetime.timedelta(days=365)

        date_arrive = (date - d).strftime("%d-%m-%Y").replace("-", "%2F")
        date_depart = (date  - d2).strftime("%d-%m-%Y").replace("-", "%2F")

        url = f"https://www.letrot.com/stats/fiche-course/{self.date}/{self.idHippo}/{self.numCourse}/partants/couples/paginate?datepicker_du={date_depart}&datepicker_au={date_arrive}"
        r = requests.get(url, headers=headers)
        dic_json = r.json()
        data = dic_json["data"]

        data_sorted = sorted(data, key=lambda x: x["numero"])
        for couple in data_sorted:
            cheval = {}

            cheval["nbCourseCouple"] = int(bs(couple["nbre_courses"], "html.parser").find("div").text)
            cheval["nbVictoiresCouple"] = int(bs(couple["nbre_victoires"], "html.parser").find("div").text)
            cheval["nb2emeCouple"] = int(bs(couple["nbre_2eme"], "html.parser").find("div").text)
            cheval["nb3emeCouple"] = int(bs(couple["nbre_3eme"], "html.parser").find("div").text)
            cheval["txReussiteCouple"] = int(couple["taux_reussite_sort"])/100
            try:
                cheval["txVictoireCouple"] = cheval["nbVictoiresCouple"] / cheval["nbCourseCouple"]
            except:
                cheval["txVictoireCouple"] = 0.0
            cheval["nonPartant"] = couple["nonPartant"]
            cheval["moreFirstThanThirdCouple"] = 1 if cheval["nbVictoiresCouple"] > cheval["nb3emeCouple"] + cheval["nb2emeCouple"] else 0
            couple_info.append(cheval)
        return couple_info
    
    def get_info_cheval_hippo(self):
        couple_info = []

        date = datetime.date.fromisoformat(self.date)
        
        d = datetime.timedelta(days=1)
        d2 = datetime.timedelta(days=365)

        date_arrive = (date - d).strftime("%d-%m-%Y").replace("-", "%2F")
        date_depart = (date  - d2).strftime("%d-%m-%Y").replace("-", "%2F")

        url = f"https://www.letrot.com/stats/fiche-course/{self.date}/{self.idHippo}/{self.numCourse}/partants/chevaux/paginate?numHippodrome={self.idHippo}&piste=all&datepicker_du={date_depart}&datepicker_au={date_arrive}"
        r = requests.get(url, headers=headers)
        dic_json = r.json()
        data = dic_json["data"]

        data_sorted = sorted(data, key=lambda x: x["numero"])
        for couple in data_sorted:
            cheval = {}

            cheval["nbCourseHippo"] = int(bs(couple["nbre_courses"], "html.parser").find("div").text)
            cheval["nbVictoiresHippo"] = int(bs(couple["nbre_victoires"], "html.parser").find("div").text)
            cheval["nb2emeHippo"] = int(bs(couple["nbre_2eme"], "html.parser").find("div").text)
            cheval["nb3emeHippo"] = int(bs(couple["nbre_3eme"], "html.parser").find("div").text)
            try:
                cheval["txVictoireHippo"] = cheval["nbVictoiresHippo"] / cheval["nbCourseHippo"]
            except:
                cheval["txVictoireHippo"] = 0.0
            try:
                cheval["txReussiteHippo"] =int(couple["taux_reussite_sort"])/100
            except:
                cheval["txReussiteHippo"] = 0.0
                                              
            cheval["perfHippo"] = 1 if cheval["txReussiteHippo"] > 0.5 and cheval["nbCourseHippo"] > 5 else 0
            couple_info.append(cheval)
        return couple_info
    
    def get_tracking(self, url):
        r = requests.get(url.replace("dernieres-performances", "tracking"), headers=headers)
        soup = bs(r.text, "html.parser")
        headers_table = soup.find("table", {"id": "result_table"}).find("thead").find("tr").find_all("th")
        table = soup.find("table", {"id": "result_table"}).find("tbody")
        rows = table.find_all("tr")
        
        info_tracking = {}
        
        distance_au_premier_arrivee = []
        accélération_500m = []
        gain_classement_500m = []
        for row in rows:
            dist_prem = int(row.find_all("td")[2].span.text)
            if dist_prem < 9999:
                distance_au_premier_arrivee.append(dist_prem)
            
            pre_fin = int(row.find_all("td")[17].span.text)
            fin = int(row.find_all("td")[18].span.text)
            if pre_fin < 2000 and fin < 2000:                                            
                accélération_500m.append(pre_fin - fin)
                

            try:
                class_500m = int(row.find_all("td")[16].span.text)
                class_final = int(row.find_all("td")[1].find("span", {"class": "bold"}).text)
                if class_500m -  class_final < 10:
                    gain_classement_500m.append(class_500m -  class_final)
            except:
                gain_classement_500m.append(0)
            
        info_tracking["mean_dist_arrivee"] = np.mean(distance_au_premier_arrivee) if len(distance_au_premier_arrivee) > 0 else np.nan
        info_tracking["acceleration_500m"] = np.mean(accélération_500m) if len(accélération_500m) > 0 else np.nan
        info_tracking["gain_classement_fin"] = np.mean(gain_classement_500m) if len(gain_classement_500m) > 0 else np.nan
        return info_tracking

In [34]:
info = []
t = time.time()

with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    res = executor.map(Partants, gen_rows(courses.courses))
    for i in res:
        if isinstance(i.info_partants, list):
            info.extend(i.info_partants)
#             pd.DataFrame(i.info_partants).to_csv("data_testx.csv",mode="a", header=not os.path.isfile("data_testx.csv") ,index=False)
print("Fini en:",time.time() - t, "secondes")

Fini en: 1199.3148834705353 secondes


In [63]:
info_2 = []
t = time.time()

for row in gen_rows(courses.courses):
    res = Partants(row,session)
    print(res.info_partants)
    if isinstance(res.info_partants, list):
        info_2.extend(res.info_partants)
print(info_2)

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


  pmu_jsoned = await resp.json()["participants"]
  pmu_jsoned = await resp.json()["participants"]


None
None
None
None
None
None
None
None
[]


RUN SANS ASYNCIO: 37.02154350280762 secondes

In [35]:
df = pd.DataFrame(info)

In [36]:
len(df["id"].unique())

680

In [37]:
df.to_csv("data/vincenne.csv")

In [38]:
df[["mean_dist_arrivee", "acceleration_500m","gain_classement_fin"]]

Unnamed: 0,mean_dist_arrivee,acceleration_500m,gain_classement_fin
0,42.000000,2.750000,-0.750000
1,30.500000,7.000000,1.000000
2,26.750000,0.250000,2.250000
3,8.000000,8.333333,0.333333
4,38.500000,18.500000,-1.666667
...,...,...,...
8739,18.285714,8.857143,1.142857
8740,15.000000,-7.800000,-1.111111
8741,17.000000,3.000000,1.222222
8742,18.125000,9.125000,1.300000


In [51]:
df

Unnamed: 0,nom,numPmu,age,sexe,race,statut,oeilleres,proprietaire,entraineur,deferre,...,nbArriveMusic,lastPerf,meanPerf,medianPerf,modePerf,recordAbs,gain,placeCorde,ecurie,poidsConditionMonte
0,ALWAYS EK,1,6,MALES,TROTTEUR ETRANGER,PARTANT,SANS_OEILLERES,Scuderia EFFEBI (ITY),F. SOULOY,DEFERRE_ANTERIEURS_POSTERIEURS,...,0.125000,0,2.750000,1.5,0,0.100000,322094,,,
1,CHARLY DE L'AUNAY,2,10,HONGRES,TROTTEUR FRANCAIS,PARTANT,SANS_OEILLERES,Christophe KEERHEM,J. GUELPA,DEFERRE_ANTERIEURS_POSTERIEURS,...,0.750000,8,4.111111,3.0,3,1.000000,359150,,,
2,MARCELLO WIBB,3,7,HONGRES,TROTTEUR ETRANGER,PARTANT,SANS_OEILLERES,Ecurie BLACK AND WHITE (BEL),V. MARTENS,DEFERRE_ANTERIEURS_POSTERIEURS,...,0.375000,3,5.000000,5.0,3,0.250000,399909,,,
3,FINE COLLINE,4,7,FEMELLES,TROTTEUR FRANCAIS,PARTANT,SANS_OEILLERES,Ecurie LD-M. ABRIVARD,M. ABRIVARD,DEFERRE_ANTERIEURS,...,0.500000,0,2.285714,1.0,0,0.000000,412960,,,
4,ELITE DE JIEL,5,8,FEMELLES,TROTTEUR FRANCAIS,PARTANT,SANS_OEILLERES,Ecurie LUCK,J.L. DERSOIR,PROTEGE_ANTERIEURS_DEFERRRE_POSTERIEURS,...,0.625000,9,4.125000,3.0,0,0.350000,413440,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,HELLO JADE RUSH,7,5,FEMELLES,TROTTEUR FRANCAIS,PARTANT,SANS_OEILLERES,Ecurie D L,M. DONIO,DEFERRE_POSTERIEURS,...,0.833333,4,3.444444,4.0,1,0.973684,81330,,,
83,HERCULE MADRIK,8,5,HONGRES,TROTTEUR FRANCAIS,PARTANT,SANS_OEILLERES,R. BAHIER,J.M. BAUDOUIN,DEFERRE_ANTERIEURS_POSTERIEURS,...,0.666667,0,3.750000,3.0,0,0.552632,86850,,,
84,HOUSTON DISA,9,5,HONGRES,TROTTEUR FRANCAIS,PARTANT,SANS_OEILLERES,Y. GAUTIER,Y. GAUTIER,DEFERRE_ANTERIEURS,...,0.666667,1,2.750000,2.0,2,0.605263,110980,,,
85,HELIOS DES ARCS,10,5,HONGRES,TROTTEUR FRANCAIS,PARTANT,SANS_OEILLERES,Mme M.J. MARQUET,V. GOUIN,PROTEGE_ANTERIEURS_DEFERRRE_POSTERIEURS,...,0.833333,4,3.666667,3.0,4,0.710526,130820,,,


In [4]:
r = requests.get("https://www.letrot.com/stats/fiche-cheval/fee-de-ranchval/ZWdbYwUFBAcM/courses/tracking", headers=headers)
soup = bs(r.text, "html.parser")

In [5]:
headers_table = soup.find("table", {"id": "result_table"}).find("thead").find("tr").find_all("th")
table = soup.find("table", {"id": "result_table"}).find("tbody")
rows = table.find_all("tr")

In [70]:
courses.courses.head()

Unnamed: 0,date,id,numReunion,hippodrome,idHippo,heureCourse,discipline,numCourse,prix,allocation,distance,categorie,typePiste,conditionsEngagement,hasTracking,hasVideoHeat,statut,classement,linkPrix,replay
0,2022-08-27,2022082775001,4,VINCENNES,7500,13:40,Attelé,1,PRIX DE SAINT-MANDE,12 000,2 100,Course F,,Amateurs<br />Départ à l'autostart<br />Pour 6...,True,False,16,7 - 9 - 4 - 3 - 2,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."
1,2022-08-27,2022082775002,4,VINCENNES,7500,14:15,Attelé,2,PRIX DE MULHOUSE,39 000,2 700,Course E,,"Pour chevaux entiers et hongres de 5 ans, n'ay...",True,False,16,6 - 15 - 11 - 5 - 12,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."
2,2022-08-27,2022082775003,4,VINCENNES,7500,14:50,Attelé,3,PRIX DE DOLE,39 000,2 100,Course D,,Départ à l'autostart<br />Pour pouliches de 4 ...,True,False,16,4 - 2 - 1 - 11 - 9,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."
3,2022-08-27,2022082775005,4,VINCENNES,7500,16:18,Attelé,5,PRIX DE LURY SUR ARNON,51 000,2 100,Course B,,Course Européenne<br />Départ à l'autostart<br...,True,False,16,1 - 6 - 4 - 9 - 13,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."
4,2022-08-27,2022082775006,4,VINCENNES,7500,16:53,Attelé,6,YEARLING CUP 2022 - ARQANA TROT,150 000,2 700,Groupe III,,Pour 3 ans.<br />Sont seuls admis à participer...,True,True,16,6 - 11 - 9 - 10 - 3,https://www.letrot.com/stats/fiche-course/2022...,"<a href=""https://www.letrot.com/fr/replay-cour..."


In [121]:
def get_rapport_course(i):
    date = datetime.date.fromisoformat(i["date"])
    
    date_pmu = date.strftime("%d%m%Y")
    r = requests.get(f"https://online.turfinfo.api.pmu.fr/rest/client/61/programme/{date_pmu}/R{i['numReunion']}/C{i['numCourse']}/rapports-definitifs?specialisation=INTERNET&combinaisonEnTableau=true", headers=headers)
    dict_bien = {}
    try:
        rjson = r.json()
    except:
        return None
    for j in rjson:
        if i["id"] in dict_bien:
            dict_bien[i["id"]].update({j["typePari"]: j})
        else:
            dict_bien[i["id"]] = {j["typePari"]: j}
    return dict_bien

In [122]:
all_rapport_dict = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    res = executor.map(get_rapport_course, gen_rows(courses.courses))
    for i in res:
        if isinstance(i, dict):
            all_rapport_dict.update(i)

In [132]:
get_rapport_course(courses.courses.iloc[0])["2022082775001"]["E_SIMPLE_GAGNANT"]["rapports"]

[{'libelle': 'e-Simple Gagnant',
  'dividende': 220,
  'dividendePourUnEuro': 220,
  'combinaison': [7],
  'nombreGagnants': 5770.0,
  'dividendePourUneMiseDeBase': 220,
  'dividendeUnite': 'PourUnEuro'}]

In [125]:
with open("data/rapport.json", "w") as f:
    json.dump(all_rapport_dict, f, indent=4)