### Import

In [93]:
from sqlalchemy import create_engine
import os
import yaml
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


### Read Config

In [94]:
base = Path().resolve()
db_conf_file = os.path.normpath(os.path.join(base, "../conf/db_connection.yaml"))
with open(db_conf_file, "r", encoding="utf-8") as l_file:
    db_conf_all = yaml.safe_load(l_file)

### Create Engine

In [95]:
ora_conf = db_conf_all["oracle"]
engine = create_engine(f"oracle://{ora_conf['username']}:{ora_conf['password']}@{ora_conf['dsn']}/?encoding=UTF-8&nencoding=UTF-8")

### Condition

In [96]:
c_cource_id = "06_t1200_out"

### Query

In [97]:
from sqlalchemy.sql import select
from sqlalchemy.sql import join

from model.race import Race
from keibaai.model.race import RaceUma

query_statement = (
    select(
        join(Race, RaceUma, Race.id == RaceUma.race_id)
        ).where(
            Race.course_id == c_cource_id
        )
)
print(query_statement)

SELECT races.id, races.name, races.race_track_id, races.kai, races.nichi, races.race_no, races.course_id, races.grade_id, races.is_win5, races.condition, races.handicap, races.race_date, races.race_start, races.weather, races.going, races.num_of_horses, races.race_data, races.corner_order_1, races.corner_order_2, races.corner_order_3, races.corner_order_4, races.pace, race_uma.race_id, race_uma.uma_id, race_uma.result, race_uma.bracket_number, race_uma.horse_number, race_uma.gender, race_uma.age, race_uma.weight_to_carry, race_uma.jockey_id, race_uma.time, race_uma.margin, race_uma.ninki, race_uma.win_odds, race_uma.final_3_furlong, race_uma.corner_order, race_uma.trainer_id, race_uma.horse_weight, race_uma.gain_and_loss_weight, race_uma.is_excluded, race_uma.is_demoted 
FROM races JOIN race_uma ON races.id = race_uma.race_id 
WHERE races.course_id = :course_id_1


In [98]:
query_statement = (
    select(
        Race.race_track_id, Race.kai, Race.nichi, Race.race_no, Race.grade_id, Race.num_of_horses, RaceUma.result, RaceUma.bracket_number, RaceUma.horse_number, RaceUma.age, RaceUma.weight_to_carry, RaceUma.time, RaceUma.ninki, RaceUma.win_odds, RaceUma.final_3_furlong, RaceUma.horse_weight, RaceUma.gain_and_loss_weight
    ).select_from(
        join(Race, RaceUma, Race.id == RaceUma.race_id)
    ).where(
        Race.course_id == c_cource_id
    )
)
print(query_statement)

SELECT races.race_track_id, races.kai, races.nichi, races.race_no, races.grade_id, races.num_of_horses, race_uma.result, race_uma.bracket_number, race_uma.horse_number, race_uma.age, race_uma.weight_to_carry, race_uma.time, race_uma.ninki, race_uma.win_odds, race_uma.final_3_furlong, race_uma.horse_weight, race_uma.gain_and_loss_weight 
FROM races JOIN race_uma ON races.id = race_uma.race_id 
WHERE races.course_id = :course_id_1


### Read Data

In [99]:
data = pd.read_sql_query(sql=query_statement, con=engine)
data

Unnamed: 0,race_track_id,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
0,60,50,20,9,16,15,1.0,5,9,3,56.0,67.4,6,10.7,33.8,468,8
1,60,50,20,9,16,15,2.0,3,5,3,54.0,67.6,1,2.8,34.3,470,0
2,60,50,20,9,16,15,3.0,6,11,4,57.0,67.8,2,4.6,34.0,496,2
3,60,50,20,9,16,15,4.0,5,8,5,57.0,67.9,3,7.2,34.5,540,2
4,60,50,20,9,16,15,5.0,2,2,6,55.0,67.9,8,17.3,34.1,518,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3353,60,30,20,6,9,13,9.0,4,4,3,56.0,71.5,10,57.6,36.3,446,8
3354,60,30,20,6,9,13,10.0,5,7,3,54.0,71.9,12,151.1,37.0,462,4
3355,60,30,20,6,9,13,11.0,4,5,3,51.0,71.9,11,104.1,36.9,430,-2
3356,60,30,20,6,9,13,12.0,1,1,3,51.0,72.1,13,260.6,36.5,408,4


### Describe

In [100]:
data.describe()

Unnamed: 0,race_track_id,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
count,3358.0,3358.0,3358.0,3358.0,3358.0,3358.0,3350.0,3358.0,3358.0,3358.0,3358.0,3350.0,3358.0,3358.0,3350.0,3358.0,3358.0
mean,60.0,34.142347,48.412448,3.348422,15.727516,15.212031,8.088657,4.624777,8.106015,3.867778,54.533353,69.782776,8.106015,62.643895,35.144716,463.31626,0.914532
std,0.0,13.010845,24.569488,2.995198,9.848203,1.577039,4.470689,2.27347,4.476073,1.783091,1.624742,1.36375,4.476073,93.612983,1.115313,31.045942,6.173552
min,60.0,10.0,10.0,0.0,1.0,8.0,1.0,1.0,1.0,2.0,50.0,66.8,1.0,1.4,32.4,362.0,-26.0
25%,60.0,20.0,30.0,1.0,7.0,15.0,4.0,3.0,4.0,2.0,54.0,68.9,4.0,9.2,34.4,442.0,-2.0
50%,60.0,40.0,50.0,2.0,16.0,16.0,8.0,5.0,8.0,4.0,54.0,69.6,8.0,24.05,35.1,464.0,0.0
75%,60.0,40.0,70.0,6.0,30.0,16.0,12.0,7.0,12.0,5.0,56.0,70.5,12.0,71.3,35.8,486.0,4.0
max,60.0,50.0,91.0,9.0,31.0,16.0,16.0,8.0,16.0,11.0,59.0,85.8,16.0,732.3,52.3,564.0,32.0


### Standardization

In [101]:
data_st = data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(data_st)
data_st = pd.DataFrame(df_scaled, columns=data_st.columns)
data_st.head()

Unnamed: 0,race_track_id,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
0,0.0,1.218984,-1.156584,1.887161,0.027672,-0.134469,-1.585821,0.165069,0.199755,-0.486743,0.90283,-1.747485,-0.470575,-0.554962,-1.205865,0.150887,1.147884
1,0.0,1.218984,-1.156584,1.887161,0.027672,-0.134469,-1.362109,-0.714774,-0.694019,-0.486743,-0.328318,-1.600808,-1.587792,-0.639364,-0.757493,0.215318,-0.148159
2,0.0,1.218984,-1.156584,1.887161,0.027672,-0.134469,-1.138396,0.604991,0.646642,0.074164,1.518404,-1.454132,-1.364349,-0.620133,-1.026516,1.052911,0.175852
3,0.0,1.218984,-1.156584,1.887161,0.027672,-0.134469,-0.914684,0.165069,-0.023688,0.635071,1.518404,-1.380794,-1.140905,-0.592355,-0.578145,2.470376,0.175852
4,0.0,1.218984,-1.156584,1.887161,0.027672,-0.134469,-0.690971,-1.154696,-1.364349,1.195979,0.287256,-1.380794,-0.023688,-0.484448,-0.936842,1.761644,0.499863


### Correlation

In [102]:
corr_matrix = data.corr()
corr_y = pd.DataFrame({"features":data.columns, "corr_y":corr_matrix["result"]}, index=None)
corr_y = corr_y.reset_index(drop=True)
corr_y.style.background_gradient()

Unnamed: 0,features,corr_y
0,race_track_id,
1,kai,0.006991
2,nichi,0.017119
3,race_no,-0.032299
4,grade_id,0.031521
5,num_of_horses,0.176811
6,result,1.0
7,bracket_number,0.033867
8,horse_number,0.07844
9,age,0.079192


In [103]:
data.corr()

Unnamed: 0,race_track_id,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
race_track_id,,,,,,,,,,,,,,,,,
kai,,1.0,0.15715,-0.037505,0.390126,0.047483,0.006991,-0.004941,0.008365,-0.414361,-0.20389,0.139656,0.008365,0.065716,0.07069,-0.227808,0.0175
nichi,,0.15715,1.0,0.032417,-0.074144,0.097686,0.017119,-0.009857,0.017209,-0.061365,-0.02278,0.033187,0.017209,0.026398,0.015967,-0.001781,-0.026932
race_no,,-0.037505,0.032417,1.0,0.006865,-0.195629,-0.032299,0.017586,-0.034463,-0.315655,-0.001593,0.204551,-0.034463,0.061117,0.17399,-0.146564,0.000306
grade_id,,0.390126,-0.074144,0.006865,1.0,0.185049,0.031521,-0.014826,0.032599,-0.606245,-0.347354,0.523715,0.032599,0.087523,0.357795,-0.459152,0.040832
num_of_horses,,0.047483,0.097686,-0.195629,0.185049,1.0,0.176811,-0.083652,0.176163,-0.024138,-0.098557,0.044845,0.176163,0.060132,0.059799,-0.047366,0.047757
result,,0.006991,0.017119,-0.032299,0.031521,0.176811,1.0,0.033867,0.07844,0.079192,-0.074334,0.410496,0.545583,0.442613,0.346037,-0.054454,0.012815
bracket_number,,-0.004941,-0.009857,0.017586,-0.014826,-0.083652,0.033867,1.0,0.953164,0.003998,0.011695,0.013868,0.041057,0.048141,0.002444,-0.007527,-0.0067
horse_number,,0.008365,0.017209,-0.034463,0.032599,0.176163,0.07844,0.953164,1.0,-0.003692,-0.014741,0.024649,0.08386,0.062737,0.017045,-0.018608,0.007055
age,,-0.414361,-0.061365,-0.315655,-0.606245,-0.024138,0.079192,0.003998,-0.003692,1.0,0.38968,-0.424393,0.167658,-0.006643,-0.346966,0.488648,-0.000377


### Model

In [106]:
from sklearn.model_selection import train_test_split
train_X = data_st[["bracket_number", "horse_number", "weight_to_carry", "win_odds", "horse_weight"]]
train_y = data_st["result"]
(train_X, test_X, train_y, test_y) = train_test_split(train_X, train_y, test_size=0.25, random_state=666)

### 重回帰

In [107]:
from sklearn.linear_model import LinearRegression
model_LR = LinearRegression()
model_LR.fit(train_X, train_y)
LR_y_test_pred = model_LR.predict(test_X)
LR_y_train_pred = model_LR.predict(train_X)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
from sklearn.metrics import mean_squared_error
print("LR RMSE train data: ", np.sqrt(mean_squared_error(train_y, LR_y_train_pred)))
print("LR RMSE train data: ", np.sqrt(mean_squared_error(test_y, LR_y_test_pred)))

NameError: name 'train_y' is not defined