### Import

In [14]:
from sqlalchemy import create_engine
import os
import yaml
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


### Read Config

In [15]:
base = Path().resolve()
db_conf_file = os.path.normpath(os.path.join(base, "../conf/db_connection.yaml"))
with open(db_conf_file, "r", encoding="utf-8") as l_file:
    db_conf_all = yaml.safe_load(l_file)

### Create Engine

In [16]:
ora_conf = db_conf_all["oracle"]
engine = create_engine(f"oracle://{ora_conf['username']}:{ora_conf['password']}@{ora_conf['dsn']}/?encoding=UTF-8&nencoding=UTF-8")

### Condition

In [17]:
c_cource_id = "06_t2500"

### Query

In [18]:
from sqlalchemy.sql import select
from sqlalchemy.sql import join

from model.race import Race
from keibaai.model.race import RaceUma

query_statement = (
    select(
        join(Race, RaceUma, Race.id == RaceUma.race_id)
        ).where(
            Race.course_id == c_cource_id
        )
)
print(query_statement)

SELECT races.id, races.name, races.race_track_id, races.kai, races.nichi, races.race_no, races.course_id, races.grade_id, races.is_win5, races.condition, races.handicap, races.race_date, races.race_start, races.weather, races.going, races.num_of_horses, races.race_data, races.corner_order_1, races.corner_order_2, races.corner_order_3, races.corner_order_4, races.pace, race_uma.race_id, race_uma.uma_id, race_uma.result, race_uma.bracket_number, race_uma.horse_number, race_uma.gender, race_uma.age, race_uma.weight_to_carry, race_uma.jockey_id, race_uma.time, race_uma.margin, race_uma.ninki, race_uma.win_odds, race_uma.final_3_furlong, race_uma.corner_order, race_uma.trainer_id, race_uma.horse_weight, race_uma.gain_and_loss_weight, race_uma.is_excluded, race_uma.is_demoted 
FROM races JOIN race_uma ON races.id = race_uma.race_id 
WHERE races.course_id = :course_id_1


In [19]:
query_statement = (
    select(
        Race.kai, Race.nichi, Race.race_no, Race.grade_id, Race.num_of_horses, RaceUma.result, RaceUma.bracket_number, RaceUma.horse_number, RaceUma.age, RaceUma.weight_to_carry, RaceUma.time, RaceUma.ninki, RaceUma.win_odds, RaceUma.final_3_furlong, RaceUma.horse_weight, RaceUma.gain_and_loss_weight
    ).select_from(
        join(Race, RaceUma, Race.id == RaceUma.race_id)
    ).where(
        Race.course_id == c_cource_id
    )
)
print(query_statement)

SELECT races.kai, races.nichi, races.race_no, races.grade_id, races.num_of_horses, race_uma.result, race_uma.bracket_number, race_uma.horse_number, race_uma.age, race_uma.weight_to_carry, race_uma.time, race_uma.ninki, race_uma.win_odds, race_uma.final_3_furlong, race_uma.horse_weight, race_uma.gain_and_loss_weight 
FROM races JOIN race_uma ON races.id = race_uma.race_id 
WHERE races.course_id = :course_id_1


### Read Data

In [20]:
data = pd.read_sql_query(sql=query_statement, con=engine)
data

Unnamed: 0,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
0,40,60,9,7,8,1.0,6,6,3,54.0,154.1,3,4.8,35.3,452,6.0
1,40,60,9,7,8,2.0,4,4,4,57.0,154.1,1,2.2,35.5,480,6.0
2,40,60,9,7,8,3.0,2,2,3,54.0,154.2,2,2.9,35.2,494,6.0
3,40,60,9,7,8,4.0,1,1,5,57.0,154.5,6,24.7,35.4,476,-4.0
4,40,60,9,7,8,5.0,8,8,6,57.0,154.6,7,28.0,35.6,488,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,40,30,7,18,8,4.0,2,2,4,55.0,155.9,5,17.1,34.7,480,-2.0
1678,40,30,7,18,8,5.0,3,3,3,54.0,156.0,3,3.5,34.9,458,-2.0
1679,40,30,7,18,8,6.0,6,6,3,54.0,156.4,7,131.4,34.9,466,0.0
1680,40,30,7,18,8,7.0,1,1,4,53.0,156.6,6,45.7,35.6,422,-10.0


### Describe

In [21]:
data.describe()

Unnamed: 0,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
count,1682.0,1682.0,1682.0,1682.0,1682.0,1674.0,1682.0,1682.0,1682.0,1682.0,1674.0,1682.0,1682.0,1674.0,1682.0,1676.0
mean,35.58264,45.563615,3.495838,7.705707,13.021403,6.974313,4.891795,7.010702,4.931629,55.364447,155.558662,7.010702,46.907194,36.151553,482.390012,0.787589
std,12.759595,25.016398,3.713127,5.200977,2.323768,3.959742,2.246593,3.9818,1.401779,1.69246,2.192507,3.9818,71.848295,1.500819,27.730239,6.661557
min,10.0,10.0,0.0,1.0,7.0,1.0,1.0,1.0,3.0,49.0,150.0,1.0,1.4,33.4,352.0,-24.0
25%,30.0,21.0,0.0,6.0,12.0,4.0,3.0,4.0,4.0,54.0,154.2,4.0,6.9,35.1,464.0,-4.0
50%,40.0,40.0,1.0,7.0,13.0,7.0,5.0,7.0,5.0,56.0,155.4,7.0,18.4,35.8,482.0,0.0
75%,50.0,71.0,8.0,7.0,15.0,10.0,7.0,10.0,6.0,57.0,156.6,10.0,52.3,36.9,500.0,4.0
max,50.0,91.0,9.0,30.0,16.0,16.0,8.0,16.0,12.0,59.0,172.2,16.0,652.7,48.0,578.0,34.0


### Standardization

In [22]:
data_st = data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(data_st)
data_st = pd.DataFrame(df_scaled, columns=data_st.columns)
data_st.head()

Unnamed: 0,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
0,0.346302,0.577249,1.482793,-0.135728,-2.161531,-1.509214,0.493429,-0.253906,-1.378394,-0.806431,-0.665493,-1.007558,-0.586231,-0.567562,-1.096242,0.782695
1,0.346302,0.577249,1.482793,-0.135728,-2.161531,-1.256597,-0.397073,-0.756341,-0.664803,0.966663,-0.665493,-1.509993,-0.622429,-0.434262,-0.086214,0.782695
2,0.346302,0.577249,1.482793,-0.135728,-2.161531,-1.00398,-1.287575,-1.258775,-1.378394,-0.806431,-0.619869,-1.258775,-0.612684,-0.634212,0.418801,0.782695
3,0.346302,0.577249,1.482793,-0.135728,-2.161531,-0.751363,-1.732825,-1.509993,0.048789,0.966663,-0.482999,-0.253906,-0.309176,-0.500912,-0.230503,-0.718904
4,0.346302,0.577249,1.482793,-0.135728,-2.161531,-0.498745,1.383931,0.248529,0.76238,0.966663,-0.437375,-0.002688,-0.263233,-0.367611,0.202366,0.482375


### Correlation

In [23]:
corr_matrix = data.corr()
corr_y = pd.DataFrame({"features":data.columns, "corr_y":corr_matrix["result"]}, index=None)
corr_y = corr_y.reset_index(drop=True)
corr_y.style.background_gradient()

Unnamed: 0,features,corr_y
0,kai,0.064386
1,nichi,0.11105
2,race_no,-0.032651
3,grade_id,-0.096029
4,num_of_horses,0.288559
5,result,1.0
6,bracket_number,0.027186
7,horse_number,0.13043
8,age,0.263512
9,weight_to_carry,-0.123882


In [24]:
data.corr()

Unnamed: 0,kai,nichi,race_no,grade_id,num_of_horses,result,bracket_number,horse_number,age,weight_to_carry,time,ninki,win_odds,final_3_furlong,horse_weight,gain_and_loss_weight
kai,1.0,0.457361,0.132393,0.085997,0.225893,0.064386,-0.03972,0.065915,-0.285965,-0.100469,-0.138273,0.065915,-0.018658,-0.1113,0.002922,0.165982
nichi,0.457361,1.0,0.053405,0.048332,0.392812,0.11105,-0.045382,0.114622,-0.141042,-0.19157,-0.08394,0.114622,-0.062658,0.054168,-0.012275,0.152787
race_no,0.132393,0.053405,1.0,0.333898,-0.119815,-0.032651,0.000303,-0.034962,-0.216008,-0.159784,0.240128,-0.034962,-0.045938,0.075481,-0.036868,0.074096
grade_id,0.085997,0.048332,0.333898,1.0,-0.334677,-0.096029,0.02222,-0.097658,-0.240776,-0.221641,0.296104,-0.097658,-0.079381,0.107557,-0.131261,0.062023
num_of_horses,0.225893,0.392812,-0.119815,-0.334677,1.0,0.288559,-0.060064,0.291799,0.072586,-0.095311,-0.084078,0.291799,0.128619,0.164583,0.075737,0.021292
result,0.064386,0.11105,-0.032651,-0.096029,0.288559,1.0,0.027186,0.13043,0.263512,-0.123882,0.303501,0.589422,0.460455,0.470723,0.027529,0.022882
bracket_number,-0.03972,-0.045382,0.000303,0.02222,-0.060064,0.027186,1.0,0.903613,-0.024829,-0.037889,0.030489,-0.001733,0.039844,0.025844,-0.026766,0.000478
horse_number,0.065915,0.114622,-0.034962,-0.097658,0.291799,0.13043,0.903613,1.0,-0.001148,-0.081924,-0.003895,0.103776,0.083211,0.071617,-0.004283,0.022862
age,-0.285965,-0.141042,-0.216008,-0.240776,0.072586,0.263512,-0.024829,-0.001148,1.0,0.213739,0.02439,0.438812,0.435432,0.063917,0.095845,-0.068178
weight_to_carry,-0.100469,-0.19157,-0.159784,-0.221641,-0.095311,-0.123882,-0.037889,-0.081924,0.213739,1.0,-0.120587,-0.173332,-0.105249,-0.059176,0.199205,0.031503


### Model

In [25]:
from sklearn.model_selection import train_test_split
train_X = data_st[["bracket_number", "horse_number", "weight_to_carry", "win_odds", "horse_weight"]]
train_y = data_st["result"]
(train_X, test_X, train_y, test_y) = train_test_split(train_X, train_y, test_size=0.25, random_state=666)

### 重回帰

In [26]:
from sklearn.linear_model import LinearRegression
model_LR = LinearRegression()
model_LR.fit(train_X, train_y)
LR_y_test_pred = model_LR.predict(test_X)
LR_y_train_pred = model_LR.predict(train_X)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### 予測精度算出

In [None]:
from sklearn.metrics import mean_squared_error
print("LR RMSE train data: ", np.sqrt(mean_squared_error(train_y, LR_y_train_pred)))
print("LR RMSE train data: ", np.sqrt(mean_squared_error(test_y, LR_y_test_pred)))

: 

In [None]:
from sklearn.metrics import r2_score
print("LR r^2 train data: ", r2_score(train_y, LR_y_train_pred))
print("LR r^2 train data: ", r2_score(test_y, LR_y_test_pred))

: 

In [None]:
from sklearn.metrics import mean_absolute_error
print("LR MAE train data: ", mean_absolute_error(train_y, LR_y_train_pred))
print("LR MAE train data: ", mean_absolute_error(test_y, LR_y_test_pred))

: 

In [None]:
flg, ax = plt.subplots(figsize=(10, 6.0))
plt.xticks(rotation=45, fontsize=9)

plt.scatter(train_y, LR_y_train_pred)
plt.plot(train_y, train_y, "red")

plt.legend(fontsize=14)

: 

In [None]:
flg, ax = plt.subplots(figsize=(10, 6.0))
plt.xticks(rotation=45, fontsize=9)

plt.scatter(test_y, LR_y_test_pred)
plt.plot(test_y, test_y, "red")

plt.legend(fontsize=14)

: 