前処理

In [2]:
import pandas as pd

# Column names for the yearly race-result CSVs. They are passed to read_csv via
# `names=`, so every line of each file is read as a data row.
columns = ["着順", "枠番", "馬番", "馬名", "性齢", "斤量", "騎手", "タイム", "着差", "単勝", "人気", "馬体重", "調教師"]
# Years to load; each one maps to ../data/result/<year>_data.csv.
year_list = ["2022", "2021", "2020", "2019", "2018", "2017", "2016", "2015", "2014"]
# Alternative year selection kept from the original notebook:
#year_list = ["2023"]

# Read each yearly file and concatenate once at the end. Calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration, and
# seeding the loop with an empty DataFrame triggers pandas' deprecation warning
# about empty entries in concat.
yearly_frames = []
for year in year_list:
    print(year)
    file_path = "../data/result/" + year + "_data.csv"
    df = pd.read_csv(file_path, names=columns)
    yearly_frames.append(df)

if yearly_frames:
    # ignore_index gives the merged frame a unique RangeIndex instead of
    # repeating 0..n-1 once per year.
    merged_df = pd.concat(yearly_frames, ignore_index=True)
else:
    # Same result as before when year_list is empty: an empty frame.
    merged_df = pd.DataFrame(columns=columns)

# --- Data shaping ---
# Split the combined sex/age field. The two parts are extracted independently:
# the previous single pattern r'([牡牝])(\d+)' did not match at all when the sex
# symbol was something other than 牡/牝 (presumably セ for geldings -- TODO
# confirm against the data), which threw the age away as well and let it be
# replaced by the default of 3 below.
sex_age = merged_df['性齢']
sex_symbol = sex_age.str.extract(r'([牡牝])', expand=False)
age_digits = sex_age.str.extract(r'(\d+)', expand=False)

# Encode sex as 牡 -> 0, 牝 -> 1; anything unmatched falls back to 0 (the same
# default as before). The result is assigned back explicitly: calling
# replace/fillna with inplace=True on a column selection is chained assignment,
# which warns on recent pandas and does not update the frame under
# Copy-on-Write.
merged_df['性別'] = sex_symbol.map({"牡": 0, "牝": 1}).fillna(0).astype(int)
# Missing ages still default to 3, then the column is cast to int.
merged_df['年齢'] = age_digits.fillna(3).astype(int)

# Remove rows whose finishing position is one of these non-numeric markers,
# then drop the columns that are not used afterwards. Chaining a non-inplace
# drop yields a fresh frame, so the column assignments below do not act on a
# filtered slice.
# NOTE(review): any other non-numeric marker in 着順 would make astype(int)
# below raise -- extend this list if new data contains one.
values_to_remove = ['中', '取', '除', '4(降)', '5(降)']
is_excluded = merged_df['着順'].isin(values_to_remove)
merged_df = merged_df[~is_excluded].drop(
    columns=['馬番', '性齢', '馬体重', 'タイム', '着差', '単勝', '調教師']
)

# --- Type conversion ---
# Finishing position becomes the label: 1, 2 and 3 are kept, everything worse
# than 3rd is collapsed to 0.
merged_df['着順'] = merged_df['着順'].astype(int)
merged_df.loc[merged_df['着順'] > 3, '着順'] = 0
merged_df['枠番'] = merged_df['枠番'].astype(int)

# --- Label-encode horse and jockey names ---
# IDs start at 1 and follow order of first appearance in the merged data.
unique_horses = merged_df['馬名'].unique()
unique_jockeys = merged_df['騎手'].unique()
horse_to_number = {horse: idx for idx, horse in enumerate(unique_horses, start=1)}
jockey_to_number = {jockey: idx for idx, jockey in enumerate(unique_jockeys, start=1)}

# Replace horse and jockey names with their numeric IDs.
merged_df['馬名'] = merged_df['馬名'].map(horse_to_number)
merged_df['騎手'] = merged_df['騎手'].map(jockey_to_number)

# Build lookup tables from name to numeric ID.
horse_number_df = pd.DataFrame(list(horse_to_number.items()), columns=['馬名', '馬名数字'])
jockey_number_df = pd.DataFrame(list(jockey_to_number.items()), columns=['騎手', '騎手数字'])

# Save the lookup tables as CSV files.
horse_number_df.to_csv('horse_number_mapping.csv', index=False)
jockey_number_df.to_csv('jockey_number_mapping.csv', index=False)

2022
2021
2020
2019
2018
2017
2016
2015
2014


CSV出力

In [3]:
# Write the preprocessed table to train.csv as UTF-8, without the index column.
train_csv_path = 'train.csv'
merged_df.to_csv(train_csv_path, index=False, encoding='utf-8')

In [5]:
# Count the rows per value of the 着順 label column.
finish_labels = merged_df['着順']
finish_labels.value_counts()

着順
0    6806
1     582
3     580
2     579
Name: count, dtype: int64