In [23]:
import pandas as pd


In [24]:
# low_memory=False has pandas infer each column's dtype from the whole file rather than
# chunk by chunk. Missing wgDf entries are replaced with 0.
df = (
    pd.read_csv('시험용.csv', low_memory=False)
    .assign(wgDf=lambda frame: frame['wgDf'].fillna(0))
)


## One-Hot Encoding

In [25]:
# One-hot encode all four categorical columns in a single pass. The indicator columns are
# appended in the order the source columns are listed, which matches encoding them one by one.
df = pd.get_dummies(df, columns=['ageCond', 'budam', 'name', 'weather'])

# Keep only the rows whose rcName is '일반'. The explicit .copy() makes the filtered frame
# independent of the unfiltered one, so the column assignments in the following cells
# do not raise SettingWithCopyWarning.
df = df[df['rcName'] == '일반'].copy()
df.head()

Unnamed: 0,birthday,age,winOdds,sj_3cOrd,ord,wgHr,track,wgDf,humidity,buga1,...,name_일본,name_캐나다,name_한국,name_호주,weather_강풍,weather_눈,weather_맑음,weather_비,weather_안개,weather_흐림
0,20150308,4,10.7,8,1,472,1,-12.0,2,0,...,False,False,True,False,False,False,True,False,False,False
1,20160326,3,2.8,2,2,457,1,3.0,2,0,...,False,False,True,False,False,False,True,False,False,False
2,20160520,3,3.4,6,3,462,1,-1.0,2,0,...,False,False,True,False,False,False,True,False,False,False
3,20150314,4,22.0,3,4,477,1,-10.0,2,0,...,False,False,True,False,False,False,True,False,False,False
4,20160412,3,94.8,10,5,469,1,20.0,2,0,...,False,False,True,False,False,False,True,False,False,False


## Time for Feature Engineering

### rcDate and time since the last race

In [26]:
# pd.to_datetime treats plain integers as nanoseconds since the Unix epoch, so an integer
# date such as 20190105 would land on 1970-01-01 and every day gap computed later would be 0.
# NOTE(review): assumes an integer rcDate is encoded as YYYYMMDD, like the birthday values
# shown in the preview above — confirm against the CSV.
if pd.api.types.is_integer_dtype(df['rcDate']):
    df['rcDate'] = pd.to_datetime(df['rcDate'].astype(str), format='%Y%m%d')
else:
    # Strings and already-parsed datetimes go through the default parser unchanged
    df['rcDate'] = pd.to_datetime(df['rcDate'])

In [27]:
# Days elapsed since the same horse's previous race. Rows are put in rcDate order before
# differencing so the result does not depend on the order of the input file (the stable
# sort keeps same-day rows in file order). The Series is aligned back to df by index label,
# so df's own row order is untouched.
days_since_previous = (
    df.sort_values('rcDate', kind='stable')
    .groupby('hrName')['rcDate']
    .diff()
    .dt.days
)

# A horse's first recorded race has no predecessor; 9999 marks "no previous race"
df['time_since_last_race'] = days_since_previous.fillna(9999)

# Create the 'timeRace' column based on the specified conditions
def categorize_time(days):
    """Bucket a gap between races, given in days, into an ordinal code from 0 to 7.

    The code is 7 for more than 90 days, 6 for more than 75, 5 for more than 60,
    4 for more than 45, 3 for more than 30, 2 for more than 15, 1 for more than 1,
    and 0 for everything else (including NaN, which fails every comparison).
    """
    # Exclusive lower bounds, from the widest gap down; position 0 maps to code 7
    lower_bounds = (90, 75, 60, 45, 30, 15, 1)
    for offset, bound in enumerate(lower_bounds):
        if days > bound:
            return 7 - offset
    return 0

# Store the bucket code for each gap, then discard the raw day count
df['timeRace'] = df['time_since_last_race'].map(categorize_time)
df = df.drop(columns='time_since_last_race')



### Finishing position in past races

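`race_position` (finishing order divided by the largest finishing order in the race) is only an intermediate value; it is dropped once the good-race flags below have been built.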

In [28]:
# Rows that share all of these values are treated as one race
race_keys = ['rankOrigin', 'noob', 'maxRt', 'rank', 'rcDate', 'rcDist']

# The largest finishing order in a race stands in for its field size, so race_position
# is the finishing order scaled by that maximum
df['num_horses'] = df.groupby(race_keys)['ord'].transform('max')
df['race_position'] = df['ord'].div(df['num_horses'])
df = df.drop(columns='num_horses')


In [29]:
def calculate_good_race_past(group):
    """Flag each row by whether the three races before it were good on average.

    The group is ordered by rcDate. From the fourth row onward the flag is 1 when the
    mean race_position of the three immediately preceding rows is below 0.4, else 0.
    The first three rows always get 0 because they have fewer than three earlier races.

    Returns a Series of 0/1 flags indexed like the date-sorted group.
    """
    ordered = group.sort_values(by='rcDate')
    positions = ordered['race_position']

    def flag_at(row):
        # Fewer than three earlier races: no basis for a flag
        if row < 3:
            return 0
        recent_mean = positions.iloc[row - 3:row].mean()
        return 1 if recent_mean < 0.4 else 0

    flags = [flag_at(row) for row in range(len(ordered))]
    return pd.Series(flags, index=ordered.index)

# Build the same good-race flag once per horse, per jockey and per trainer
good_race_targets = (
    ('horse_good_race', 'hrName'),
    ('jockey_good_race', 'jkName'),
    ('trainer_good_race', 'trName'),
)
for feature, entity in good_race_targets:
    flags = df.groupby(entity).apply(calculate_good_race_past)
    # Drop the group-key index level so the flags line up with df's own index
    df[feature] = flags.reset_index(level=0, drop=True)


In [30]:
# race_position was only needed to build the good-race flags
df = df.drop(columns='race_position')


### Lengths behind winner in past races

In [31]:
# Distance from the first-place horse: running total of diffUnit within each race.
# NOTE(review): assumes rows within a race are stored in finishing order — confirm.
race_keys = ['rcDate', 'rankOrigin', 'noob', 'maxRt', 'rank', 'rcDist']
df['chakcha'] = df.groupby(race_keys)['diffUnit'].cumsum()

In [32]:
def calculate_distFirst_past(group):
    """Average chakcha over up to three races preceding each row.

    The group is ordered by rcDate. Each row receives the mean chakcha of the
    (at most three) rows immediately before it; the earliest row, which has no
    history, receives 0.

    Returns a Series indexed like the date-sorted group.
    """
    ordered = group.sort_values(by='rcDate')
    gaps = ordered['chakcha']
    averages = [
        gaps.iloc[max(row - 3, 0):row].mean() if row else 0
        for row in range(len(ordered))
    ]
    return pd.Series(averages, index=ordered.index)

# Per-horse average distance behind the winner over the preceding races
per_horse_gap = df.groupby('hrName').apply(calculate_distFirst_past)
# Drop the hrName index level so the values line up with df's own index
df['AVGdistFirst'] = per_horse_gap.reset_index(level=0, drop=True)


### Weight Carried in Past Races

In [33]:
def calculate_avg_weight(group):
    """Rolling mean of wgBudam over a three-row window, in rcDate order.

    The window ends at the current row, so the current race's wgBudam is part of
    the average. With min_periods=1 the earliest rows average over however many
    rows are available.

    Returns a Series indexed like the date-sorted group.
    """
    return (
        group.sort_values(by='rcDate')['wgBudam']
        .rolling(window=3, min_periods=1)
        .mean()
    )

# Per-horse rolling average of the carried weight
per_horse_weight = df.groupby('hrName').apply(calculate_avg_weight)
# Drop the hrName index level so the values line up with df's own index
df['AVGwgBudam'] = per_horse_weight.reset_index(level=0, drop=True)


### Mid race order conversion

In [34]:
# Remove these sectional order and accumulated-time columns from the data.
# sjS1fOrd, sjG3fOrd and sjG1fOrd are not in this list; the next cell reads them.
sectional_columns = [
    "sj_3cOrd", "sj_1cOrd", "sj_2cOrd", "sj_4cOrd",
    "seG1fAccTime", "seG3fAccTime", "seS1fAccTime",
    "se_1cAccTime", "se_2cAccTime", "se_3cAccTime", "se_4cAccTime",
]
df = df.drop(columns=sectional_columns)


In [35]:

def _past_mean(group, column, window=3):
    """Mean of `column` over up to `window` rows preceding each row, in rcDate order.

    Parameters
    ----------
    group : DataFrame holding at least the 'rcDate' column and `column`.
    column : name of the column to average.
    window : maximum number of preceding rows included in each average.

    Returns
    -------
    Series indexed like the date-sorted group. The earliest row has no history
    and receives 0; every later row receives the mean of the preceding
    min(window, rows so far) values.
    """
    ordered = group.sort_values(by='rcDate')
    values = ordered[column]
    averages = [
        values.iloc[max(row - window, 0):row].mean() if row else 0
        for row in range(len(ordered))
    ]
    return pd.Series(averages, index=ordered.index)


def calculate_sjS1fOrd_past(group):
    """Average sjS1fOrd over up to three races preceding each row (0 for the first row)."""
    return _past_mean(group, 'sjS1fOrd')


def calculate_sjG3fOrd_past(group):
    """Average sjG3fOrd over up to three races preceding each row (0 for the first row)."""
    return _past_mean(group, 'sjG3fOrd')


def calculate_sjG1fOrd_past(group):
    """Average sjG1fOrd over up to three races preceding each row (0 for the first row)."""
    return _past_mean(group, 'sjG1fOrd')

# Per-horse averages of each sectional order over the preceding races
past_order_builders = {
    'AVGsjS1fOrd': calculate_sjS1fOrd_past,
    'AVGsjG3fOrd': calculate_sjG3fOrd_past,
    'AVGsjG1fOrd': calculate_sjG1fOrd_past,
}
for feature, builder in past_order_builders.items():
    # Drop the hrName index level so the values line up with df's own index
    df[feature] = df.groupby('hrName').apply(builder).reset_index(level=0, drop=True)




In [36]:
# Quick look at rcTime; the column is removed by the drop in the next cell
df['rcTime']

0         63.1
1         63.1
2         63.9
3         64.1
4         64.1
         ...  
55000    113.3
55001    113.6
55002    113.8
55003    114.0
55004    119.7
Name: rcTime, Length: 51513, dtype: float64

### Dropping columns not used in the final dataset

In [37]:
# Remove the identifier columns and the raw columns that have already been turned into features
unused_columns = [
    'buga1', 'buga2', 'buga3',
    'hrName', 'hrNo', 'jkName', 'jkNo', 'owName', 'owNo',
    'rcName', 'rcTime',
    'sjG1fOrd', 'sjG3fOrd', 'sjS1fOrd',
    'trName', 'trNo',
]
df = df.drop(columns=unused_columns)

### Converting wgBudamBigo to int

In [38]:
# Cast to str, turn the '*' marker into 1 and '-' into 0, then cast the column to int.
# Any entry that is neither marker must itself be int-convertible or the final cast raises.
df['wgBudamBigo'] = (
    df['wgBudamBigo']
    .astype(str)
    .replace({'*': 1, '-': 0})
    .astype(int)
)


## Final output

In [39]:
# Write the engineered dataset to disk; index=False leaves the DataFrame index out of the file
df.to_csv("Final_research.csv", index=False)
