# Data Preparation

In [1]:
import pandas as pd

data = pd.read_csv("athletes.csv")

In [2]:
import numpy as np
# Remove not relevant columns
data = data.dropna(subset=['region','age','weight','height','howlong','gender','eat', \
                               'train','background','experience','schedule','howlong', \
                               'deadlift','candj','snatch','backsq','experience',\
                               'background','schedule','howlong'])
data = data.drop(columns=['affiliate','team','name','fran','helen','grace',\
                              'filthy50','fgonebad','run400','run5k','pullups', 'train'])

# Remove Outliers

data = data[data['weight'] < 1500]
data = data[data['gender'] != '--']
data = data[data['age'] >= 18]
data = data[(data['height'] < 96) & (data['height'] > 48)]

data = data[(data['deadlift'] > 0) & (data['deadlift'] <= 1105)|((data['gender'] == 'Female') \
                 & (data['deadlift'] <= 636))]
data = data[(data['candj'] > 0) & (data['candj'] <= 395)]
data = data[(data['snatch'] > 0) & (data['snatch'] <= 496)]
data_v2 = data[(data['backsq'] > 0) & (data['backsq'] <= 1069)]

# Clean Survey Data

decline_dict = {'Decline to answer|': np.nan}
data = data.replace(decline_dict)
data = data.dropna(subset=['background','experience','schedule','howlong','eat'])

In [3]:
# Add Total Lift Column
data['total_lift'] = data['deadlift'] + data['candj'] + data['snatch'] + data['backsq']
data = data.dropna(subset=["age", "weight", "height",'total_lift'])

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30063 entries, 21 to 422961
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   athlete_id  30063 non-null  float64
 1   region      30063 non-null  object 
 2   gender      30063 non-null  object 
 3   age         30063 non-null  float64
 4   height      30063 non-null  float64
 5   weight      30063 non-null  float64
 6   candj       30063 non-null  float64
 7   snatch      30063 non-null  float64
 8   deadlift    30063 non-null  float64
 9   backsq      30063 non-null  float64
 10  eat         30063 non-null  object 
 11  background  30063 non-null  object 
 12  experience  30063 non-null  object 
 13  schedule    30063 non-null  object 
 14  howlong     30063 non-null  object 
 15  total_lift  30063 non-null  float64
dtypes: float64(9), object(7)
memory usage: 3.9+ MB


In [5]:
# Create event_timestamp (required by Feast)
data['event_timestamp'] = pd.Timestamp.now()

In [6]:
# Version 1: only 3 features
v1_df = data[['athlete_id','age', 'weight', 'height', 'event_timestamp', 'total_lift']]

# Version 2: with 4 extra features
v2_df = data[['athlete_id', 'age', 'weight', 'height', 'deadlift', 'candj', 'snatch', 'backsq', 'event_timestamp', 'total_lift']]

In [7]:
v1_df.to_parquet('./v1_features.parquet', index=False)
v2_df.to_parquet('./v2_features.parquet', index=False)