# Cleaning the Training Data

In this notebook, I clean the raw data so that it is ready for model building.

In [1]:
import numpy as np
import pandas as pd

Many of the columns do not have the correct type. The following function casts the flag columns to booleans.

In [2]:
def fix_types(df):
    """Cast the flag columns of ``df`` to real booleans, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the per-player flag columns listed below, plus
        ``game_ball_has_been_hit`` and ``game_is_overtime``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with every flag column converted to ``bool``.

    Notes
    -----
    A missing value in any flag column becomes ``False``. In
    ``game_is_overtime`` the string ``"no_OT"`` also becomes ``False``;
    every other value is converted by its truthiness.
    """
    flag_columns = [
        "blue_handbrake", "orange_handbrake",
        "blue_ball_cam", "orange_ball_cam",
        "blue_boost_active", "orange_boost_active",
        "game_ball_has_been_hit",
        "blue_dodge_active", "orange_dodge_active",
        "blue_jump_active", "orange_jump_active",
        "blue_double_jump_active", "orange_double_jump_active",
    ]
    for column in flag_columns:
        values = df[column]
        # NaN is truthy, so a bare astype("bool") would silently turn missing
        # flags into True; mask those rows out so they end up False instead.
        df[column] = values.astype("bool") & values.notna()

    overtime = df["game_is_overtime"]
    # False when the value is missing or is the "no_OT" marker; otherwise the
    # truthiness of whatever value is stored.
    df["game_is_overtime"] = (
        overtime.notna() & overtime.ne("no_OT") & overtime.astype("bool")
    )

    return df

Using the existing columns, we can derive new columns that may be good predictors: which team last hit the ball, the ball's distance to each goal, and whether each player is currently demolished.

In [3]:
def create_new_columns(df):
    """Derive possession, goal-distance and demolition features, in place.

    Adds, in this order, the columns ``blue_possesion``, ``orange_possesion``,
    ``ball_dist_to_orange_goal``, ``ball_dist_to_blue_goal``, ``blue_dead``
    and ``orange_dead``, and removes ``ball_hit_team_no``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ball_hit_team_no``, ``ball_pos_x/y/z``, ``blue_pos_x``
        and ``orange_pos_x`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # Team number 0 marks blue possession and 1 marks orange; any other value
    # (including a missing one) leaves both flags False.
    last_touch_team = df["ball_hit_team_no"]
    df["blue_possesion"] = last_touch_team.eq(0)
    df["orange_possesion"] = last_touch_team.eq(1)
    df.drop(columns=["ball_hit_team_no"], inplace=True)

    # Euclidean distance from the ball to each goal; the goals sit at x = 0,
    # y = +/-5120, and z = 320 is roughly the centre of the goal in height.
    x_sq = (0 - df["ball_pos_x"]) ** 2
    z_sq = (320 - df["ball_pos_z"]) ** 2
    for column, goal_y in (("ball_dist_to_orange_goal", 5120),
                           ("ball_dist_to_blue_goal", -5120)):
        y_sq = (goal_y - df["ball_pos_y"]) ** 2
        df[column] = (x_sq + y_sq + z_sq) ** 0.5

    # A player whose coordinates are null is demolished (dead for three seconds).
    for team in ("blue", "orange"):
        df[team + "_dead"] = df[team + "_pos_x"].isnull()

    return df

There are some missing values that we can fix. Specifically, velocities are missing when they should be 0. Also, when a player is demolished, they respawn in one of four locations. Therefore, we can replace the missing values for a demolished player by randomly selecting one of the potential respawn locations.

In [4]:
def _respawn_demolished(df, team, spawn_y, spawn_rot_y, rng=None):
    """Overwrite the state of ``team``'s demolished rows with a respawn state."""
    respawn_x = [-2688.0, 2688.0, -2304.0, 2304.0]
    dead = df[team + "_dead"].eq(True)

    # Fall back to NumPy's global random state when no generator is supplied.
    chooser = np.random if rng is None else rng
    df.loc[dead, team + "_pos_x"] = chooser.choice(respawn_x, size=dead.sum())

    # NOTE(review): the numeric constants below are presumably the in-game
    # state right after a respawn -- confirm against the raw data.
    respawn_state = [
        ("_pos_y", spawn_y),
        ("_pos_z", 33.51),
        ("_vel_x", 0.0),
        ("_vel_y", 0.0),
        ("_vel_z", -541.0),
        ("_ang_vel_x", 0.0),
        ("_ang_vel_y", 0.0),
        ("_ang_vel_z", 0.0),
        ("_rot_x", 0.0),
        ("_rot_y", spawn_rot_y),
        ("_rot_z", 0.0),
        ("_boost", 85.0),
    ]
    for suffix, value in respawn_state:
        df.loc[dead, team + suffix] = value


def _add_ball_distance(df, team, negative_when_y_greater):
    """Add ``<team>_dist_to_ball`` (signed) and ``<team>_behind_ball`` columns."""
    distance = ((df[team + "_pos_x"] - df["ball_pos_x"]) ** 2 +
                (df[team + "_pos_y"] - df["ball_pos_y"]) ** 2 +
                (df[team + "_pos_z"] - df["ball_pos_z"]) ** 2) ** (1 / 2)

    # The distance is made negative when the ball is behind the player in
    # terms of y location; which y direction counts as "behind" differs per team.
    if negative_when_y_greater:
        ball_is_behind = df[team + "_pos_y"] > df["ball_pos_y"]
    else:
        ball_is_behind = df[team + "_pos_y"] < df["ball_pos_y"]
    df[team + "_dist_to_ball"] = distance.mask(ball_is_behind, -distance)

    # True when the signed distance is strictly positive, i.e. the player is
    # between the ball and their own goal.
    df[team + "_behind_ball"] = df[team + "_dist_to_ball"] > 0


def fix_nas(df, rng=None):
    """Fill missing values and add player-to-ball features, in place.

    Steps, in order:

    1. Missing ball velocities, throttle, boost and boost-collect values
       become 0; missing steer values become 127.5.
    2. Rows flagged ``blue_dead`` / ``orange_dead`` get a respawn state: a
       randomly chosen respawn x position plus fixed position, velocity,
       rotation and boost values.
    3. ``<team>_dist_to_ball`` and ``<team>_behind_ball`` are added for blue
       and then orange.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already contains ``blue_dead`` / ``orange_dead`` and the
        ball and player state columns.
    rng : object with a ``choice(a, size=...)`` method, optional
        Random source for the respawn x positions, for example
        ``numpy.random.default_rng(seed)``. When ``None`` (the default),
        NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # NOTE(review): 127.5 is presumably the neutral steer value -- confirm.
    fill_values = {
        "ball_vel_x": 0, "ball_vel_y": 0, "ball_vel_z": 0,
        "ball_ang_vel_x": 0, "ball_ang_vel_y": 0, "ball_ang_vel_z": 0,
        "blue_steer": 127.5, "orange_steer": 127.5,
        "blue_throttle": 0, "orange_throttle": 0,
        "blue_boost": 0, "orange_boost": 0,
        "blue_boost_collect": 0, "orange_boost_collect": 0,
    }
    for column, fill in fill_values.items():
        df.loc[df[column].isnull(), column] = fill

    # Demolition fixes. Blue is handled before orange so the random draws are
    # consumed in a fixed order.
    _respawn_demolished(df, "blue", spawn_y=-4608.0, spawn_rot_y=1.5708, rng=rng)
    _respawn_demolished(df, "orange", spawn_y=4608.0, spawn_rot_y=-1.5708, rng=rng)

    # Now that every player position is filled in, each player's distance to
    # the ball can be computed.
    _add_ball_distance(df, "blue", negative_when_y_greater=True)
    _add_ball_distance(df, "orange", negative_when_y_greater=False)

    return df

In [5]:
def data_cleaning(df):
    """Run the full cleaning pipeline and return the cleaned frame.

    Rows with a missing ``game_next_goal`` or a missing ``ball_pos_x`` are
    dropped, then ``fix_types``, ``create_new_columns`` and ``fix_nas`` are
    applied in that order. Finally ``game_next_goal`` is moved to be the last
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as loaded from one of the ``*_v3.pkl`` files. It is not
        modified; the pipeline works on a copy.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. Its index holds the row positions of the kept
        rows in the input frame, so it has gaps where rows were dropped.
    """
    # Renumber rows first so the surviving index values are positions in the
    # input, without touching the caller's frame.
    df = df.reset_index(drop=True)

    keep = df["game_next_goal"].notnull() & df["ball_pos_x"].notnull()
    # Take an explicit copy: the steps below assign columns in place, which on
    # a plain slice can trigger SettingWithCopyWarning.
    df = df.loc[keep].copy()

    df = fix_types(df)

    df = create_new_columns(df)

    df = fix_nas(df)

    # Move the target to the last column regardless of how many columns the
    # frame has; assigning a popped column appends it at the end.
    target = df.pop("game_next_goal")
    df["game_next_goal"] = target

    return df

With the cleaning functions defined, we can load each data set, clean it, and store the result as a pickle file.

In [6]:
# Load the first chunk of raw training data and display its (rows, columns).
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
df_train = pd.read_pickle("training_data1_v3.pkl")
df_train.shape

(168558, 69)

In [7]:
# Stack the second chunk of raw training data under the first.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append's defaults, it keeps each chunk's own index values.
df_train = pd.concat([df_train, pd.read_pickle("training_data2_v3.pkl")])
df_train.shape

(336895, 69)

In [8]:
# Stack the third chunk of raw training data under the first two.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append's defaults, it keeps each chunk's own index values.
df_train = pd.concat([df_train, pd.read_pickle("training_data3_v3.pkl")])
df_train.shape

(510558, 69)

In [9]:
# Run the cleaning pipeline on the combined training data and display the
# resulting (rows, columns).
df_train = data_cleaning(df_train)
df_train.shape

(467879, 78)

In [10]:
# Persist the cleaned training data.
df_train.to_pickle("training_data_final_v3.pkl")

In [11]:
# Load the raw testing data (only unpickle files from a trusted source).
df_test = pd.read_pickle("testing_data_v3.pkl")

In [12]:
# Run the same cleaning pipeline on the testing data and display its shape.
df_test = data_cleaning(df_test)
df_test.shape

(165136, 78)

In [13]:
# Persist the cleaned testing data.
df_test.to_pickle("testing_data_final_v3.pkl")

In [14]:
# Load the raw accuracy data (only unpickle files from a trusted source).
df_accuracy = pd.read_pickle("accuracy_data_v3.pkl")

In [15]:
# Run the same cleaning pipeline on the accuracy data and display its shape.
df_accuracy = data_cleaning(df_accuracy)
df_accuracy.shape

(164639, 78)

In [16]:
# Persist the cleaned accuracy data.
df_accuracy.to_pickle("accuracy_data_final_v3.pkl")

In [6]:
# Load the raw analysis data (only unpickle files from a trusted source).
df_analysis = pd.read_pickle("analysis_data_v3.pkl")

In [7]:
# Run the same cleaning pipeline on the analysis data and display its shape.
df_analysis = data_cleaning(df_analysis)
df_analysis.shape

(10116, 78)

In [9]:
# Persist the cleaned analysis data.
df_analysis.to_pickle("analysis_data_final_v3.pkl")