In [6]:
library(repr)
library(tidyverse)
library(tidymodels)
library(readxl)
library(janitor)

In [30]:
# Read the raw player statistics; `skip = 1` skips the first line of the file
# before the header row is read.
tennis_data <- read_csv("Data/player_stats.csv", skip = 1)

# Keep the five columns used below and turn the three text columns into
# numbers. Each of `age`, `current_rank` and `height` is split on " " into two
# pieces: the first piece is kept under its new name and the second is dropped
# directly (`NA` in `into` omits that piece), so no temporary columns have to
# be created and removed again. The kept pieces are then converted to numeric.
tennis_df <- tennis_data |>
    clean_names() |>
    select(age, current_rank, prize_money, height, seasons) |>
    separate(col = age, into = c("age", NA), sep = " ") |>
    separate(col = current_rank, into = c("rank", NA), sep = " ") |>
    separate(col = height, into = c("height_cm", NA), sep = " ") |>
    mutate(across(c(age, rank, height_cm), as.numeric))

# Mean of every column from `age` through `seasons`, ignoring missing values.
# The result is a single-row table with one column per input column.
means <- summarise(
    tennis_df,
    across(age:seasons, \(x) mean(x, na.rm = TRUE))
)
means

# Replace missing values in `age`, `prize_money`, `height_cm` and `seasons`
# with that column's mean, then sort the rows by `rank`. The mean is computed
# here from the non-missing values rather than pasted in as a rounded constant,
# so the imputed value stays correct if the input file changes. `rank` is
# deliberately left untouched.
# NOTE(review): the means are taken over all rows, i.e. before the train/test
# split further down - confirm that is intended.
tennis_df <- tennis_df |>
    mutate(across(
        c(age, prize_money, height_cm, seasons),
        \(x) replace_na(x, mean(x, na.rm = TRUE))
    )) |>
    arrange(rank)
    


# 25%, 50% and 75% quantiles of `rank`, ignoring missing ranks. `pull()`
# extracts the column as a plain numeric vector, which is the input
# `quantile()` is documented for; passing a one-column tibble only worked
# as a side effect of how `na.rm = TRUE` subsets its argument.
rank_quantile <- tennis_df |>
    pull(rank) |>
    quantile(probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
rank_quantile

# Add one logical flag per rank quartile (`category_1` = best ranks ...
# `category_4` = worst ranks). The cut points are read from `rank_quantile`
# instead of being typed in by hand, and every upper bound is inclusive: with
# a strict `<` on both sides, a rank exactly equal to a cut point matched no
# category at all and the row was lost in the later filter step.
rank_cuts <- unname(rank_quantile)

category_columns <- tennis_df |>
    mutate(category_1 = rank <= rank_cuts[1],
           category_2 = rank_cuts[1] < rank & rank <= rank_cuts[2],
           category_3 = rank_cuts[2] < rank & rank <= rank_cuts[3],
           category_4 = rank_cuts[3] < rank)


# Collapse the four logical flags into a single `category` column: pivot to
# one row per (original row, category) pair, keep only the pairs whose flag is
# TRUE, then drop the flag and any row that still contains a missing value.
tennis_df_longer <- category_columns |>
    pivot_longer(cols = category_1:category_4,
                 names_to = "category",
                 values_to = "status") |>
    # `status` is already logical, so filter on it directly instead of
    # comparing it with the string "TRUE"; rows whose flag is NA are dropped.
    filter(status) |>
    select(-status) |>
    drop_na()
tennis_df_longer
    
# Split the categorised data 75% / 25% into training and testing sets,
# stratified on `category`.
# NOTE(review): no set.seed() call is visible before this point, so the
# partition may differ from run to run - confirm.
tennis_split <- tennis_df_longer |>
    initial_split(prop = 0.75, strata = category)
tennis_train <- tennis_split |> training()
tennis_test <- tennis_split |> testing()




[1m[22mNew names:
[36m•[39m `` -> `...1`
[1mRows: [22m[34m500[39m [1mColumns: [22m[34m38[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (24): Age, Country, Plays, Wikipedia, Current Rank, Best Rank, Name, Bac...
[32mdbl[39m (14): ...1, Prize Money, Turned Pro, Seasons, Titles, Best Season, Retir...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,rank,prize_money,height_cm,seasons
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
25.96794,249.0283,3416440,185.7913,6.494652


age,rank,prize_money,height_cm,seasons,category
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
33,1,119601561,185.0000,19,category_1
32,2,139144944,188.0000,17,category_1
38,3,129231891,185.0000,22,category_1
23,4,10507693,185.7913,5,category_1
26,5,22132368,185.0000,10,category_1
21,6,10425605,185.7913,4,category_1
22,7,20028563,198.0000,8,category_1
23,8,3580862,185.7913,3,category_1
31,9,12067808,183.0000,11,category_1
33,10,17930816,193.0000,17,category_1
