In [1]:
# Importing libraries
library(tidyverse)
library(tidymodels)
library(repr)
library(RColorBrewer)


player_data = read_csv("https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz")
session_data = read_csv("https://drive.google.com/uc?export=download&id=14O91N5OlVkvdGxXNJUj5jIsV5RexhzbB")

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

In [2]:
set.seed(1)
# Cleaning and wrangling data

# Remove subscription, gender, individualId, and organizationName variables from player dataset
cleaned_player_data <- select(player_data, experience, hashedEmail, played_hours, name, age)
cleaned_player_data

# Separate start_time and end_time into individual date & time columns for session dataset
separated_session_data <- session_data |>
    separate(start_time, into = c("start_date", "start_time"), sep = " ") |>
    separate(end_time, into = c("end_date", "end_time"), sep = " ")

wrangled_session_data <- separated_session_data |>
    mutate(start_date = as.POSIXct(start_date, format = "%d/%m/%Y"), end_date = as.POSIXct(end_date, format = "%d/%m/%Y")) |>
    mutate(start_time = as.POSIXct(start_time, format = "%H:%M"))

# Remove original_start_time & original_end_time from session dataset
cleaned_session_data <- select(wrangled_session_data, hashedEmail, start_date, start_time, end_date, end_time)
cleaned_session_data

experience,hashedEmail,played_hours,name,age
<chr>,<chr>,<dbl>,<chr>,<dbl>
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9
Veteran,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,17
Veteran,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,17
Amateur,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,21
Regular,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,21
Amateur,f58aad5996a435f16b0284a3b267f973f9af99e7a89bee0430055a44fa92f977,0.0,Adrian,17
Regular,8e594b8953193b26f498db95a508b03c6fe1c24bb5251d392c18a0da9a722807,0.0,Luna,19
Amateur,1d2371d8a35c8831034b25bda8764539ab7db0f63938696917c447128a2540dd,0.0,Emerson,21
Amateur,8b71f4d66a38389b7528bb38ba6eb71157733df7d1740371852a797ae97d82d1,0.1,Natalie,17
Veteran,bbe2d83de678f519c4b3daa7265e683b4fe2d814077f9094afd11d8f217039ec,0.0,Nyla,22


hashedEmail,start_date,start_time,end_date,end_time
<chr>,<dttm>,<dttm>,<dttm>,<chr>
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,2024-06-30,2024-11-29 18:12:00,2024-06-30,18:24
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,2024-06-17,2024-11-29 23:33:00,2024-06-17,23:46
f8f5477f5a2e53616ae37421b1c660b971192bd8ff77e3398304c7ae42581fdc,2024-07-25,2024-11-29 17:34:00,2024-07-25,17:57
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,2024-07-25,2024-11-29 03:22:00,2024-07-25,03:58
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,2024-05-25,2024-11-29 16:01:00,2024-05-25,16:12
bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8aa0c4bf95ccee6bf,2024-06-23,2024-11-29 15:08:00,2024-06-23,17:10
fd6563a4e0f6f4273580e5fedbd8dda64990447aea5a33cbb5e894a3867ca44d,2024-04-15,2024-11-29 07:12:00,2024-04-15,07:21
ad6390295640af1ed0e45ffc58a53b2d9074b0eea694b16210addd44d7c81f83,2024-09-21,2024-11-29 02:13:00,2024-09-21,02:30
96e190b0bf3923cd8d349eee467c09d1130af143335779251492eb4c2c058a5f,2024-06-21,2024-11-29 02:31:00,2024-06-21,02:49
36d9cbb4c6bc0c1a6911436d2da0d09ec625e43e6552f575d4acc9cf487c4686,2024-05-16,2024-11-29 05:13:00,2024-05-16,05:52


In [3]:
# Combining both datasets in order to split into training and testing sets 
combined_data <- cleaned_player_data |>
  inner_join(cleaned_session_data, by = "hashedEmail")

# Splitting dataframe into training and testing datasets
split_data <- initial_split(combined_data, prop = 3/4, strata = hashedEmail)

training_data = training(split_data)
testing_data = testing(split_data)

head(training_data)
head(testing_data)

experience,hashedEmail,played_hours,name,age,start_date,start_time,end_date,end_time
<chr>,<chr>,<dbl>,<chr>,<dbl>,<dttm>,<dttm>,<dttm>,<chr>
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-08-08,2024-11-29 00:21:00,2024-08-08,01:35
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-09-08,2024-11-29 18:29:00,2024-09-08,20:12
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-08-11,2024-11-29 15:40:00,2024-08-11,15:52
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-05-04,2024-11-29 22:06:00,2024-05-04,22:17
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-08-24,2024-11-29 23:07:00,2024-08-25,02:38
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-06-30,2024-11-29 02:56:00,2024-06-30,03:16


experience,hashedEmail,played_hours,name,age,start_date,start_time,end_date,end_time
<chr>,<chr>,<dbl>,<chr>,<dbl>,<dttm>,<dttm>,<dttm>,<chr>
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-08-08,2024-11-29 02:41:00,2024-08-08,03:25
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-05-05,2024-11-29 22:21:00,2024-05-05,23:17
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-06-15,2024-11-29 16:37:00,2024-06-15,18:37
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-08-03,2024-11-29 02:30:00,2024-08-03,03:54
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-04-08,2024-11-29 01:17:00,2024-04-08,01:43
Pro,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,9,2024-08-25,2024-11-29 21:54:00,2024-08-26,01:01


In [7]:
# Summarizing the training data to obtain average hours played by player type
data_summary <- training_data |> 
    group_by(experience) |>
    summarize(avg_played_hours = mean(played_hours, na.rm = TRUE),
              min_age = min(age, na.rm = TRUE),
              max_age = max(age, na.rm = TRUE),
              player_count = n())

data_summary

experience,avg_played_hours,min_age,max_age,player_count
<chr>,<dbl>,<dbl>,<dbl>,<int>
Amateur,63.932956,11,32,619
Beginner,7.148052,17,37,77
Pro,19.777419,9,91,31
Regular,182.999739,8,99,383
Veteran,3.095,16,44,40
