# Title

# Introduction

### **Question:** Can a player's gaming experience, newsletter subscription status, and age be used to predict their total hours played?

# Methods & Results

In [None]:
### Run this cell before continuing.
library(tidyverse)
library(repr)
library(tidymodels)
library(GGally)
library(ISLR)
options(repr.matrix.max.rows = 6)

In [None]:
# Load the two raw tables straight from the project's GitHub repository.
players <-
    "https://raw.githubusercontent.com/ryyuerber/DSCI100_Group_26_Final_Proj/refs/heads/main/players.csv" |>
    read_csv()
sessions <-
    "https://raw.githubusercontent.com/ryyuerber/DSCI100_Group_26_Final_Proj/refs/heads/main/sessions.csv" |>
    read_csv()

# Preview both tables.
players
sessions

In [None]:
# Number of rows in `sessions` recorded for each hashedEmail.
num_sessions <- count(sessions, hashedEmail, name = "count")

# Attach the per-player session count, divide total played hours by it, then
# drop the identifier columns and the helper count.
# Players with no matching rows in `sessions` get count = NA from the left
# join, so their avg_session_hrs is NA as well.
df <- players |>
    left_join(num_sessions, by = "hashedEmail") |>
    mutate(avg_session_hrs = played_hours / count) |>
    select(-c(hashedEmail, name, count))


In [None]:
# Convert the two categorical predictors to factors for later analysis;
# the subscribe levels are relabelled from "TRUE"/"FALSE" to "Yes"/"No".
players_mdfied <- mutate(
    df,
    experience = as_factor(experience),
    subscribe = fct_recode(as_factor(subscribe), "Yes" = "TRUE", "No" = "FALSE")
)

# Number of missing values in each column.
players_na_sum <- players_mdfied |>
    is.na() |>
    colSums()

players_mdfied
players_na_sum

In [None]:
# Summary statistics for total hours played. Every aggregate drops missing
# values so that a single NA cannot turn some statistics into NA while
# others are still reported.
hours_played_summary <- players_mdfied |>
    summarize(
        played_hours_min = min(played_hours, na.rm = TRUE),
        played_hours_max = max(played_hours, na.rm = TRUE),
        played_hours_mean = mean(played_hours, na.rm = TRUE),
        played_hours_median = median(played_hours, na.rm = TRUE),
        played_hours_sd = sd(played_hours, na.rm = TRUE)
    )

# Summary statistics for player age, ignoring missing ages.
age_summary <- players_mdfied |>
    summarize(
        age_min = min(Age, na.rm = TRUE),
        age_max = max(Age, na.rm = TRUE),
        age_mean = mean(Age, na.rm = TRUE),
        age_median = median(Age, na.rm = TRUE)
    )

# Row counts for every category of the three categorical columns, in long
# form: one row per (Column, categories) pair. `.groups = "drop"` returns an
# ungrouped tibble so later steps do not inherit the grouping by Column.
ctg_summary <- players_mdfied |>
    select(experience, subscribe, gender) |>
    pivot_longer(
        cols = everything(),
        names_to = "Column",
        values_to = "categories"
    ) |>
    group_by(Column, categories) |>
    summarize(Count = n(), .groups = "drop")

hours_played_summary
age_summary
# print() is used explicitly for the category counts.
print(ctg_summary)

In [None]:
# Add numeric encodings of the two categorical predictors:
#   num_subscribe  - 1 for "Yes", 0 for "No"
#   num_experience - 1 to 5 by experience level
# Values that match none of the listed labels become NA.
# NOTE(review): an earlier draft of this cell ranked Amateur (2) below
# Regular (3); the mapping here ranks Regular (2) below Amateur (3).
# Confirm which ordering is intended before modelling.
players_numericalized <- players_mdfied |>
    mutate(
        num_subscribe = case_when(
            subscribe == "Yes" ~ 1,
            subscribe == "No" ~ 0
        ),
        num_experience = case_when(
            experience == "Beginner" ~ 1,
            experience == "Regular" ~ 2,
            experience == "Amateur" ~ 3,
            experience == "Veteran" ~ 4,
            experience == "Pro" ~ 5
        )
    )

players_numericalized

# Fix the RNG seed so the random train/test split is reproducible between
# runs (the seed value itself is arbitrary).
set.seed(2024)

# 75% training / 25% testing split, stratified on the response variable.
players_split <- initial_split(
    players_numericalized,
    prop = 0.75,
    strata = played_hours
)
players_training <- training(players_split)
players_testing <- testing(players_split)


# Discussion

# References (If applicable)