In [6]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Have scikit-learn transformers hand back pandas DataFrames rather than arrays.
set_config(transform_output="pandas")

# Switch Altair to its "vegafusion" data transformer, which makes it easier to
# chart larger datasets.
alt.data_transformers.enable("vegafusion")

In [7]:
# Location of the players CSV (a Google Drive direct-download link).
url = "https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz"

# Read the remote CSV straight into a DataFrame.
players = pd.read_csv(filepath_or_buffer=url)

# Show the loaded table.
players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [8]:
# Keep only the columns used in the rest of the analysis: the two candidate
# predictors (age, experience) and the response (played_hours).
players_filtered = players.loc[:, ["age", "played_hours", "experience"]]

# Show the reduced table.
players_filtered

Unnamed: 0,age,played_hours,experience
0,9,30.3,Pro
1,17,3.8,Veteran
2,17,0.0,Veteran
3,21,0.7,Amateur
4,21,0.1,Regular
...,...,...,...
191,17,0.0,Amateur
192,22,0.3,Veteran
193,17,0.0,Amateur
194,17,2.3,Amateur


In [9]:
# Experience labels listed in the order they are numbered (1 through 5).
# NOTE(review): this numbering places "Amateur" below "Beginner" -- confirm
# that this is the intended low-to-high ordering.
experience_levels = ["Amateur", "Beginner", "Regular", "Pro", "Veteran"]

# Label -> integer code lookup, e.g. "Amateur" -> 1 and "Veteran" -> 5.
experience_map = {
    level: code for code, level in enumerate(experience_levels, start=1)
}

# Add the numeric experience code as a new column next to the text label.
players_filtered = players_filtered.assign(
    experience_num=lambda frame: frame["experience"].map(experience_map)
)

# Show the table with the new column.
players_filtered

Unnamed: 0,age,played_hours,experience,experience_num
0,9,30.3,Pro,4
1,17,3.8,Veteran,5
2,17,0.0,Veteran,5
3,21,0.7,Amateur,1
4,21,0.1,Regular,3
...,...,...,...,...
191,17,0.0,Amateur,1
192,22,0.3,Veteran,5
193,17,0.0,Amateur,1
194,17,2.3,Amateur,1


In [10]:
# Bin edges (in hours) and the legend label for each bin. The last bin is
# open-ended so the binning does not depend on the largest value that happens
# to be in the data.
playtime_bins = [0, 1, 5, 10, 15, 20, 50, 100, 200, np.inf]
playtime_labels = ["<1", "1–5", "5–10", "10–15", "15-20", "20–50", "50–100", "100-200", "200>"]

# Add a categorical column that groups played_hours into the bins above.
players_filtered = players_filtered.assign(
    playtime_grouped=pd.cut(
        players_filtered["played_hours"],
        bins=playtime_bins,
        labels=playtime_labels,
        # pd.cut bins are right-closed, so without include_lowest the first
        # bin is (0, 1] and players with exactly 0 hours would be left as NaN
        # instead of being labelled "<1".
        include_lowest=True,
    )
)

# Scatter plot of age against the numeric experience code, coloured by the
# grouped playtime.
players_plot4 = (
    alt.Chart(
        players_filtered,
        title="Playtime in Hours: Player age and Experience Level",
    )
    .mark_circle(size=40)
    .encode(
        x=alt.X("age:Q").title("Age (years)"),
        y=alt.Y(
            "experience_num:O",
            title="Experience (numbered low to high)",
            # Flip the default ordering of the ordinal axis.
            scale=alt.Scale(reverse=True),
        ),
        # Reuse the bin labels as the legend sort order so the legend follows
        # the bins rather than alphabetical order.
        color=alt.Color("playtime_grouped:N", sort=playtime_labels)
        .scale(scheme="paired")
        .title("Playtime (hours, grouped)"),
    )
    .properties(width=850, height=200)
)
players_plot4

In [11]:
# Hold out 30% of the filtered players for testing and keep the remaining 70%
# for training; random_state=2000 fixes the shuffle so the split is reproducible.
training_data, testing_data = train_test_split(players_filtered, test_size=0.3, random_state=2000)


In [17]:
# Linear regression: predict played_hours from age and the numeric experience code.
predictor_columns = ["age", "experience_num"]
X_train = training_data[predictor_columns]
y_train = training_data["played_hours"]
X_test = testing_data[predictor_columns]
y_test = testing_data["played_hours"]

# Fit on the training split only. fit() returns the estimator itself, so
# players_train_fit and lm refer to the same fitted model.
lm = LinearRegression()
players_train_fit = lm.fit(X_train, y_train)
print(players_train_fit.coef_)  # slopes, in the order of predictor_columns
print(players_train_fit.intercept_)

# RMSE: root mean squared error on the data the model was fitted to.
# mean_squared_error takes (y_true, y_pred) in that order.
train_predictions = players_train_fit.predict(X_train)
lm_rmse = mean_squared_error(y_train, train_predictions) ** 0.5
print(lm_rmse)

# RMSPE: the same error measure, evaluated on the held-out test split.
test_predictions = players_train_fit.predict(X_test)
lm_rmspe = mean_squared_error(y_test, test_predictions) ** 0.5
print(lm_rmspe)


[-0.08066138 -0.14859094]
6.937005449675615


$\text{played hours} = 6.937 - 0.08066 \cdot \text{age} - 0.14859 \cdot \text{experience}$

np.float64(25.137087062308986)

np.float64(34.45033724343876)