In [None]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Switch Altair to the "vegafusion" data transformer
# (intent: simplify working with large datasets in Altair).
alt.data_transformers.enable('vegafusion')

# Ask scikit-learn transformers to output pandas DataFrames instead of NumPy arrays.
set_config(transform_output="pandas")

In [None]:
# Direct-download link to the players CSV hosted on Google Drive.
url = (
    "https://drive.google.com/uc"
    "?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz"
)

# Read the raw table; the bare name on the last line displays it.
players = pd.read_csv(url)
players

In [None]:
# Keep only the columns used in this analysis: the two candidate predictors
# (age, experience) and the response (played_hours).
analysis_columns = ["age", "played_hours", "experience"]
players_filtered = players.loc[:, analysis_columns]
players_filtered

In [None]:
# Encode the experience label as an integer rank (1 = lowest) so it can be used
# as a numeric predictor. Labels missing from the mapping become NaN.
# NOTE(review): this ranks "Amateur" below "Beginner" — confirm that is the
# intended ordering of the levels.
experience_levels = ["Amateur", "Beginner", "Regular", "Pro", "Veteran"]
experience_map = {
    level: rank for rank, level in enumerate(experience_levels, start=1)
}

players_filtered = players_filtered.assign(
    experience_num=lambda frame: frame["experience"].map(experience_map)
)

players_filtered

In [None]:
# Bin edges (in hours) and their display labels, defined once so the binning
# and the legend ordering below cannot drift apart.
# The last edge is open-ended (np.inf) so the top group really means
# "more than 200 hours" rather than stopping at a hard-coded maximum.
playtime_bins = [0, 1, 5, 10, 15, 20, 50, 100, 200, np.inf]
playtime_labels = ["<1", "1–5", "5–10", "10–15", "15-20", "20–50", "50–100", "100-200", "200>"]

players_filtered = players_filtered.assign(
    playtime_grouped=pd.cut(
        players_filtered["played_hours"],
        bins=playtime_bins,
        labels=playtime_labels,
        # pd.cut bins are right-closed by default, i.e. (0, 1], (1, 5], ...
        # Without include_lowest=True a player with exactly 0 hours would fall
        # outside every bin and be assigned NaN instead of "<1".
        include_lowest=True,
    )
)

# Scatter of age vs. experience rank, coloured by the grouped playtime.
players_plot4 = (
    alt.Chart(
        players_filtered,
        title="Playtime in Hours: Player age and Experience Level",
    )
    .mark_circle(size=40)
    .encode(
        x=alt.X("age:Q").title("Age (years)"),
        y=alt.Y(
            "experience_num:O",
            title="Experience (numbered low to high)",
            scale=alt.Scale(reverse=True),
        ),
        # Sort the legend by bin order rather than alphabetically.
        color=alt.Color("playtime_grouped:N", sort=playtime_labels)
        .scale(scheme="paired")
        .title("Playtime (hours, grouped)"),
    )
    .properties(width=850, height=200)
)
players_plot4

In [None]:
# Hold out 30% of the rows for testing (70/30 train/test split).
# The fixed random_state makes the partition reproducible across runs.
training_data, testing_data = train_test_split(
    players_filtered,
    random_state=2000,
    test_size=0.3,
)

In [None]:
# Predictors fed to the model; both are standardized so that neither dominates
# the distance computation used by K-nearest neighbors.
predictor_columns = ["age", "experience_num"]

# Standardize the predictors and drop every other column passed in.
players_preprocessor = make_column_transformer(
    (StandardScaler(), predictor_columns),
    remainder="drop",
)

# Preprocessing followed by a K-NN regressor (n_neighbors is tuned later).
players_pipeline = make_pipeline(
    players_preprocessor,
    KNeighborsRegressor(),
)


In [None]:
# Tune the number of neighbors with 5-fold cross-validation.
# Candidates are 1, 4, 7, ..., 73 (range(1, 76, 3) stops before 76).
param_grid = {"kneighborsregressor__n_neighbors": range(1, 76, 3)}

# Scoring is *negated* RMSE, so larger (closer to zero) is better.
players_gridsearch = GridSearchCV(
    players_pipeline,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    return_train_score=False,
)

# Run the search on the training split only; the test split stays untouched.
players_gridsearch.fit(
    X=training_data[["age", "experience_num"]],
    y=training_data["played_hours"],
)

In [None]:
# Summarize the grid search: one row per candidate n_neighbors with the mean
# cross-validated RMSE and its standard error.
cv_results = (
    pd.DataFrame(players_gridsearch.cv_results_)
    .assign(
        # Standard error = std across folds / sqrt(number of folds). The fold
        # count is read from the fitted search so it stays in sync with `cv`.
        sem_test_score=lambda df: df["std_test_score"]
        / (players_gridsearch.n_splits_ ** 0.5),
        # Scores are negated RMSE; flip the sign to report plain RMSE.
        mean_test_score=lambda df: -df["mean_test_score"],
    )
    .loc[
        :,
        [
            "param_kneighborsregressor__n_neighbors",
            "mean_test_score",
            "sem_test_score",
        ],
    ]
    .rename(columns={"param_kneighborsregressor__n_neighbors": "n_neighbors"})
)

cv_results

In [None]:
# Parameter setting (n_neighbors) with the best mean cross-validated score;
# because scoring is negated RMSE, this is the candidate with the lowest RMSE.
players_gridsearch.best_params_


In [None]:
# Predict played_hours for the held-out test rows with the refit best model.
# .assign returns a new frame instead of writing a column into the object
# returned by train_test_split, which can raise SettingWithCopyWarning.
testing_data = testing_data.assign(
    predicted=players_gridsearch.predict(
        testing_data[["age", "experience_num"]]
    )
)

# Root mean squared prediction error on the test set (same units as
# played_hours): square root of the mean squared error.
RMSPE = mean_squared_error(
    y_true=testing_data["played_hours"],
    y_pred=testing_data["predicted"],
) ** 0.5

RMSPE
