In [1]:
### Run this cell before continuing.
import altair as alt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Enable the VegaFusion data transformer so Altair can work with larger datasets
alt.data_transformers.enable('vegafusion')

# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays
set_config(transform_output="pandas")

In [2]:
# Direct-download link to the players CSV; read it straight into a DataFrame.
url = "https://drive.google.com/uc?export=download&id=1Mw9vW0hjTJwRWx0bDXiSpYsO3gKogaPz"
players = pd.read_csv(filepath_or_buffer=url)
players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [3]:
# Keep only the columns used in the analysis: age, played_hours and experience.
players_filtered = players.loc[:, ["age", "played_hours", "experience"]]
players_filtered

Unnamed: 0,age,played_hours,experience
0,9,30.3,Pro
1,17,3.8,Veteran
2,17,0.0,Veteran
3,21,0.7,Amateur
4,21,0.1,Regular
...,...,...,...
191,17,0.0,Amateur
192,22,0.3,Veteran
193,17,0.0,Amateur
194,17,2.3,Amateur


In [4]:
# Encode each experience label as an integer 1-5, in the order listed here.
# NOTE(review): this ranks "Beginner" above "Amateur" and "Veteran" above "Pro";
# confirm that this is the intended low-to-high ordering.
experience_map = dict(
    zip(["Amateur", "Beginner", "Regular", "Pro", "Veteran"], range(1, 6))
)

# Add the numeric encoding as a new column; labels missing from the map become NaN.
players_filtered = players_filtered.assign(
    experience_num=lambda df: df["experience"].map(experience_map)
)

players_filtered

Unnamed: 0,age,played_hours,experience,experience_num
0,9,30.3,Pro,4
1,17,3.8,Veteran,5
2,17,0.0,Veteran,5
3,21,0.7,Amateur,1
4,21,0.1,Regular,3
...,...,...,...,...
191,17,0.0,Amateur,1
192,22,0.3,Veteran,5
193,17,0.0,Amateur,1
194,17,2.3,Amateur,1


In [5]:
# Bin edges (in hours) and the matching legend labels for grouped playtime.
# The top edge is open-ended so the largest playtime always falls in "200>"
# instead of depending on a hard-coded dataset maximum.
playtime_bins = [0, 1, 5, 10, 15, 20, 50, 100, 200, np.inf]
playtime_labels = ["<1", "1–5", "5–10", "10–15", "15-20", "20–50", "50–100", "100-200", "200>"]

players_filtered = players_filtered.assign(
    playtime_grouped=pd.cut(
        players_filtered["played_hours"],
        bins=playtime_bins,
        labels=playtime_labels,
        # pd.cut intervals are right-closed, so without include_lowest the
        # first bin is (0, 1] and every player with exactly 0 hours becomes NaN.
        include_lowest=True,
    )
)

# Age vs. (encoded) experience, coloured by grouped playtime.
players_plot = (
    alt.Chart(players_filtered, title="Playtime in Hours: Player age and Experience Level")
    .mark_circle(size=40)
    .encode(
        x=alt.X("age:Q").title("Age (years)"),
        y=alt.Y(
            "experience_num:O",
            title="Experience (numbered low to high)",
            scale=alt.Scale(reverse=True),
        ),
        # Reuse the label list so the legend order always matches the bins.
        color=alt.Color("playtime_grouped:N", sort=playtime_labels)
        .scale(scheme="paired")
        .title("Playtime (hours, grouped)"),
    )
    .properties(width=850, height=200)
)
players_plot

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# 70/30 split reproducible across runs.
training_data, testing_data = train_test_split(
    players_filtered,
    random_state=2000,
    test_size=0.3,
)

In [7]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the two predictors; every other column is dropped before modelling.
players_preprocessor = make_column_transformer(
    (StandardScaler(), ["age", "experience_num"]),
    remainder="drop",
)

# Preprocessing followed by a K-nearest-neighbours regressor. The number of
# neighbours is left at its default here.
players_pipeline = make_pipeline(players_preprocessor, KNeighborsRegressor())


In [8]:
# Grid search over K = 1, 4, 7, ..., 73 with 5-fold cross-validation.
# scikit-learn maximizes scores, hence the negated-RMSE scorer.
param_grid = {"kneighborsregressor__n_neighbors": range(1, 76, 3)}

players_gridsearch = GridSearchCV(
    players_pipeline,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    return_train_score=False,
)
print(players_gridsearch)

# Run the search on the training rows only (predictors, then response).
players_gridsearch.fit(
    training_data[["age", "experience_num"]],
    training_data["played_hours"],
)
print(players_gridsearch)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('standardscaler',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'experience_num'])])),
                                       ('kneighborsregressor',
                                        KNeighborsRegressor())]),
             param_grid={'kneighborsregressor__n_neighbors': range(1, 76, 3)},
             scoring='neg_root_mean_squared_error')
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('standardscaler',
                                                                         StandardScaler(),
                                      

In [9]:
# Summarize the grid search: one row per candidate K with the mean
# cross-validated RMSE and its standard error.
cv_results = (
    pd.DataFrame(players_gridsearch.cv_results_)
    .assign(
        # Standard error of the mean = std across folds / sqrt(number of folds).
        # The fold count is read from the fitted search so it stays correct
        # if the `cv` setting is ever changed.
        sem_test_score=lambda df: df["std_test_score"] / players_gridsearch.n_splits_ ** 0.5,
        # The scorer is negated RMSE; flip the sign to report a positive RMSE.
        mean_test_score=lambda df: -df["mean_test_score"],
    )
    .rename(columns={"param_kneighborsregressor__n_neighbors": "n_neighbors"})
    .loc[:, ["n_neighbors", "mean_test_score", "sem_test_score"]]
)

cv_results

Unnamed: 0,n_neighbors,mean_test_score,sem_test_score
0,1,28.452765,6.107046
1,4,27.058662,5.228597
2,7,22.474854,4.61685
3,10,23.987155,4.516612
4,13,23.67393,4.750189
5,16,22.51523,5.272415
6,19,22.360799,5.416053
7,22,22.070326,5.623141
8,25,21.705234,5.802995
9,28,21.739077,5.856564


In [10]:
# Show the parameter setting that the grid search selected as best.
players_gridsearch.best_params_

{'kneighborsregressor__n_neighbors': 73}

In [11]:
# Predict on the held-out test rows with the refit best KNN pipeline.
# `.assign` builds a new frame instead of writing into the object returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning; re-running
# this cell simply overwrites the `predicted` column.
testing_data = testing_data.assign(
    predicted=players_gridsearch.predict(testing_data[["age", "experience_num"]])
)

# RMSPE: square root of the mean squared error on the test set.
RMSPE = mean_squared_error(
    y_true=testing_data["played_hours"],
    y_pred=testing_data["predicted"],
) ** 0.5

RMSPE

np.float64(34.4122421618046)

In [12]:
# Linear regression of played_hours on both predictors (age and experience_num).
X_train, X_test = (
    split[["age", "experience_num"]] for split in (training_data, testing_data)
)
y_train, y_test = (split["played_hours"] for split in (training_data, testing_data))

lm = LinearRegression()
players_train_fit = lm.fit(X_train, y_train)

# Slopes (in the column order of X_train) followed by the intercept.
print(players_train_fit.coef_, players_train_fit.intercept_, sep="\n")

[-0.08066138 -0.14859094]
6.937005449675615


In [13]:
# Training-set RMSE of the fitted linear model.
train_predictions = players_train_fit.predict(X_train)
lm_rmse = mean_squared_error(y_true=y_train, y_pred=train_predictions) ** 0.5
print(lm_rmse)

25.137087062308986


In [14]:
# Test-set RMSPE of the fitted linear model.
test_predictions = players_train_fit.predict(X_test)
lm_rmspe = mean_squared_error(y_true=y_test, y_pred=test_predictions) ** 0.5
print(lm_rmspe)

34.45033724343876


The RMSPE for the linear regression (≈34.45) is slightly higher than that of the KNN regression (≈34.41), so the KNN model performs marginally better on the test set, although the difference is very small.

**Option A: Doing a multivariable linear regression**

In [15]:
from sklearn import linear_model

# Fit on the full (unsplit) data set: predictors age and experience_num,
# response played_hours.
mvr_X, mvr_y = (
    players_filtered[["age", "experience_num"]],
    players_filtered["played_hours"],
)

mvr_reg = linear_model.LinearRegression()
mvr_reg.fit(mvr_X, mvr_y)

In [16]:
# Print the fitted slope coefficients of the multivariable regression, one per
# predictor column in the order the model was fitted with.
print(mvr_reg.coef_)

[-0.14368895 -0.81510904]


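The fitted coefficients (the change in predicted `played_hours` for a one-unit increase in that predictor, holding the other fixed) are:
* "age": -0.14368895
* "experience_num": -0.81510904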

In [None]:
# The plotting submodule is `matplotlib.pyplot`; the previous spelling
# (`plyplot`) raised ModuleNotFoundError.
import matplotlib.pyplot as plt


In [83]:
# Compare the two predictors by fitting a single-predictor model for each and
# looking at the resulting errors.
# Model 1: keep only "age".
X_train, X_test = (split[["age"]] for split in (training_data, testing_data))
y_train, y_test = (split["played_hours"] for split in (training_data, testing_data))

lm = LinearRegression()
players_train_fit = lm.fit(X_train, y_train)

# Slope for age, then the intercept.
print(players_train_fit.coef_, players_train_fit.intercept_, sep="\n")

[-0.08189088]
6.568297635458251


In [84]:
# Training-set RMSE for the age-only model.
train_predictions = players_train_fit.predict(X_train)
lm_rmse = mean_squared_error(y_true=y_train, y_pred=train_predictions) ** 0.5
print(lm_rmse)

25.138164584672175


In [85]:
# Test-set RMSPE for the age-only model.
test_predictions = players_train_fit.predict(X_test)
lm_rmspe = mean_squared_error(y_true=y_test, y_pred=test_predictions) ** 0.5
print(lm_rmspe)

34.47298465952793


In [86]:
# Model 2: keep only "experience_num".
X_train, X_test = (
    split[["experience_num"]] for split in (training_data, testing_data)
)
y_train, y_test = (split["played_hours"] for split in (training_data, testing_data))

lm = LinearRegression()
players_train_fit = lm.fit(X_train, y_train)

# Slope for experience_num, then the intercept.
print(players_train_fit.coef_, players_train_fit.intercept_, sep="\n")

[-0.16997097]
5.28225862367828


In [87]:
# Training-set RMSE for the experience_num-only model.
train_predictions = players_train_fit.predict(X_train)
lm_rmse = mean_squared_error(y_true=y_train, y_pred=train_predictions) ** 0.5
print(lm_rmse)

25.14725665296037


In [88]:
# Test-set RMSPE for the experience_num-only model.
test_predictions = players_train_fit.predict(X_test)
lm_rmspe = mean_squared_error(y_true=y_test, y_pred=test_predictions) ** 0.5
print(lm_rmspe)

34.5072356223316


In [89]:
# Bin edges (in hours) and the matching legend labels for grouped playtime.
# The top edge is open-ended so the largest playtime always falls in "200>"
# instead of depending on a hard-coded dataset maximum.
playtime_bins = [0, 1, 5, 10, 15, 20, 50, 100, 200, np.inf]
playtime_labels = ["<1", "1–5", "5–10", "10–15", "15-20", "20–50", "50–100", "100-200", "200>"]

players_filtered = players_filtered.assign(
    playtime_grouped=pd.cut(
        players_filtered["played_hours"],
        bins=playtime_bins,
        labels=playtime_labels,
        # pd.cut intervals are right-closed, so without include_lowest the
        # first bin is (0, 1] and every player with exactly 0 hours becomes NaN.
        include_lowest=True,
    )
)

# Age vs. (encoded) experience, coloured by grouped playtime.
players_plot = (
    alt.Chart(players_filtered, title="Playtime in Hours: Player age and Experience Level")
    .mark_circle(size=40)
    .encode(
        x=alt.X("age:Q").title("Age (years)"),
        y=alt.Y(
            "experience_num:O",
            title="Experience (numbered low to high)",
            scale=alt.Scale(reverse=True),
        ),
        # Reuse the label list so the legend order always matches the bins.
        color=alt.Color("playtime_grouped:N", sort=playtime_labels)
        .scale(scheme="paired")
        .title("Playtime (hours, grouped)"),
    )
    .properties(width=850, height=200)
)
players_plot

In [90]:
# Attach the current model's training-set predictions to the training rows.
# `lm` and `X_train` are whatever the most recently run regression cell left
# behind, so this cell has to be run right after that cell.
players_preds = training_data.assign(predictions=lm.predict(X_train))

# Legend order for the grouped-playtime categories.
playtime_order = ["<1", "1–5", "5–10", "10–15", "15-20", "20–50", "50–100", "100-200", "200>"]

players_scatter = (
    alt.Chart(players_preds)
    .mark_circle(size=40)
    .encode(
        x=alt.X("age:Q").title("Age (years)"),
        y=alt.Y(
            "experience_num:O",
            title="Experience (numbered low to high)",
            scale=alt.Scale(reverse=True),
        ),
        # Colour by the binned playtime so the encoding matches the legend
        # title; colouring by raw played_hours as a nominal field produced one
        # legend entry per distinct value.
        color=alt.Color("playtime_grouped:N", sort=playtime_order)
        .scale(scheme="paired")
        .title("Playtime (hours, grouped)"),
    )
    .properties(width=850, height=200)
)

# NOTE(review): this line layer joins the raw (age, experience_num) points and
# does not use the `predictions` column, so it is not a regression line.
# Confirm what it is meant to show.
players_plot2 = players_scatter + alt.Chart(players_preds).mark_line(color="black").encode(
    x="age",
    y="experience_num"
)
players_plot2

In [91]:
# Observed playtime against age for the training rows. played_hours is a
# continuous quantity, so it is encoded as quantitative (":Q") on a normal,
# non-reversed axis rather than as one nominal category per distinct value.
players_scatter2 = (
    alt.Chart(players_preds)
    .mark_circle(size=40)
    .encode(
        x=alt.X("age:Q").title("Age (years)"),
        y=alt.Y("played_hours:Q").title("Playtime (hours)"),
    )
    .properties(width=850, height=800)
)

# Simple linear regression of playtime on age alone, fitted on the same
# training rows, so the overlay is a genuine fitted line on these axes.
age_only_fit = LinearRegression().fit(
    players_preds[["age"]], players_preds["played_hours"]
)

# One predicted value per distinct age, sorted so the line is drawn left to right.
age_line = (
    players_preds[["age"]]
    .drop_duplicates()
    .sort_values("age")
    .assign(predictions=lambda df: age_only_fit.predict(df[["age"]]))
)

players_plot3 = players_scatter2 + alt.Chart(age_line).mark_line(color="black").encode(
    x="age:Q",
    y="predictions:Q",
)
players_plot3
