https://github.com/sherisezhang/project_individual.git

# Load package and import data

In [1]:
import pandas as pd
import altair as alt

players = pd.read_csv("players.csv")
players

Unnamed: 0,experience,subscribe,hashedEmail,played_hours,name,gender,age,individualId,organizationName
0,Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6...,30.3,Morgan,Male,9,,
1,Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa9397...,3.8,Christian,Male,17,,
2,Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3...,0.0,Blake,Male,17,,
3,Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4f...,0.7,Flora,Female,21,,
4,Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb...,0.1,Kylie,Male,21,,
...,...,...,...,...,...,...,...,...,...
191,Amateur,True,b6e9e593b9ec51c5e335457341c324c34a2239531e1890...,0.0,Bailey,Female,17,,
192,Veteran,False,71453e425f07d10da4fa2b349c83e73ccdf0fb3312f778...,0.3,Pascal,Male,22,,
193,Amateur,False,d572f391d452b76ea2d7e5e53a3d38bfd7499c7399db29...,0.0,Dylan,Prefer not to say,17,,
194,Amateur,False,f19e136ddde68f365afc860c725ccff54307dedd13968e...,2.3,Harlow,Male,17,,


In [2]:
players[['experience']].drop_duplicates()

Unnamed: 0,experience
0,Pro
1,Veteran
3,Amateur
4,Regular
12,Beginner


In [3]:
players[['played_hours', 'age']].agg(['max', 'min', 'mean'])

Unnamed: 0,played_hours,age
max,223.1,99.0
min,0.0,8.0
mean,5.845918,21.280612


# Data Description

## I will use the "players.csv" dataset (196 rows,9 columns)
### Nine variables are:
+ 'experience'(categorical): player experience level, one of {Pro, Veteran, Amateur, Regular, Beginner}.
+ 'subscribe'(boolean): whether or not the player has subscribed the game newsletter, and this will be the response variable.
+ 'hashedEmail'(identifier): this records the hashing encoded email addresses of players.
+ 'played_hours'(numeric): total number of hours the player has spent in the game.
+ 'name'(identifier): player name. Not useful for modelling.
+ 'gender'(categorical): gender of the player.
+ 'age'(numeric): age of the player in years.

In [4]:
players['gender'].value_counts()

gender
Male                 124
Female                37
Non-binary            15
Prefer not to say     11
Two-Spirited           6
Agender                2
Other                  1
Name: count, dtype: int64

# Potential issues
1. The 'hashedEmail' column contains anonymized identifiers, which can't be interpreted as a meaningful predictor, so it will be removed.
2. Class imbalance is present in the target variable('subscribe': 144 True, 52 False).
3. From the table above, the quantities of 'Agender', 'Other', and 'Two-Spirited' are very small compared to 'Male' and 'Female'. To avoid instability when fitting a KNN model, maybe we can merge them into a single level called 'others'.
4. If we want to used experience as a predictor, it is not numerical, we need to process it first, one-hot encoding is an option.

In [7]:
players = players.drop(columns = ['individual', 'organizationName', 'name', 'hashedEmail'], errors='ignore')
players

Unnamed: 0,experience,subscribe,played_hours,gender,age,individualId
0,Pro,True,30.3,Male,9,
1,Veteran,True,3.8,Male,17,
2,Veteran,False,0.0,Male,17,
3,Amateur,True,0.7,Female,21,
4,Regular,True,0.1,Male,21,
...,...,...,...,...,...,...
191,Amateur,True,0.0,Female,17,
192,Veteran,False,0.3,Male,22,
193,Amateur,False,0.0,Other,17,
194,Amateur,False,2.3,Male,17,


In [8]:
players['gender'] = players['gender'].replace({
    'Agender',
    'Two-Spirited',
    'Prefer not to say',
    'Other'}, 'Other')
players

Unnamed: 0,experience,subscribe,played_hours,gender,age,individualId
0,Pro,True,30.3,Male,9,
1,Veteran,True,3.8,Male,17,
2,Veteran,False,0.0,Male,17,
3,Amateur,True,0.7,Female,21,
4,Regular,True,0.1,Male,21,
...,...,...,...,...,...,...
191,Amateur,True,0.0,Female,17,
192,Veteran,False,0.3,Male,22,
193,Amateur,False,0.0,Other,17,
194,Amateur,False,2.3,Male,17,
