In [39]:
### Run this cell before continuing.

import pandas as pd
import altair as alt

In [45]:
## Clean the dataset, keeping only the predictor and response variables.

# Import raw player data into python from the CSV file
player_data_unclean = pd.read_csv('data/players.csv')

# Drop unnecessary columns (returns a new frame; the raw frame is left untouched)
player_data = player_data_unclean.drop(
    columns=['name', 'hashedEmail', 'individualId', 'organizationName']
)

# Assign all genders other than 'Male' or 'Female' to the 'Other' category.
# Missing gender values also fail the isin() check, so they become 'Other' too.
player_data.loc[~player_data['gender'].isin(['Male', 'Female']), 'gender'] = 'Other'

## Encode string and boolean predictor variables into numerical variables

# Map experience levels to integers using ordinal encoding
# NOTE(review): this encoding ranks 'Amateur' below 'Beginner' -- confirm that
# this matches the intended skill ordering before using it as an ordinal predictor.
experience_map = {
    'Amateur': 0,
    'Beginner': 1,
    'Regular': 2,
    'Veteran': 3,
    'Pro': 4
}

# Series.map() silently converts any label that is not a key of the dict into NaN
# (which would also turn the column into floats), so fail loudly instead.
unmapped_experience = ~player_data['experience'].isin(list(experience_map))
if unmapped_experience.any():
    raise ValueError(
        "Unrecognised experience level(s): "
        f"{player_data.loc[unmapped_experience, 'experience'].unique().tolist()}"
    )
player_data['experience_int'] = player_data['experience'].map(experience_map)

# Convert boolean subscription status to integers
player_data['subscribe_int'] = player_data['subscribe'].astype(int)

# One hot encode the gender variable to create numerical dummy variables.
# drop_first=True drops the first category so the dummies are not redundant, and
# dtype=int stores every generated dummy column as integers straight away, without
# relying on hard-coded column names such as 'gender_Male' / 'gender_Other'.
player_data = pd.get_dummies(player_data, columns=['gender'], drop_first=True, dtype=int)

# Drop the original string columns that have already been encoded
player_data = player_data.drop(columns=['experience', 'subscribe'])

player_data

Unnamed: 0,played_hours,age,experience_int,subscribe_int,gender_Male,gender_Other
0,30.3,9,4,1,1,0
1,3.8,17,3,1,1,0
2,0.0,17,3,0,1,0
3,0.7,21,0,1,0,0
4,0.1,21,2,1,1,0
...,...,...,...,...,...,...
191,0.0,17,0,1,0,0
192,0.3,22,3,0,1,0
193,0.0,17,0,0,0,1
194,2.3,17,0,0,1,0
