In [133]:
from collections import defaultdict
import re
import pandas as pd
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder

In [134]:
# Optional: uncomment the next line to widen the DataFrame display.
#pd.set_option('display.max_columns', 50)
# Lift Altair's default row limit so larger DataFrames can be charted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

Load the data:

In [209]:
# pandas reads the zipped CSV directly.
# NOTE(review): the preview below shows an `Unnamed: 0` column, which looks like
# a saved row index -- consider `index_col=0` if it should not become a feature.
df = pd.read_csv('../data/train_data.zip')

In [210]:
# Preview the first rows of the raw data to sanity-check the load.
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


Apply basic preprocessing:

In [233]:
# This implementation drops columns with survey answers

def preprocess_biba(input_data):
    """
    Given the original dataframe, process the columns related to
    Biba Playground Games.

    The Biba columns are the two inclusive label ranges
    `monthly_number_of_sessions`..`distance_to_nearest_bus_stop` and
    `historic_number_of_sessions`..`historic_snow`. Within them, text
    (object / string dtype) columns, a fixed list of sparsely populated
    `historic_hour_*` columns and `MonthYear` are dropped; every remaining
    NaN is replaced with 0. Columns outside those ranges are left untouched.

    Parameters
    ----------
    input_data: pandas.core.frame.DataFrame
        Raw data. It is not modified.

    Returns
    -------
    output_data: pandas.core.frame.DataFrame
        The non-Biba columns (original order) followed by the cleaned
        Biba columns.

    Raises
    ------
    KeyError
        If a range endpoint, one of the listed `historic_hour_*` columns
        or `MonthYear` is missing from the selected ranges.
    """
    # Numerical features with a high proportion of NaN values, plus `MonthYear`
    sparse_columns = ['historic_hour_0', 'historic_hour_23', 'historic_hour_22',
                      'historic_hour_21', 'historic_hour_7', 'historic_hour_6',
                      'historic_hour_5', 'historic_hour_4', 'historic_hour_3',
                      'historic_hour_2', 'historic_hour_1', 'MonthYear']

    # Label-based slices include both endpoints
    monthly_part = input_data.loc[:, 'monthly_number_of_sessions':'distance_to_nearest_bus_stop']
    historic_part = input_data.loc[:, 'historic_number_of_sessions':'historic_snow']
    biba_df = pd.concat([monthly_part, historic_part], axis=1)

    # Text columns: object dtype as well as pandas' dedicated string dtype
    text_columns = [name for name, dtype in biba_df.dtypes.items()
                    if pd.api.types.is_string_dtype(dtype)]

    # De-duplicate while keeping order (`MonthYear` may appear in both lists)
    to_drop = list(dict.fromkeys(text_columns + sparse_columns))

    # Drop said columns and impute any remaining NaN values with 0
    biba_df = biba_df.drop(columns=to_drop).fillna(0)

    # Remove the old, unprocessed columns from the input data. `columns=` is
    # required: a bare positional list is interpreted as row labels.
    old_columns = monthly_part.columns.to_list() + historic_part.columns.to_list()
    remaining = input_data.drop(columns=old_columns)

    # Add preprocessed columns back
    output_data = pd.concat([remaining, biba_df], axis=1)

    return output_data

In [237]:
def preprocess_weather(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to weather information (`Democrats_08_Votes` to
    the end + `climate`).

    - NaN in `Number_of_holidays` is imputed with the largest value
      observed for the same (`month`, `year`). A (month, year) with no
      observed value falls through to the final fill with 0 instead of
      borrowing the value of a different month.
    - NaN in `Green_2016` is imputed with values found online for Alaska
      and North Carolina, and with 0 for every other state. Values that
      are already present are kept.
    - NaN in `climate` is imputed with the most frequent climate; the
      column is left as is when it has no observed value at all.
    - Every remaining NaN from `Democrats_08_Votes` to the last column
      is replaced with 0.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
        Must contain `state`, `climate`, `month`, `year`,
        `Number_of_holidays` and `Green_2016`. It is not modified.

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        Copy of the input with the same rows, row order and columns.
    """
    # Work on a copy so that neither the caller's frame nor a slice of it
    # is written to
    output_data = input_data.copy()
    weather_columns = output_data.loc[:, 'Democrats_08_Votes':].columns

    # Fill up NaNs for the `Number_of_holidays` column.
    # Sorting ascending and forward-filling picks the largest value of each
    # (month, year); a grouped max does the same without reordering rows and
    # without carrying values over from one period into the next.
    holidays_in_period = (output_data
                          .groupby(['month', 'year'])['Number_of_holidays']
                          .transform('max'))
    output_data['Number_of_holidays'] = (output_data['Number_of_holidays']
                                         .fillna(holidays_in_period))

    # Fill up NaNs for the `Green_2016` column.
    # Values were only found for Alaska and North Carolina, so the other
    # states get 0.
    green_votes_by_state = {'Alaska': 5735, 'North Carolina': 12105}
    green_fill = output_data['state'].map(green_votes_by_state).fillna(0)
    output_data['Green_2016'] = output_data['Green_2016'].fillna(green_fill)

    # Most frequent climate; `mode()` is empty when every value is missing
    climate_mode = output_data['climate'].mode()
    if not climate_mode.empty:
        output_data['climate'] = output_data['climate'].fillna(climate_mode.iloc[0])

    # Substitute every remaining NaN in the weather columns by 0
    output_data[weather_columns] = output_data[weather_columns].fillna(0)

    return output_data

In [232]:
def preprocess_neighbour(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to locale information (`city` to
    `houses_per_sq_km`). Drop columns with >30%
    NaN values and replace remaining NaN values with 0.

    `country`, `city` and `county` are always dropped. `climate` is
    excluded from this step entirely: it is neither dropped nor imputed
    here. Columns outside the `city`..`houses_per_sq_km` range are left
    untouched.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
        It is not modified.

    Returns
    -------
    output_data : pandas.core.frame.DataFrame

    Raises
    ------
    KeyError
        If `city`, `county`, `country` or `houses_per_sq_km` is missing.
    """
    # `climate` may sit inside the locale range but is not handled here
    df_neighbour = (input_data
                    .loc[:, 'city':'houses_per_sq_km']
                    .drop(columns=['climate'], errors='ignore'))

    # Proportion of missing values for each column, relative to the frame
    # that was passed in
    prop_missing = df_neighbour.isna().mean()

    # Create a list of columns with >30% of values missing
    to_drop = prop_missing[prop_missing > 0.3].index.to_list()

    # Add `country` to the list since all playgrounds are in the U.S.
    # Add `city` and `county` since lat. and long. should take care of them
    for column in ('country', 'city', 'county'):
        if column not in to_drop:
            to_drop.append(column)

    # Drop columns with names in list
    output_data = input_data.drop(columns=to_drop)

    # Fill in remaining NaN values in locale-related columns with 0,
    # skipping anything that has just been dropped
    partly_missing = prop_missing[(0 < prop_missing) & (prop_missing <= 0.3)].index
    to_impute = [column for column in partly_missing if column not in to_drop]

    if to_impute:
        output_data[to_impute] = output_data[to_impute].fillna(0)

    return output_data

In [238]:
# Run the three preprocessing stages in order, starting from the raw frame.
# `preprocess_biba` is the function defined in this notebook; `biba_pp` is not
# defined anywhere in it and only resolved through leftover kernel state.
data = preprocess_biba(df)
data = preprocess_weather(data)
data = preprocess_neighbour(data)

In [None]:
# Preview the first rows of the preprocessed data.
data.head()

Create `X` and `y`:

In [11]:
# Build the features and the target from the preprocessed frame (`data`), not
# from the raw `df`, so that the preprocessing actually reaches the model.
X = data.drop(columns='unacast_session_count')
y = data['unacast_session_count']

Split the data into training and validation sets:

In [14]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
)

Fit a gradient boosting regressor:

Report training and validation errors:

Plot feature importance:

References:
- https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html