In [172]:
from collections import defaultdict
import re
import time
import pandas as pd
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [12]:
#pd.set_option('display.max_columns', 50)
# Lift Altair's cap on the number of rows a chart dataset may contain,
# so that charts built from the full training frame do not raise.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

Load the data:

In [82]:
# Read the raw training data; pandas infers the zip compression from the file extension.
df = pd.read_csv('../data/train_data.zip')

In [83]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


Apply basic preprocessing:

In [84]:
def dict_to_columns_df(col, key, val):
    """
    Expand a column of stringified lists of dictionaries into one new
    column per distinct `key` value.

    Each non-missing cell of `col` is expected to be a string such as
    "[{'question': 'safety', 'avg_answer': 4.0}, {'question': 'variety', 'avg_answer': 2.5}]".
    With key='question' and val='avg_answer' this yields the columns
    `monthly_safety` and `monthly_variety`.

    Parameters
    ----------------
    col : pandas Series whose values are strings holding a list of
        dictionaries. Cells that are not strings (NaN / None) are treated
        as missing.

    key : the key of the inner dictionaries whose values become the new
        column names (prefixed with "monthly_")

    val : the key of the inner dictionaries whose values become the cell
        values of the new columns

    Returns
    ----------------
    DataFrame
        One row per row of `col`, carrying the same index as `col`, and one
        column per distinct name found. Rows in which a name is absent hold
        None/NaN. When no name is found at all, the frame has no columns
        (the previous implementation printed a message and returned None).
        Columns appear in order of first appearance in `col`.
    """
    key_field = str(key)
    val_field = str(val)

    # Parse every cell exactly once, iterating over values rather than labels
    # so the function also works when `col` does not have a 0..n-1 index.
    # NOTE(security): eval() executes whatever expression the cell contains.
    # That is only acceptable for trusted data files; prefer
    # ast.literal_eval once the cells are confirmed to be plain literals.
    parsed_rows = []
    for cell in col:
        if isinstance(cell, str):
            parsed_rows.append(eval(cell))  # converting cell value from string to list
        else:
            parsed_rows.append(None)

    # Map new column name -> raw name, in order of first appearance, so that
    # the column order is reproducible between runs (a set is not).
    raw_name_by_column = {}
    for records in parsed_rows:
        for record in records or ():
            raw_name = record[key_field]
            # Keep only names starting with an ASCII letter; the original
            # code used this to leave out the Spanish-language names.
            # Matching on the whole string also tolerates an empty name.
            if re.match('[a-zA-Z]', raw_name):
                raw_name_by_column.setdefault("monthly_" + raw_name, raw_name)

    columns = {column: [] for column in raw_name_by_column}
    for records in parsed_rows:
        # The first dictionary carrying a given name wins, as before.
        first_record = {}
        for record in records or ():
            first_record.setdefault(record[key_field], record)

        for column, raw_name in raw_name_by_column.items():
            record = first_record.get(raw_name)
            columns[column].append(None if record is None else record[val_field])

    # Reuse the input index so that a later pd.concat(axis=1) lines the new
    # columns up with the rows they were derived from.
    return pd.DataFrame(columns, index=col.index)

In [85]:
def preprocess_biba(full_data):
    """
    Performs the pre-processing of the columns for the biba data.

    The biba columns are the two label ranges
    'monthly_number_of_sessions'..'distance_to_nearest_bus_stop' and
    'historic_number_of_sessions'..'historic_snow'. The survey and weekday
    columns are expanded into one column per question / weekday, the
    remaining object-typed columns and the sparsely filled historic hours
    are dropped, and all remaining NaN values are replaced with 0.

    Parameters
    ---------------
    full_data : DataFrame, with no operations done on the biba columns

    Returns
    ---------------
    DataFrame
        `full_data` without the original biba columns, followed by the
        processed biba columns
    """
    monthly_block = full_data.loc[:, 'monthly_number_of_sessions':'distance_to_nearest_bus_stop']
    historic_block = full_data.loc[:, 'historic_number_of_sessions':'historic_snow']
    biba_games_df = pd.concat([monthly_block, historic_block], axis = 1)

    #extracting categorical features
    categorical_features = biba_games_df.loc[:, biba_games_df.dtypes == "object"]

    # creating cols from list of dictionaries
    monthly_survey_df = dict_to_columns_df(categorical_features['monthly_survey'], 'question', 'avg_answer')
    monthly_weekday_counts_df = dict_to_columns_df(categorical_features['monthly_weekday_counts'], 'weekday', 'count')

    biba_games_df = pd.concat([biba_games_df, monthly_survey_df, monthly_weekday_counts_df], axis = 1)

    #dropping categorical features
    biba_games_df = biba_games_df.drop(columns = list(categorical_features.columns))

    #dropping historic hours with low fill rate
    numerical_cols_to_remove = ['historic_hour_0', 'historic_hour_23', 'historic_hour_22', 'historic_hour_21',
                                'historic_hour_7','historic_hour_6','historic_hour_5','historic_hour_4',
                                'historic_hour_3','historic_hour_2','historic_hour_1', 'MonthYear']

    biba_games_df = biba_games_df.drop(columns = numerical_cols_to_remove)

    impute_biba_games_df = biba_games_df.fillna(0)

    #removing the previous columns in the input data
    # The labels come from `full_data` itself; the earlier version read them
    # from the notebook-level `df`, which tied the function to global state.
    cols_to_drop = list(monthly_block.columns) + list(historic_block.columns)

    full_data = full_data.drop(columns = cols_to_drop)

    #adding processed columns
    full_data = pd.concat([full_data, impute_biba_games_df], axis = 1)

    return full_data

In [86]:
# This implementation drops columns with survey answers

def preprocess_biba_no_survey(input_data):
    """
    Given the original dataframe, process the columns related to
    Biba Playground Games, discarding the survey answers.

    The Biba columns are the two label ranges
    'monthly_number_of_sessions'..'distance_to_nearest_bus_stop' and
    'historic_number_of_sessions'..'historic_snow'. Object-typed columns
    (which include the survey columns) and the sparsely filled historic
    hours are dropped; remaining NaN values are replaced with 0.

    Parameters
    ----------
    input_data: pandas.core.frame.DataFrame

    Returns
    -------
    output_data: pandas.core.frame.DataFrame
        `input_data` without the original Biba columns, followed by the
        processed Biba columns. `input_data` itself is not modified.

    """
    # Concatenate relevant columns into a single dataframe
    monthly_block = input_data.loc[:, 'monthly_number_of_sessions':'distance_to_nearest_bus_stop']
    historic_block = input_data.loc[:, 'historic_number_of_sessions':'historic_snow']
    biba_df = pd.concat([monthly_block, historic_block], axis=1)

    # Extract categorical features
    categorical_features = biba_df.loc[:, biba_df.dtypes == "object"]

    # Identify categorical features and numerical features with high prop. of NaN values
    to_drop = categorical_features.columns.to_list()

    to_drop += ['historic_hour_0', 'historic_hour_23', 'historic_hour_22', 'historic_hour_21',
                'historic_hour_7','historic_hour_6','historic_hour_5','historic_hour_4',
                'historic_hour_3','historic_hour_2','historic_hour_1', 'MonthYear']

    # Drop said columns, then impute any remaining NaN values with 0
    biba_df = biba_df.drop(columns=to_drop).fillna(0)

    # Remove the old, unprocessed columns from the input data.
    # `columns=` is required: a bare positional list is interpreted as row
    # labels, which made the previous version raise KeyError.
    old_columns = monthly_block.columns.to_list() + historic_block.columns.to_list()
    remaining = input_data.drop(columns=old_columns)

    # Add preprocessed columns back
    output_data = pd.concat([remaining, biba_df], axis=1)

    return output_data

In [87]:
def preprocess_weather(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to weather information (`Democrats_08_Votes` to
    the end + `climate`). Impute NaN of `Number_of_holidays`
    by using the values that we have for the same month,
    impute NaN of `Green_2016` by using values found online, or 0,
    impute NaN of `climate` with its most frequent value,
    and replace remaining NaN values with 0.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
        Must contain `state`, `climate`, `external_id`, `month`, `year`
        and the columns from `Democrats_08_Votes` onwards. Rows are matched
        back by index label, so the index is assumed to be unique.

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A copy of `input_data` with the columns above imputed.

    Raises
    ------
    ValueError
        If `Number_of_holidays` still contains NaN, or if a (month, year)
        pair of 2018/2019 holds more than one `Number_of_holidays` value.

    """

    # Explicit copy: the helper columns added below must not touch (or warn
    # about) the caller's frame.
    df_weather = input_data.loc[:, 'Democrats_08_Votes':].copy()
    for helper in ['state', 'climate', 'external_id', 'month', 'year']:
        df_weather[helper] = input_data[helper]

    #fill up NaNs for `Number_of_holidays` column
    #Sorting orders the rows by time and puts the NaNs at the end of each
    #(month, year) group, so a forward fill copies the group's value into them.
    #Caveat: a group with no value at all inherits the previous group's value.
    df_weather = df_weather.sort_values(['month', 'year', 'Number_of_holidays'])
    # .ffill() is the supported spelling of the deprecated fillna(method='ffill')
    df_weather['Number_of_holidays'] = df_weather['Number_of_holidays'].ffill()

    #fill up NaNs for the `Green_2016` column
    #Values were only found for Alaska and North Carolina; other states get 0
    #where the value is missing. Note that Alaska and North Carolina rows are
    #set to the constants even when they already held a value.
    df_weather['Green_2016'] = np.where(
        df_weather['state'] == 'Alaska', 5735,
        np.where(
            df_weather['state'] == 'North Carolina', 12105,
            np.where(
                df_weather['Green_2016'].isnull(), 0, df_weather['Green_2016']
            )
        )
    )

    # Most frequent climate; mode() is empty when the column is entirely NaN,
    # in which case the generic fill below applies.
    climate_modes = df_weather['climate'].mode()
    if not climate_modes.empty:
        df_weather['climate'] = df_weather['climate'].fillna(climate_modes.iloc[0])

    #Substitute every remaining NaNs by 0
    df_weather = df_weather.fillna(value=0)

    # The assignments align on index labels, which undoes the sort above.
    output_data = input_data.copy()
    output_data.loc[:, 'Democrats_08_Votes':] = df_weather.loc[:, 'Democrats_08_Votes':]
    output_data['climate'] = df_weather['climate']

    #Tests

    #Check that there are no missing values in the `Number_of_holidays` column
    if output_data['Number_of_holidays'].isnull().any():
        raise ValueError('There should not be NaNs in the Number_of_holidays column')

    #Check that every month of 2018 and 2019 has only one value for the
    #`Number_of_holidays` column
    in_scope = output_data['year'].isin([2018, 2019]) & output_data['month'].isin(list(range(1, 13)))
    values_per_month = output_data[in_scope].groupby(['month', 'year'])['Number_of_holidays'].nunique()
    if (values_per_month > 1).any():
        raise ValueError('Every month should have the same value for Number_of_holidays')

    return output_data

In [88]:
def preprocess_neighbour(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to locale information (`city` to
    `houses_per_sq_km`). Drop columns with >30%
    NaN values and replace remaining NaN values with 0.
    `country`, `city` and `county` are always dropped; `climate`
    is left untouched because it is handled separately.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A new frame; `input_data` is not modified.
    """
    always_dropped = ['country', 'city', 'county']

    # `climate` is excluded from the missing-value analysis. The previous
    # version discarded the result of this drop, so it had no effect.
    df_neighbour = input_data.loc[:, 'city':'houses_per_sq_km'].drop(
        columns=['climate'], errors='ignore')

    # Proportion of missing values for each column, measured against the
    # frame that was passed in (not a notebook-level global).
    prop_missing = df_neighbour.isna().mean()

    # Create a list of columns with >30% of values missing
    to_drop = prop_missing[prop_missing > 0.3].index.to_list()

    # Add `country` to the list since all playgrounds are in the U.S.
    # Add `city` and `county` since lat. and long. should take care of them
    to_drop += [name for name in always_dropped if name not in to_drop]

    # Drop columns with names in list
    output_data = input_data.drop(columns=to_drop)

    # Fill in remaining NaN values in locale-related columns with 0.
    # Filtering (rather than list.remove) avoids a ValueError when `city`
    # or `county` have no missing values at all.
    partially_missing = prop_missing[(prop_missing > 0) & (prop_missing <= 0.3)].index
    to_impute = [name for name in partially_missing if name not in always_dropped]
    if to_impute:
        output_data[to_impute] = output_data[to_impute].fillna(0)

    return output_data

In [89]:
# Run the three preprocessing stages in order: Biba columns first,
# then the weather/political columns, then the locale columns.
data = (
    df
    .pipe(preprocess_biba)
    .pipe(preprocess_weather)
    .pipe(preprocess_neighbour)
)

In [90]:
# Preview the first rows of the output data
data.head()

Unnamed: 0,external_id,month,year,B20004e10,B11016e1,B12001e12,B20004e11,B19125e1,B12001e13,B23008e22,...,monthly_safety,monthly_variety,monthly_condition,monthly_Wednesday,monthly_Thursday,monthly_Friday,monthly_Saturday,monthly_Monday,monthly_Sunday,monthly_Tuesday
0,1900203,3,2019,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1900203,6,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1900203,8,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MR00101775,1,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MR00101775,8,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
def clean_categorical(input_data, to_drop=['income_class', 'density_class', 'climate']):
    """
    Given the original dataframe, uses One-Hot-Encoding to encode the categorical variables.

    Each column named in `to_drop` is replaced by one 0/1 integer column
    per category. The new columns are named after the bare category values
    (e.g. `HI`, `LI`, `MI`), so categories of different variables must not
    share a name.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    to_drop : list
        The list of the categorical variables on which we want to apply OHE;
        these columns are removed from the output once encoded

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A new frame: the remaining input columns followed by the indicator
        columns, in the order of `to_drop`. `input_data` is not modified.

    """
    to_encode = list(to_drop)

    #Apply One-Hot-Encoding to each one of the categorical variable
    indicator_frames = []
    for col in to_encode:
        ohe = OneHotEncoder(dtype=int)
        # The encoder returns a SciPy sparse matrix by default; densifying it
        # here avoids the `sparse=` keyword, which newer scikit-learn
        # releases renamed to `sparse_output` and then removed.
        indicators = ohe.fit_transform(input_data[[col]]).toarray()
        # Carry the input index so concat lines the indicators up with
        # their rows even when the index is not 0..n-1.
        indicator_frames.append(
            pd.DataFrame(indicators, columns=ohe.categories_[0], index=input_data.index))

    output_data = pd.concat([input_data] + indicator_frames, axis=1)

    #Drop the columns for which we used OHE
    return output_data.drop(columns=to_encode)

In [92]:
# Perform one-hot encoding on three columns:
data = clean_categorical(data)

In [93]:
# Check the shape of the dataframe after all pre-processing
data.head()

Unnamed: 0,external_id,month,year,B20004e10,B11016e1,B12001e12,B20004e11,B19125e1,B12001e13,B23008e22,...,monthly_Tuesday,HI,LI,MI,HD,LD,MD,A,C,D
0,1900203,3,2019,51111,1868,688,0,78934,1342,0,...,0.0,1,0,0,1,0,0,0,1,0
1,1900203,6,2018,51111,1868,688,0,78934,1342,0,...,0.0,1,0,0,1,0,0,0,1,0
2,1900203,8,2018,51111,1868,688,0,78934,1342,0,...,0.0,1,0,0,1,0,0,0,1,0
3,MR00101775,1,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0,1,0,0,1,0,0,1,0
4,MR00101775,8,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0,1,0,0,1,0,0,1,0


Create `X` and `y`:

In [73]:
# Target: the Unacast session count; features: every other column
y = data['unacast_session_count']
X = data.drop(columns=['unacast_session_count'])

In [98]:
# For now, drop `external_id` and `state`
# NOTE: X is reassigned from itself, so running this cell a second time
# raises KeyError because the two columns are already gone.
X = X.drop(['external_id', 'state'], axis=1)

In [74]:
# Check if there are missing values in X
X.isna().sum().sort_values(ascending=False)

days_since_first_sess    17800
D                            0
B12001e2                     0
B11005e9                     0
B13016e5                     0
                         ...  
precip_mm_10_above           0
precip_mm_1_10               0
precip_mm_0_1                0
precip_mm_none               0
external_id                  0
Length: 820, dtype: int64

In [101]:
# For now, fill `days_since_first_sess` with 0
# (the missing-value check reports 17800 NaN values for this column, and no
# other column has any)
X['days_since_first_sess'] = X['days_since_first_sess'].fillna(0)

In [102]:
# Check if there are missing values in y
y.isna().sum()

0

No `NaN` values in `y` - that's good.

Split the data into training and validation sets:

In [103]:
# Hold out 20% of the rows for validation; the seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

In [105]:
X_train.shape

(40096, 818)

In [106]:
X_valid.shape

(10024, 818)

Fit a `GradientBoostingRegressor` with default settings:

In [151]:
# Library defaults everywhere except progress logging and a fixed seed
params = dict(verbose=1, random_state=2020)

In [152]:
# Baseline model; verbose=1 in `params` prints the per-iteration training loss
gbr = GradientBoostingRegressor(**params)
gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      213669.2304            4.86m
         2      205282.8246            4.79m
         3      198390.9645            4.72m
         4      191658.6878            5.01m
         5      186589.0004            4.92m
         6      181186.5017            4.90m
         7      177208.5001            4.84m
         8      174242.0719            5.04m
         9      171638.3291            5.22m
        10      165022.6046            5.32m
        20      129466.3909            4.65m
        30       99860.7850            3.98m
        40       81710.4135            3.35m
        50       70320.0408            2.81m
        60       62436.7513            2.22m
        70       58201.8945            1.67m
        80       53252.5016            1.11m
        90       49244.3230           33.05s
       100       46957.9923            0.00s


In [170]:
# Calculate MSE
y_pred = gbr.predict(X_valid)
mean_squared_error(y_valid, y_pred)

370204.9588049915

In [153]:
# Calculate R^2 of the prediction
gbr.score(X_valid, y_valid)

0.27425744561888155

Create another `GradientBoostingRegressor` with `n_estimators` raised from the default of 100 to 1000 (a tenfold increase).

In [154]:
# Same settings as the baseline, but with 1000 boosting stages
params_1000 = dict(verbose=1, n_estimators=1000, random_state=2020)

In [155]:
gbr_1000 = GradientBoostingRegressor(**params_1000)

# Time the fit with wall-clock timestamps; `fit_time` is in seconds
t0 = time.time()
gbr_1000.fit(X_train, y_train)
t1 = time.time()
fit_time = t1 - t0

      Iter       Train Loss   Remaining Time 
         1      213669.2304           48.80m
         2      205282.8246           47.63m
         3      198390.9645           47.96m
         4      191658.6878           49.80m
         5      186589.0004           53.81m
         6      181186.5017           53.67m
         7      177208.5001           53.37m
         8      174242.0719           53.11m
         9      171638.3291           52.80m
        10      165022.6046           52.90m
        20      129466.3909           50.69m
        30       99860.7850           51.99m
        40       81710.4135           51.96m
        50       70320.0408           51.28m
        60       62436.7513           53.76m
        70       58201.8945           53.96m
        80       53252.5016           52.57m
        90       49244.3230           50.99m
       100       46957.9923           49.76m
       200       33108.7092           43.56m
       300       26537.4012           37.29m
       40

In [162]:
# Calculate R^2 of the prediction
gbr_1000.score(X_valid, y_valid)

0.2861276364206973

In [171]:
y_pred_1000 = gbr_1000.predict(X_valid)
mean_squared_error(y_valid, y_pred_1000)

364149.91425749223

Perform randomized search of optimal hyperparameters:

In [175]:
# Candidate values for the randomized search.
# `None` makes every split consider all features; it stands in for the
# deprecated 'auto' option, which meant the same thing for this estimator.
param_grid = {'min_samples_split': [2, 4, 6],
              'max_depth': [3, 5, 7, 9],
              'max_features': [None, 'sqrt']}

In [None]:
# Base estimator for the search: 1000 boosting stages, fixed seed
gbr_gs = GradientBoostingRegressor(n_estimators=1000, verbose=1, random_state=2020)

# n_iter=3 samples three of the 24 combinations in `param_grid`; the log
# below reports 5 folds per candidate, i.e. 15 fits, run one at a time (n_jobs=1)
rscv = RandomizedSearchCV(gbr_gs, param_grid, n_iter=3, verbose=2, n_jobs=1, random_state=2020)

search = rscv.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] min_samples_split=4, max_features=auto, max_depth=3 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


      Iter       Train Loss   Remaining Time 
         1      175382.1518           93.84m
         2      167979.2856          100.94m
         3      159780.7409           98.65m
         4      152920.9069           99.50m
         5      148649.0199          100.39m
         6      143598.0615          102.15m
         7      139962.7636          100.08m
         8      137345.5012           98.46m
         9      135097.3866           97.04m
        10      131214.4945           96.25m
        20       98106.9549           91.97m
        30       80402.1899           91.15m
        40       67393.8868           89.61m
        50       59352.0962           88.59m
        60       54595.4692           86.59m


Plot feature importance:

References:
- https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
- https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
- http://www.chengli.io/tutorials/gradient_boosting.pdf