In [1]:
from collections import defaultdict
import re
import time
import pandas as pd
import altair as alt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [2]:
# Uncomment to raise the number of columns pandas shows when displaying a frame:
#pd.set_option('display.max_columns', 50)
# Lift Altair's default cap on the number of rows a chart may embed.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

Load the data:

In [3]:
# Read the training data straight from the zip archive.
# NOTE(review): the preview of this frame shows an `Unnamed: 0` column, presumably
# a row index that was written to the CSV; it is never dropped later, so it ends
# up among the model features - confirm whether `index_col=0` is intended.
df = pd.read_csv('../data/train_data.zip')

In [4]:
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


Apply basic preprocessing:

In [5]:
def dict_to_columns_df(col, key, val):
    """
    Expand a column holding stringified lists of dictionaries into a
    dataframe with one column per distinct `key` value.

    Each non-missing cell looks like
    "[{'key': 'A', 'val': 1}, {'key': 'B', 'val': 2}]". For every distinct
    value found under `key` whose first character is an ASCII letter, a
    column named "monthly_<value>" is created. Each row receives the `val`
    of its first matching dictionary, or None when the row has no matching
    dictionary or the cell is missing.

    Parameters
    ----------------
    col : DataFrame Series whose values are strings encoding a list of
    dictionaries; cells that are not strings or lists (NaN, None) are
    treated as missing.

    key : the key in the inner dictionaries from which column names are extracted

    val : the key in the inner dictionaries from which the column values are
    extracted

    Returns
    ----------------
    DataFrame
        One row per element of `col` (sharing its index when `col` is a
        Series) and one column per retained key value, sorted by name. A
        column without any usable key yields a frame with zero columns.

    Raises
    ----------------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.

    """
    import ast  # local import: `ast` is not part of the notebook's import cell

    key, val = str(key), str(val)
    prefix = "monthly_"

    def _parse(cell):
        # literal_eval only accepts Python literals, so a crafted cell cannot
        # execute arbitrary code the way `eval` would.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        if isinstance(cell, (list, tuple)):
            return cell
        return None  # NaN / None / anything else counts as missing

    # Parse every cell exactly once, iterating by position so that a
    # non-default index on `col` cannot break the lookup.
    parsed_rows = [_parse(cell) for cell in col]

    # Collect the new column names; names that do not start with an ASCII
    # letter (e.g. the Spanish survey questions) are skipped. Sorting keeps
    # the column order reproducible between runs.
    column_names = sorted({
        prefix + entry[key]
        for entries in parsed_rows if entries
        for entry in entries
        if re.match('[a-zA-Z]', entry[key])
    })

    all_cols = {name: [] for name in column_names}
    for entries in parsed_rows:
        found = {}
        for entry in entries or ():
            name = prefix + entry[key]
            # keep the first dictionary that matches each column
            if name in all_cols and name not in found:
                found[name] = entry[val]
        for name in column_names:
            all_cols[name].append(found.get(name))

    # Passing the index guarantees one output row per input row, even when
    # no column name was found at all.
    if isinstance(col, pd.Series):
        index = col.index
    else:
        index = pd.RangeIndex(len(parsed_rows))
    return pd.DataFrame(all_cols, index=index)

In [6]:
def preprocess_biba(full_data):

    """
    Performs the pre-processing of the columns for the biba data.

    The biba columns are the two label ranges
    'monthly_number_of_sessions':'distance_to_nearest_bus_stop' and
    'historic_number_of_sessions':'historic_snow'. The survey and weekday
    count columns are expanded with `dict_to_columns_df`, all object-typed
    columns and the sparsely filled historic hours are dropped, and the
    remaining missing values are replaced with 0.

    Parameters
    ---------------

    full_data : DataFrame, with no operations done on the biba columns

    Returns
    ---------------
    DataFrame
        `full_data` without its original biba columns, followed by the
        processed biba columns

    """
    biba_games_df = pd.concat([full_data.loc[:, 'monthly_number_of_sessions':'distance_to_nearest_bus_stop'],
                               full_data.loc[:, 'historic_number_of_sessions':'historic_snow']], axis = 1)

    # remember the original biba columns so they can be removed from the input later
    cols_to_drop = list(biba_games_df.columns)

    #extracting categorical features
    categorical_features = biba_games_df.loc[:, biba_games_df.dtypes == "object"]

    # creating cols from list of dictionaries
    monthly_survey_df = dict_to_columns_df(categorical_features['monthly_survey'], 'question', 'avg_answer')
    monthly_weekday_counts_df = dict_to_columns_df(categorical_features['monthly_weekday_counts'], 'weekday', 'count')

    biba_games_df = pd.concat([biba_games_df, monthly_survey_df, monthly_weekday_counts_df], axis = 1)

    #dropping categorical features
    biba_games_df = biba_games_df.drop(columns = list(categorical_features.columns))

    #dropping historic hours with low fill rate
    numerical_cols_to_remove = ['historic_hour_0', 'historic_hour_23', 'historic_hour_22', 'historic_hour_21',
                                'historic_hour_7','historic_hour_6','historic_hour_5','historic_hour_4',
                                'historic_hour_3','historic_hour_2','historic_hour_1', 'MonthYear']

    biba_games_df = biba_games_df.drop(columns = numerical_cols_to_remove)

    impute_biba_games_df =  biba_games_df.fillna(0)

    #removing the previous columns in the input data
    # (taken from the argument itself, not from a notebook-level dataframe,
    # so the function works on any frame it is given)
    full_data = full_data.drop(columns = cols_to_drop)

    #adding processed columns
    full_data = pd.concat([full_data, impute_biba_games_df], axis = 1)

    return full_data

In [7]:
# This implementation drops columns with survey answers

def preprocess_biba_no_survey(input_data):
    """
    Given the original dataframe, process the columns related to
    Biba Playground Games without expanding the survey answers.

    The Biba columns are the two label ranges
    'monthly_number_of_sessions':'distance_to_nearest_bus_stop' and
    'historic_number_of_sessions':'historic_snow'. Every object-typed
    column in those ranges and the sparsely filled historic hours are
    dropped; the remaining missing values are replaced with 0.

    Parameters
    ----------
    input_data: pandas.core.frame.DataFrame

    Returns
    -------
    output_data: pandas.core.frame.DataFrame
        `input_data` without its original Biba columns, followed by the
        processed Biba columns.

    """
    # Concatenate relevant columns into a single dataframe
    biba_df = pd.concat([input_data.loc[:, 'monthly_number_of_sessions':'distance_to_nearest_bus_stop'],
                         input_data.loc[:, 'historic_number_of_sessions':'historic_snow']], axis=1)

    # Names of the unprocessed Biba columns, removed from the input further down
    old_columns = biba_df.columns.to_list()

    # Extract categorical features
    categorical_features = biba_df.loc[:, biba_df.dtypes == "object"]

    # Identify categorical features and numerical features with high prop. of NaN values
    to_drop = categorical_features.columns.to_list()

    to_drop += ['historic_hour_0', 'historic_hour_23', 'historic_hour_22', 'historic_hour_21',
                'historic_hour_7','historic_hour_6','historic_hour_5','historic_hour_4',
                'historic_hour_3','historic_hour_2','historic_hour_1', 'MonthYear']

    # Drop said columns
    biba_df = biba_df.drop(columns=to_drop)

    # Impute any remaining NaN values with 0
    biba_df = biba_df.fillna(0)

    # Remove the old, unprocessed columns from the input data.
    # `columns=` is required: a bare list would be looked up among the row labels.
    remaining = input_data.drop(columns=old_columns)

    # Add preprocessed columns back
    output_data = pd.concat([remaining, biba_df], axis=1)

    return output_data

In [8]:
def preprocess_weather(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to weather information (`Democrats_08_Votes` to
    the end + `climate`). Impute NaN of `Number_of_holidays`
    by using the values that we have for the same month,
    impute NaN of `Green_2016` by using values found online, or 0,
    impute NaN of `climate` with its most frequent value,
    and replace remaining NaN values with 0.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
        Must contain the columns `state`, `climate`, `external_id`,
        `month`, `year`, `Number_of_holidays` and `Green_2016`, plus
        `Democrats_08_Votes` marking the start of the processed range.

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A copy of `input_data` (same rows, same order) with the columns
        from `Democrats_08_Votes` onwards and `climate` imputed.

    Raises
    ------
    ValueError
        If `Number_of_holidays` still contains NaN, or if any month of
        2018 or 2019 ends up with more than one value for it.

    """

    # Explicit copy: the helper columns added below must not touch (or warn
    # about writing into) a slice of `input_data`.
    df_weather = input_data.loc[:, 'Democrats_08_Votes':].copy()
    for helper in ['state', 'climate', 'external_id', 'month', 'year']:
        df_weather[helper] = input_data[helper]

    #fill up NaNs for `Number_of_holidays` column
    #the values are sorted by time so that the NaNs sit at the end of each
    #time period and can be forward-filled from that period's known value
    df_weather = df_weather.sort_values(['month', 'year', 'Number_of_holidays'])
    df_weather['Number_of_holidays'] = df_weather['Number_of_holidays'].ffill()

    #fill up NaNs for the `Green_2016` column
    #values were only found for Alaska and North Carolina, other states get 0
    df_weather['Green_2016'] = np.where(
     df_weather['state'] == 'Alaska', 5735,
         np.where(
            df_weather['state'] == 'North Carolina', 12105,
             np.where(
                df_weather['Green_2016'].isnull(), 0, df_weather['Green_2016']
             )
         )
    )

    df_weather['climate'] = df_weather['climate'].fillna(df_weather['climate'].mode()[0])

    #Substitute every remaining NaNs by 0
    df_weather = df_weather.fillna(value=0)

    # The assignments below align on the index, which undoes the sorting above
    output_data = input_data.copy()
    output_data.loc[:, 'Democrats_08_Votes':] = df_weather.loc[:, 'Democrats_08_Votes':]
    output_data['climate'] = df_weather['climate']

    #Tests

    #Check that there are no missing values in the `Number_of_holidays` column
    if not output_data['Number_of_holidays'].isnull().sum() == 0:
        raise ValueError('There should not be NaNs in the Number_of_holidays column')

    #Check that every month has only one value for the `Number_of_holiday` column
    number_of_error = 0
    for month in range(12):
        for year in [2018, 2019]:
            sub_df = output_data[(output_data['month'] == month+1) & (output_data['year'] == year)]
            if len(sub_df['Number_of_holidays'].unique()) > 1:
                number_of_error += 1
    if not number_of_error == 0:
        raise ValueError('Every month should have the same value for Number_of_holidays')

    return output_data

In [9]:
def preprocess_neighbour(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to locale information (`city` to
    `houses_per_sq_km`). Drop columns with >30%
    NaN values and replace remaining NaN values with 0.
    `country`, `city` and `county` are always dropped and
    `climate` is left untouched.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
        Must contain `country`, `city`, `county` and `climate`, with
        `climate` located inside the `city`:`houses_per_sq_km` range.

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A new dataframe; `input_data` is not modified.
    """

    # `climate` sits inside the locale range but is handled elsewhere, so it
    # is excluded from both the dropping and the imputing below.
    df_neighbour = input_data.loc[:, 'city':'houses_per_sq_km'].drop(columns=['climate'])

    # Proportion of missing values for each locale column, relative to the
    # frame that was passed in
    prop_missing = df_neighbour.isna().mean()

    # `country` is dropped since all playgrounds are in the U.S.;
    # `city` and `county` since lat. and long. should take care of them
    always_drop = ['country', 'city', 'county']

    # Columns with >30% of values missing, plus the ones that are always dropped
    to_drop = prop_missing[prop_missing > 0.3].index.to_list()
    to_drop += [name for name in always_drop if name not in to_drop]

    output_data = input_data.drop(columns=to_drop)

    # Fill in remaining NaN values in locale-related columns with 0,
    # skipping any column that has just been dropped
    partially_missing = prop_missing[(prop_missing > 0) & (prop_missing <= 0.3)].index
    to_impute = [name for name in partially_missing if name not in always_drop]
    if to_impute:
        output_data[to_impute] = output_data[to_impute].fillna(0)

    return output_data

In [10]:
# Apply the three preprocessing stages in order; each stage receives the
# dataframe returned by the previous one.
data = (
    df.pipe(preprocess_biba)
      .pipe(preprocess_weather)
      .pipe(preprocess_neighbour)
)

In [11]:
# Preview the first rows of the preprocessed data
data.head()

Unnamed: 0,external_id,month,year,B20004e10,B11016e1,B12001e12,B20004e11,B19125e1,B12001e13,B23008e22,...,monthly_allages,monthly_travel,monthly_accessible,monthly_Saturday,monthly_Thursday,monthly_Sunday,monthly_Tuesday,monthly_Friday,monthly_Wednesday,monthly_Monday
0,1900203,3,2019,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1900203,6,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1900203,8,2018,51111,1868,688,0,78934,1342,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MR00101775,1,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MR00101775,8,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
def clean_categorical(input_data, to_drop=['income_class', 'density_class', 'climate']):
    """
    Given the original dataframe, uses One-Hot-Encoding to encode the categorical variables.

    Each column named in `to_drop` is replaced by one 0/1 integer column per
    distinct value; the new columns are named after the values themselves and
    are appended after the remaining original columns, in the order given by
    `to_drop`. Rows keep the index of `input_data`. A row whose value is
    missing gets 0 in every indicator column of that variable.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    to_drop : list
        The list of the categorical variables on which we want to apply OHE

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A new dataframe; `input_data` is not modified.

    """
    columns_to_encode = list(to_drop)

    # `get_dummies` keeps the index of the input, so the indicator columns
    # stay aligned with their rows whatever the index looks like.
    indicator_frames = [
        pd.get_dummies(input_data[col], dtype=int)
        for col in columns_to_encode
    ]

    # Drop the columns for which we used OHE, then append their encodings
    remaining = input_data.drop(columns=columns_to_encode)
    output_data = pd.concat([remaining, *indicator_frames], axis=1)

    return output_data

In [13]:
# Perform one-hot encoding on three columns (the function's default list:
# `income_class`, `density_class` and `climate`).
# NOTE: `data` is overwritten, so this cell cannot be run twice in a row -
# the encoded columns no longer exist after the first run.
data = clean_categorical(data)

In [14]:
# Preview the first rows of the dataframe after all pre-processing
data.head()

Unnamed: 0,external_id,month,year,B20004e10,B11016e1,B12001e12,B20004e11,B19125e1,B12001e13,B23008e22,...,monthly_Monday,HI,LI,MI,HD,LD,MD,A,C,D
0,1900203,3,2019,51111,1868,688,0,78934,1342,0,...,0.0,1,0,0,1,0,0,0,1,0
1,1900203,6,2018,51111,1868,688,0,78934,1342,0,...,0.0,1,0,0,1,0,0,0,1,0
2,1900203,8,2018,51111,1868,688,0,78934,1342,0,...,0.0,1,0,0,1,0,0,0,1,0
3,MR00101775,1,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0,1,0,0,1,0,0,1,0
4,MR00101775,8,2019,45484,2613,980,30417,45578,1097,66,...,0.0,0,1,0,0,1,0,0,1,0


Create `X` and `y`:

In [15]:
# Target: the `unacast_session_count` column; features: every other column.
y = data['unacast_session_count']
X = data.drop(columns=['unacast_session_count'])

In [16]:
# For now, drop `external_id` and `state`
# NOTE: `X` is overwritten, so re-running this cell alone raises a KeyError
# because the two columns are already gone.
X = X.drop(['external_id', 'state'], axis=1)

In [17]:
# Check if there are missing values in X
X.isna().sum().sort_values(ascending=False)

days_since_first_sess    17800
D                            0
B12001e2                     0
B11005e9                     0
B13016e5                     0
                         ...  
precip_mm_10_above           0
precip_mm_1_10               0
precip_mm_0_1                0
precip_mm_none               0
month                        0
Length: 818, dtype: int64

In [18]:
# For now, fill `days_since_first_sess` with 0
# (the only column of X reported above as still having missing values)
X['days_since_first_sess'] = X['days_since_first_sess'].fillna(0)

In [19]:
# Check if there are missing values in y
y.isna().sum()

0

No `NaN` values in `y` - that's good.

Split the data into training and validation sets:

In [20]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.2,
                                                      random_state=2020)

In [21]:
X_train.shape

(40096, 818)

In [22]:
X_valid.shape

(10024, 818)

Fit a `GradientBoostingRegressor` with default settings:

In [151]:
params = {'verbose': 1,
          'random_state': 2020}

In [152]:
gbr = GradientBoostingRegressor(**params)
gbr.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      213669.2304            4.86m
         2      205282.8246            4.79m
         3      198390.9645            4.72m
         4      191658.6878            5.01m
         5      186589.0004            4.92m
         6      181186.5017            4.90m
         7      177208.5001            4.84m
         8      174242.0719            5.04m
         9      171638.3291            5.22m
        10      165022.6046            5.32m
        20      129466.3909            4.65m
        30       99860.7850            3.98m
        40       81710.4135            3.35m
        50       70320.0408            2.81m
        60       62436.7513            2.22m
        70       58201.8945            1.67m
        80       53252.5016            1.11m
        90       49244.3230           33.05s
       100       46957.9923            0.00s


In [170]:
# Calculate MSE
y_pred = gbr.predict(X_valid)
mean_squared_error(y_valid, y_pred)

370204.9588049915

In [153]:
# Calculate R^2 of the prediction
gbr.score(X_valid, y_valid)

0.27425744561888155

Create another `GradientBoostingRegressor` where `n_estimators` is increased tenfold, from the default of 100 to 1000.

In [154]:
params_1000 = {'verbose': 1,
               'n_estimators': 1000,
               'random_state': 2020}

In [155]:
gbr_1000 = GradientBoostingRegressor(**params_1000)

# Time the fit with wall-clock timestamps taken just before and just after it.
t0 = time.time()
gbr_1000.fit(X_train, y_train)
t1 = time.time()
fit_time = t1 - t0  # elapsed seconds; stored but not displayed by this cell

      Iter       Train Loss   Remaining Time 
         1      213669.2304           48.80m
         2      205282.8246           47.63m
         3      198390.9645           47.96m
         4      191658.6878           49.80m
         5      186589.0004           53.81m
         6      181186.5017           53.67m
         7      177208.5001           53.37m
         8      174242.0719           53.11m
         9      171638.3291           52.80m
        10      165022.6046           52.90m
        20      129466.3909           50.69m
        30       99860.7850           51.99m
        40       81710.4135           51.96m
        50       70320.0408           51.28m
        60       62436.7513           53.76m
        70       58201.8945           53.96m
        80       53252.5016           52.57m
        90       49244.3230           50.99m
       100       46957.9923           49.76m
       200       33108.7092           43.56m
       300       26537.4012           37.29m
       40

In [171]:
# Calculate MSE
y_pred_1000 = gbr_1000.predict(X_valid)
mean_squared_error(y_valid, y_pred_1000)

364149.91425749223

In [162]:
# Calculate R^2 of the prediction
gbr_1000.score(X_valid, y_valid)

0.2861276364206973

Perform a randomized search for the optimal hyperparameters:

In [23]:
# Candidate values the randomized search samples from.
# NOTE(review): 'auto' for `max_features` appears to be deprecated and later
# removed in newer scikit-learn releases (None looks like the equivalent for
# this estimator) - confirm against the installed version before re-running.
param_grid = {'min_samples_split': [2, 4, 6],
              'max_depth': [3, 5, 7, 9],
              'max_features': ['auto', 'sqrt']}

In [25]:
# Base estimator whose hyperparameters are varied by the search
gbr_gs = GradientBoostingRegressor(n_estimators=1000, verbose=1, random_state=2020)

# Sample 3 parameter combinations from `param_grid` and score each with
# 3-fold cross-validation (9 fits in total), run one after another (n_jobs=1).
rscv = RandomizedSearchCV(gbr_gs, param_grid, n_iter=3, verbose=2, cv=3, n_jobs=1, random_state=2020)

search = rscv.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] min_samples_split=4, max_features=auto, max_depth=3 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


      Iter       Train Loss   Remaining Time 
         1      153317.5171           65.14m
         2      142882.3344           66.35m
         3      135685.8230           66.56m
         4      128130.1997           66.66m
         5      122470.8207           66.94m
         6      118451.6037           66.95m
         7      115259.8875           67.29m
         8      112609.5008           67.27m
         9      109552.1000           67.53m
        10      107623.1704           68.16m
        20       89430.5067           67.99m
        30       71334.1225           69.04m
        40       60044.3450           69.81m
        50       52779.6393           69.76m
        60       47767.4358           69.39m
        70       43246.5435           68.77m
        80       40850.4885           68.26m
        90       39042.1967           67.94m
       100       37326.2071           67.25m
       200       24353.1053           53.62m
       300       17797.7181           38.50m
       40

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 37.7min remaining:    0.0s


      Iter       Train Loss   Remaining Time 
         1      241967.4629           30.65m
         2      225379.2903           30.43m
         3      208913.0207           30.43m
         4      201653.9873           30.33m
         5      188521.9523           30.30m
         6      177523.3656           30.27m
         7      166659.6284           30.17m
         8      158616.3034           30.14m
         9      150653.1653           30.08m
        10      143361.2094           30.05m
        20      107967.1941           29.69m
        30       81686.2252           29.74m
        40       65468.1422           29.55m
        50       54354.3595           29.37m
        60       48476.4330           29.05m
        70       45189.1124           28.73m
        80       42161.0303           28.38m
        90       39027.2504           28.08m
       100       36137.7814           27.76m
       200       25283.7676           24.63m
       300       19759.4666           21.70m
       40

       300        1952.7888            1.80m
       400        1389.6867            1.54m
       500        1055.7018            1.29m
       600         842.7764            1.03m
       700         665.2847           46.73s
       800         535.4185           31.21s
       900         448.4783           15.64s
      1000         372.9179            0.00s
[CV]  min_samples_split=2, max_features=sqrt, max_depth=7, total= 2.6min
[CV] min_samples_split=2, max_features=sqrt, max_depth=7 .............
      Iter       Train Loss   Remaining Time 
         1      231119.0006            2.55m
         2      206885.3001            2.52m
         3      186561.3483            2.49m
         4      162252.3419            2.50m
         5      141641.5785            2.48m
         6      126785.6327            2.48m
         7      113329.5210            2.46m
         8      101487.7961            2.46m
         9       89886.4764            2.46m
        10       80677.5757            2.45m


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 365.4min finished


      Iter       Train Loss   Remaining Time 
         1      201432.3232            3.71m
         2      187698.6130            3.69m
         3      172924.2830            3.59m
         4      165930.4164            3.59m
         5      152832.8956            3.57m
         6      137973.6947            3.56m
         7      129007.9530            3.54m
         8      119984.4184            3.54m
         9      110339.4638            3.54m
        10      100848.2244            3.51m
        20       54586.3249            3.48m
        30       35933.6346            3.48m
        40       25765.4375            3.47m
        50       19896.6681            3.44m
        60       15900.4821            3.40m
        70       13601.7552            3.39m
        80       11404.3946            3.36m
        90       10302.6814            3.33m
       100        9233.4724            3.29m
       200        4295.5839            2.94m
       300        2765.3633            2.57m
       40

In [31]:
# Print the most optimal hyperparameter settings
search.best_params_

{'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 7}

In [32]:
# Print the evaluation of the hyperparameter candidates
result = search.cv_results_
pd.DataFrame(result)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2159.9192,73.135515,1.049887,0.263267,4,auto,3,"{'min_samples_split': 4, 'max_features': 'auto...",0.161272,0.364089,0.055305,0.193555,0.128111,3
1,4987.493293,430.139624,1.645919,0.316282,4,auto,7,"{'min_samples_split': 4, 'max_features': 'auto...",0.190472,0.284677,0.402996,0.292715,0.086948,2
2,157.151442,0.092279,1.50586,0.004716,2,sqrt,7,"{'min_samples_split': 2, 'max_features': 'sqrt...",0.128976,0.545482,0.384348,0.352935,0.171483,1


In [35]:
# Predict on the training set with the best hyperparameter setting found by the search
# Calculate MSE
mean_squared_error(y_train, search.predict(X_train))

651.2234780126952

In [36]:
# Calculate R^2 of the prediction on the training set
search.score(X_train, y_train)

0.9971445104080854

In [37]:
# Predict on the validation set with the best hyperparameter setting found by the search
# Calculate MSE
mean_squared_error(y_valid, search.predict(X_valid))

394161.808067091

In [39]:
# Calculate R^2 of the prediction on the validation set
search.score(X_valid, y_valid)

0.2272929072871356

Try removing the playgrounds with over 100,000 lifetime sessions.

References:
- https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html
- https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
- http://www.chengli.io/tutorials/gradient_boosting.pdf