In [39]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_pinball_loss, make_scorer
from scipy.stats import loguniform
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import pickle

## Read Train and Test Data 

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

## Check for Missing values and replace with the most frequent value

In [3]:
train.isnull().sum()

id                 0
wi                 0
year               0
month              0
age                0
education          0
familysize         0
urban              0
race               0
region          6994
state          95742
marital            0
occupation    233483
income             0
expense            0
dtype: int64

In [4]:
train = train.fillna(train.mode().iloc[0].astype('int64'))
train.isnull().sum()

id            0
wi            0
year          0
month         0
age           0
education     0
familysize    0
urban         0
race          0
region        0
state         0
marital       0
occupation    0
income        0
expense       0
dtype: int64

In [5]:
test.isnull().sum()

id                0
year              0
month             0
age               0
education         0
familysize        0
urban             0
race              0
region         1826
state         24016
marital           0
occupation    58171
income            0
dtype: int64

In [6]:
test = test.fillna(test.mode().iloc[0].astype('int64'))
test.isnull().sum()

id            0
year          0
month         0
age           0
education     0
familysize    0
urban         0
race          0
region        0
state         0
marital       0
occupation    0
income        0
dtype: int64

## Identify categorical, ordinal and numeric columns

In [7]:
columns = train.columns
columns  = columns.to_list()
categorical_columns=['urban','race','region','state','marital','occupation']
ordinal_columns=['education']
numeric_columns=['age','familysize','income']
weight_column=['wi']
target=['expense']
id_column=['id']
columns

['id',
 'wi',
 'year',
 'month',
 'age',
 'education',
 'familysize',
 'urban',
 'race',
 'region',
 'state',
 'marital',
 'occupation',
 'income',
 'expense']

## Convert types of various columns in train and test data

In [8]:
#Convert cateogrical columns to 'category' type
cat_types={c:'category' for c in categorical_columns}
train = train.astype(cat_types)
#Convert ordinal column to category type
train['education'] = train['education'].astype('category')
#Get different categories for ordinal column
train_ord_categories = train.education.unique().tolist()
train_ord_categories.sort()
train['education'] = train['education'].cat.set_categories(train_ord_categories,ordered=True)

In [9]:
#Convert cateogrical columns to 'category' type
cat_types={c:'category' for c in categorical_columns}
test = test.astype(cat_types)
#Convert ordinal column to category type
test['education'] = test['education'].astype('category')
#Get different categories for ordinal column
test_ord_categories = test.education.unique().tolist()
test_ord_categories.sort()
test['education'] = test['education'].cat.set_categories(test_ord_categories,ordered=True)
test.dtypes

id               int64
year             int64
month            int64
age              int64
education     category
familysize       int64
urban         category
race          category
region        category
state         category
marital       category
occupation    category
income           int64
dtype: object

## Create Train and Validation

In [10]:
from sklearn.model_selection import train_test_split
a_train,a_validation = train_test_split(train,test_size=0.2)
train = pd.DataFrame(a_train,columns=columns)
validation = pd.DataFrame(a_validation,columns=columns)

## Create X_train, y_train, X_val, y_val

In [11]:
X_train = train[categorical_columns+ordinal_columns+numeric_columns]
y_train = train[target]
sample_weights_train=train[weight_column].to_numpy()
sample_weights_train=sample_weights_train.reshape(-1)
X_val = validation[categorical_columns+ordinal_columns+numeric_columns]
y_val = validation[target]
sample_weights_validation=validation[weight_column].to_numpy()
sample_weights_validation = sample_weights_validation.reshape(-1)

## Create X_test for final predictions

In [12]:
X_test = test[categorical_columns+ordinal_columns+numeric_columns]

## Create preprocessor and column transformer for categorical,numerical and ordinal columns

In [13]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()
ordinal_preprocessor = OrdinalEncoder(dtype=int)
preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("ordinal-encoder", ordinal_preprocessor,ordinal_columns),
    ]
)

## Create estimators and scorers for each quantile

In [14]:
quantiles = [0.005,0.025,0.165,0.25,0.5,0.75,0.835,0.975,0.995]
estimators={}
scorers = {}
for index,q in enumerate(quantiles):
    estimators["q"+str(index+1)] = Pipeline([('preprocessor',preprocessor),('model',GradientBoostingRegressor(loss="quantile",alpha=q,random_state=42,verbose=2))])
    scorers["q"+str(index+1)] = make_scorer(mean_pinball_loss,alpha=q,greater_is_better=False)

## Create Hyperparameters for search

In [15]:
hyperparameters = {"model__learning_rate":loguniform(0.001,1).rvs(size=10)}

## Create search_cv for each quantile

In [16]:
search_cv = {}
for q in estimators.keys():
    search_cv[q] = GridSearchCV(estimator=estimators[q],
                                param_grid=hyperparameters,
                                scoring=scorers[q],
                                n_jobs=8,
                                verbose=10)

In [17]:
fit_params = {"model__sample_weight":sample_weights_train}
for q in search_cv.keys():
    print(f"Searching best Hyperparameters for {q}")
    search_cv[q].fit(X_train,y_train.values.ravel(),**fit_params)

Searching best Hyperparameters for q1
Fitting 5 folds for each of 10 candidates, totalling 50 fits
      Iter       Train Loss   Remaining Time 
         1          20.9298            1.39m
         2          20.9298            1.33m
         3          20.9298            1.25m
         4          20.9298            1.22m
         5          20.9298            1.18m
         6          20.9298            1.17m
         7          20.9298            1.16m
         8          20.9298            1.16m
         9          20.9298            1.16m
        10          20.9298            1.14m
        11          20.9298            1.15m
        12          20.9298            1.13m
        13          20.9298            1.11m
        14          20.9298            1.09m
        15          20.9298            1.08m
        16          20.9298            1.07m
        17          20.9298            1.05m
        18          20.9298            1.03m
        19          20.9298            1.02m


        77          78.0531           17.54s
        78          78.0531           16.79s
        79          78.0531           16.03s
        80          78.0531           15.28s
        81          78.0531           14.54s
        82          78.0531           13.80s
        83          78.0531           13.09s
        84          78.0531           12.39s
        85          78.0531           11.68s
        86          78.0531           10.95s
        87          78.0531           10.21s
        88          78.0531            9.48s
        89          78.0531            8.74s
        90          78.0531            7.98s
        91          78.0531            7.21s
        92          78.0531            6.44s
        93          78.0531            5.66s
        94          78.0531            4.87s
        95          78.0531            4.07s
        96          78.0531            3.27s
        97          78.0531            2.46s
        98          78.0531            1.65s
        99

        53         510.0076           35.84s
        54         509.8611           35.20s
        55         509.8132           34.50s
        56         509.7918           33.86s
        57         509.7658           33.11s
        58         509.7362           32.38s
        59         509.7191           31.67s
        60         509.6701           30.87s
        61         509.6636           30.15s
        62         509.6335           29.48s
        63         509.6184           28.76s
        64         509.6090           27.92s
        65         509.5827           27.13s
        66         509.5712           26.37s
        67         509.5481           25.61s
        68         509.5371           24.88s
        69         509.4972           24.13s
        70         509.4180           23.34s
        71         509.3759           22.56s
        72         509.3652           21.73s
        73         509.3355           20.94s
        74         509.3227           20.14s
        75

        29         829.1062           58.53s
        30         828.9484           57.77s
        31         828.8368           56.89s
        32         828.7117           56.06s
        33         828.6023           55.21s
        34         828.5398           54.33s
        35         828.4051           53.56s
        36         828.3243           52.81s
        37         828.2647           51.89s
        38         828.1322           50.97s
        39         828.0462           50.12s
        40         827.9896           49.29s
        41         827.7840           48.39s
        42         827.6833           47.54s
        43         827.5856           46.90s
        44         827.5142           46.05s
        45         827.4126           45.20s
        46         827.3761           44.55s
        47         827.3385           43.81s
        48         827.2619           42.91s
        49         827.2290           42.20s
        50         827.1930           41.27s
        51

         5         387.5727            1.38m
         6         385.3920            1.38m
         7         383.8750            1.37m
         8         382.5357            1.35m
         9         381.6574            1.32m
        10         380.7628            1.30m
        11         380.2105            1.27m
        12         379.6348            1.26m
        13         379.0619            1.24m
        14         378.6618            1.22m
        15         378.3516            1.20m
        16         378.1170            1.18m
        17         377.7497            1.16m
        18         377.5388            1.15m
        19         377.4034            1.13m
        20         377.1901            1.12m
        21         377.0383            1.10m
        22         376.8665            1.08m
        23         376.7898            1.06m
        24         376.6759            1.04m
        25         376.4847            1.03m
        26         376.3660            1.01m
        27

        84         163.6612            2.92m
        85         163.6573            2.71m
        86         163.6450            2.50m
        87         163.6419            2.30m
        88         163.6350            2.10m
        89         163.6319            1.90m
        90         163.6128            1.71m
        91         163.6018            1.53m
        92         163.5968            1.34m
        93         163.5716            1.16m
        94         163.5536           59.21s
        95         163.5319           48.86s
        96         163.5271           38.71s
        97         163.5022           28.76s
        98         163.4823           18.99s
        99         163.4780            9.41s
       100         163.4644            0.00s


## Predict on entire training set and calculate mean_pinball_loss_train for all quantiles

In [31]:
mpl_train={}
for i,q in enumerate(search_cv.keys()):
    y_pred=search_cv[q].predict(X_train)
    mpl_train[q]=mean_pinball_loss(y_train.values.ravel(),y_pred,sample_weight=sample_weights_train,alpha=quantiles[i])
mpl_train

{'q1': 20.92979401072278,
 'q2': 78.05309032083764,
 'q3': 373.3537340294139,
 'q4': 508.633947199487,
 'q5': 778.4402932578334,
 'q6': 824.3514201131849,
 'q7': 758.0374754982063,
 'q8': 373.48023079754245,
 'q9': 163.4644260464789}

## Predict on validation set and calculate mean_pinball_loss for all quantiles

In [19]:
mpl_val={}
for i,q in enumerate(search_cv.keys()):
    y_pred = search_cv[q].predict(X_val)
    mpl_val[q]=mean_pinball_loss(y_val.values.ravel(),y_pred,sample_weight=sample_weights_validation,alpha=quantiles[i])
mpl_val

{'q1': 20.675099707149602,
 'q2': 77.65826534528892,
 'q3': 371.4715509523964,
 'q4': 505.7313958445188,
 'q5': 772.2780109723352,
 'q6': 815.5244500458288,
 'q7': 749.2662670196337,
 'q8': 366.9791288575568,
 'q9': 160.49446550129156}

## write mean pinball losses to a file

In [38]:
mpls={}
mpls['Data'] = ['mpl_train','mpl_val']
for q in mpl_train.keys():
    mpls[q]=[mpl_train[q],mpl_val[q]]
df = pd.DataFrame(mpls)
df.to_csv("GBRTQuantileLoss_MeanPinballLoss.csv",index=False)

## Predict on test set and create output data frame

In [20]:
test_df = test[id_column]
for q in search_cv.keys():
    test_df[q] = search_cv[q].predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[q] = search_cv[q].predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[q] = search_cv[q].predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[q] = search_cv[q].predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [21]:
test_df

Unnamed: 0,id,q1,q2,q3,q4,q5,q6,q7,q8,q9
0,741875,0.0,0.000000,601.238235,805.506886,1374.088858,2221.848777,2631.538765,4896.297014,8795.021614
1,741876,0.0,0.000000,1764.285253,2012.918245,3051.838734,4711.923660,5784.636435,12880.121505,28743.523384
2,741877,0.0,0.000000,443.534995,696.864093,1349.048740,2342.687115,3118.363884,7491.719372,17398.386310
3,741878,0.0,167.527269,641.192159,790.636246,1175.736542,1986.878909,2501.043503,5892.878668,13605.899737
4,741879,0.0,0.000000,1109.958673,1345.992511,2166.427062,3130.679186,3616.274401,7320.619769,15775.075134
...,...,...,...,...,...,...,...,...,...,...
185464,927339,0.0,0.000000,1352.809243,1801.080360,2836.879092,4254.380864,4923.898906,11370.236226,26676.480706
185465,927340,0.0,139.202440,673.926598,881.471025,1416.199220,2067.371828,2650.497919,5290.428336,10899.373029
185466,927341,0.0,0.000000,1122.487605,1465.068938,2427.541224,3901.848468,4840.810456,11196.066565,23679.097172
185467,927342,0.0,0.000000,1548.057016,1891.135630,2695.133694,3984.849454,4820.355623,10801.855581,21401.855531


## write output to test_quantiles_GBRTQuantileLoss.csv

In [23]:
test_df.to_csv('test_quantiles_GBRTQuantileLoss.csv',index=False)