In [1]:
import numpy as np
import pandas as pd
from scipy.stats import loguniform
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import make_scorer, mean_pinball_loss
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

## Read Train and Test Data 

In [2]:
# Load the labelled training data and the unlabelled test data from the
# working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## Check for Missing values and replace with the most frequent value

In [3]:
# Number of missing entries per training column.
train.isna().sum()

id                 0
wi                 0
year               0
month              0
age                0
education          0
familysize         0
urban              0
race               0
region          6994
state          95742
marital            0
occupation    233483
income             0
expense            0
dtype: int64

In [4]:
# Replace missing values with the most frequent value of the same column.
# Only columns that actually contain gaps are inspected: taking the mode of
# every column (e.g. the all-unique `id`) builds a mode table as long as the
# frame itself and contributes nothing to the fill.
train_missing_columns = train.columns[train.isna().any()]
if len(train_missing_columns) > 0:  # guard keeps the cell safe to re-run
    train_fill_values = train[train_missing_columns].mode().iloc[0]
    train = train.fillna(train_fill_values)
# Confirm that no gaps remain.
train.isna().sum()

id            0
wi            0
year          0
month         0
age           0
education     0
familysize    0
urban         0
race          0
region        0
state         0
marital       0
occupation    0
income        0
expense       0
dtype: int64

In [5]:
# Number of missing entries per test column.
test.isna().sum()

id                0
year              0
month             0
age               0
education         0
familysize        0
urban             0
race              0
region         1826
state         24016
marital           0
occupation    58171
income            0
dtype: int64

In [6]:
# Fill test-set gaps with the most frequent value seen in the TRAINING data,
# so that train and test rows are imputed with the same value and no
# statistic is estimated from the test set itself.
test_missing_columns = test.columns[test.isna().any()]
if len(test_missing_columns) > 0:  # guard keeps the cell safe to re-run
    test_fill_values = train[test_missing_columns].mode().iloc[0]
    test = test.fillna(test_fill_values)
# Confirm that no gaps remain.
test.isna().sum()

id            0
year          0
month         0
age           0
education     0
familysize    0
urban         0
race          0
region        0
state         0
marital       0
occupation    0
income        0
dtype: int64

## Identify categorical, ordinal and numeric columns

In [7]:
# Role of each column in the modelling steps below.
# Note: 'year' and 'month' are not assigned to any group, so they are not
# used as features.
id_column = ['id']
weight_column = ['wi']
target = ['expense']
categorical_columns = ['urban', 'race', 'region', 'state', 'marital', 'occupation']
ordinal_columns = ['education']
numeric_columns = ['age', 'familysize', 'income']

# Complete list of training columns, displayed for reference.
columns = train.columns.to_list()
columns

['id',
 'wi',
 'year',
 'month',
 'age',
 'education',
 'familysize',
 'urban',
 'race',
 'region',
 'state',
 'marital',
 'occupation',
 'income',
 'expense']

## Convert types of various columns in train and test data

In [8]:
# Cast every nominal feature in the training frame to pandas 'category' dtype.
cat_types = dict.fromkeys(categorical_columns, 'category')
train = train.astype(cat_types)

# 'education' is ordinal: make it categorical, then order its observed levels
# ascending.
train['education'] = train['education'].astype('category')
train_ord_categories = sorted(train['education'].unique().tolist())
train['education'] = train['education'].cat.set_categories(
    train_ord_categories, ordered=True
)

In [9]:
# Cast every nominal feature in the test frame to pandas 'category' dtype.
cat_types = dict.fromkeys(categorical_columns, 'category')
test = test.astype(cat_types)

# 'education' is ordinal: make it categorical, then order its observed levels
# ascending.
# NOTE(review): the levels are taken from the test data itself, so they may
# differ from the training levels - confirm this is intended.
test['education'] = test['education'].astype('category')
test_ord_categories = sorted(test['education'].unique().tolist())
test['education'] = test['education'].cat.set_categories(
    test_ord_categories, ordered=True
)
test.dtypes

id               int64
year             int64
month            int64
age              int64
education     category
familysize       int64
urban         category
race          category
region        category
state         category
marital       category
occupation    category
income           int64
dtype: object

## Create Train and Validation

In [10]:
# Hold out 20% of the labelled rows for validation.
# A fixed seed makes the split (and every metric computed from it)
# reproducible across kernel restarts.
RANDOM_STATE = 42

# train_test_split already returns DataFrames when given a DataFrame, so no
# re-wrapping is needed.
# Caution: this rebinds `train` to the 80% subset; re-running this cell
# without re-running the cells above would split the subset again.
train, validation = train_test_split(train, test_size=0.2, random_state=RANDOM_STATE)

## Create X_train, y_train, X_val, y_val

In [11]:
# Columns fed to the model, in the order: nominal, ordinal, numeric.
feature_columns = categorical_columns + ordinal_columns + numeric_columns

# Training split: features, target (kept as a one-column DataFrame) and
# per-row weights flattened to a 1-D array.
X_train = train[feature_columns]
y_train = train[target]
sample_weights_train = train[weight_column].to_numpy().reshape(-1)

# Validation split: same layout as the training split.
X_val = validation[feature_columns]
y_val = validation[target]
sample_weights_validation = validation[weight_column].to_numpy().reshape(-1)

## Create X_test for final predictions

In [12]:
# Feature matrix for the test set, using the same column order as training:
# nominal, ordinal, numeric.
test_feature_columns = categorical_columns + ordinal_columns + numeric_columns
X_test = test[test_feature_columns]

## Create the preprocessors and the column transformer for categorical, numerical, and ordinal columns

In [13]:
# One transformer per feature group.
# Nominal features are one-hot encoded; handle_unknown="ignore" lets the
# encoder accept categories at predict time that were absent during fit.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
# Numeric features are standardised.
numerical_preprocessor = StandardScaler()
# The ordinal feature is mapped to integer codes.
ordinal_preprocessor = OrdinalEncoder(dtype=int)

# Route each column group to its transformer. The order of the entries fixes
# the order of the output columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numeric_columns),
        ("ordinal-encoder", ordinal_preprocessor, ordinal_columns),
    ]
)

In [14]:
# Target quantiles; one independent linear quantile regressor is built per
# quantile and stored under the keys "q1" ... "q9".
quantiles = [0.005, 0.025, 0.165, 0.25, 0.5, 0.75, 0.835, 0.975, 0.995]
estimators = {}
scorers = {}
for index, q in enumerate(quantiles):
    key = "q" + str(index + 1)
    # Preprocessing plus quantile regression at level q.
    estimators[key] = Pipeline(
        [
            ('preprocessor', preprocessor),
            ('model', QuantileRegressor(quantile=q, solver="highs")),
        ]
    )
    # Pinball loss is a LOSS (lower is better). greater_is_better=False makes
    # the scorer return its negative, so the hyperparameter search, which
    # maximises the score, picks the candidate with the smallest loss.
    # Without it the search would select the worst candidate.
    scorers[key] = make_scorer(mean_pinball_loss, alpha=q, greater_is_better=False)

## Create Hyperparameters for search

In [15]:
# Grid of candidate values for the regressor's `alpha` parameter:
# 0.0, 0.1, ..., 0.9. The "model__" prefix routes the parameter to the
# pipeline step named "model".
alpha_grid = np.arange(0.0, 1, 0.1)
hyperparameters = {"model__alpha": alpha_grid}

In [16]:
# One grid search per quantile pipeline, scored with the pinball-loss scorer
# built for the same quantile. Fits run in a single process with verbose
# progress logging.
search_cv = {
    key: GridSearchCV(
        estimator=pipeline,
        param_grid=hyperparameters,
        scoring=scorers[key],
        n_jobs=1,
        verbose=10,
    )
    for key, pipeline in estimators.items()
}

In [None]:
# Per-row weights are forwarded to the "model" step of every pipeline.
fit_params = {"model__sample_weight": sample_weights_train}
# The target is stored as a one-column DataFrame; flatten it once to 1-D.
y_train_flat = y_train.values.ravel()
for q, search in search_cv.items():
    print(f"Searching best Hyperparameters for {q}")
    search.fit(X_train, y_train_flat, **fit_params)

Searching best Hyperparameters for q1
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START model__alpha=0.0...........................................


## Predict on the entire training set and calculate the weighted mean pinball loss for all quantiles

In [None]:
# Weighted pinball loss of each tuned model on the training split, evaluated
# at that model's own quantile level. `search_cv` and `quantiles` are paired
# by position.
mpl_train = {
    key: mean_pinball_loss(
        y_train.values.ravel(),
        search_cv[key].predict(X_train),
        sample_weight=sample_weights_train,
        alpha=level,
    )
    for key, level in zip(search_cv.keys(), quantiles)
}
mpl_train

## Predict on validation set and calculate mean_pinball_loss for all quantiles

In [None]:
# Weighted pinball loss of each tuned model on the validation split,
# evaluated at that model's own quantile level. `search_cv` and `quantiles`
# are paired by position.
mpl_val = {
    key: mean_pinball_loss(
        y_val.values.ravel(),
        search_cv[key].predict(X_val),
        sample_weight=sample_weights_validation,
        alpha=level,
    )
    for key, level in zip(search_cv.keys(), quantiles)
}
mpl_val

## Predict on test set and create output data frame

In [None]:
# Start the output frame from an explicit copy of the id column. Assigning new
# columns onto a bare slice of `test` triggers SettingWithCopyWarning and
# leaves it ambiguous whether `test` itself is modified.
test_df = test[id_column].copy()
# Add one prediction column per quantile model ("q1", "q2", ...).
for q, search in search_cv.items():
    test_df[q] = search.predict(X_test)

## Write the output to test_quantiles_QuantileReg.csv

In [None]:
# Write the id column and the quantile predictions. index=False keeps the
# positional row index out of the file; otherwise it is written as an
# unnamed first column in front of `id`.
test_df.to_csv('test_quantiles_QuantileReg.csv', index=False)