In [1]:
import numpy as np
import pandas as pd
import gc

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_squared_log_error as MSLE

from catboost import CatBoostRegressor, Pool

In [2]:
def show_me_errors(y_true, y_preds):
    """Print RMSE, RMSLE and MAE of ``y_preds`` against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_preds : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are only printed (rounded to 4 decimals), in the order
        RMSE, RMSLE, MAE.
    """
    # Every metric is computed before anything is printed, so a failing metric
    # produces no partial report.
    abs_err = MAE(y_true, y_preds)
    root_sq_err, root_sq_log_err = (
        np.sqrt(metric(y_true, y_preds)) for metric in (MSE, MSLE)
    )

    report = (
        ("RMSE:  {}", root_sq_err),
        ("RMSLE: {}", root_sq_log_err),
        ("MAE:   {}", abs_err),
    )
    for template, value in report:
        print(template.format(np.around(value, 4)))

In [3]:
def create_test_pool(X, cat_features):
    """Return a CatBoost ``Pool`` built from ``X`` with categorical columns cast to ``str``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame to predict on.
    cat_features : iterable of str
        Names of the columns of ``X`` to treat as categorical. Any iterable is
        accepted (the callers in this notebook pass a ``set``).

    Returns
    -------
    catboost.Pool
        Pool over the non-categorical columns of ``X`` followed by the
        categorical columns, each group in the column order of ``X``.

    Raises
    ------
    KeyError
        If a name in ``cat_features`` is not a column of ``X``.

    Notes
    -----
    The column layout is derived from ``X.columns`` rather than from set
    iteration, so it is identical on every run.
    NOTE(review): confirm this layout is compatible with the one the saved
    models were trained on.
    """
    cat_lookup = set(cat_features)

    absent = cat_lookup.difference(X.columns)
    if absent:
        raise KeyError("categorical columns not found in X: {}".format(sorted(absent)))

    # Indexing a DataFrame with a set is rejected by recent pandas, so work
    # with ordered lists taken from X itself.
    cat_cols = [col for col in X.columns if col in cat_lookup]
    other_cols = [col for col in X.columns if col not in cat_lookup]

    # Reorder and cast in one step. This keeps X's index and row count as-is,
    # whereas an index-on-index merge multiplies rows when the index contains
    # duplicate labels.
    X_encoded = X[other_cols + cat_cols].astype({col: 'str' for col in cat_cols})

    return Pool(X_encoded, cat_features=cat_cols)
    

# Site Predictions

In [None]:
# Candidate categorical column names: identifier / calendar columns plus the
# "<weather reading>_was_missing" indicator flags.
cat_features = (
    {'building_id', 'meter', 'primary_use', 'day_of_month', 'day_of_week'}
    | {'air_temperature_was_missing', 'cloud_coverage_was_missing',
       'dew_temperature_was_missing', 'precip_depth_1_hr_was_missing',
       'sea_level_pressure_was_missing', 'wind_direction_was_missing',
       'wind_speed_was_missing'}
)

In [4]:
# Empty frame carrying the result schema for the per-site predictions
# (row id, true reading, site-model prediction); the bare `y` displays it.
y = pd.DataFrame(columns=["RowID", "y_true", "y_pred_sites"])
y

Unnamed: 0,RowID,y_true,y_pred_sites


In [5]:
# Columns CatBoost should treat as categorical. Only the names that actually
# occur in a given site's file are used (see the intersection in step 3).
cat_features_sites = {'building_id', 'meter', 'primary_use',
                'air_temperature_was_missing',
                'cloud_coverage_was_missing', 'dew_temperature_was_missing',
                'precip_depth_1_hr_was_missing', 'sea_level_pressure_was_missing',
                'wind_direction_was_missing', 'wind_speed_was_missing',
                'day_of_month', 'day_of_week'}

# Per-site result frames are collected in a list and concatenated once after
# the loop. DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration. Building `y` from scratch here also
# makes the cell safe to re-run without duplicating rows.
site_frames = []

for site in range(16):

    # 1. Set model and test file names
    model_file_name = "/data/site_id/final_models/model_site_{}".format(site)
    # Alternative input (holdout split), kept for switching by hand:
    #test_file_name = "/data/site_id/holdout/holdout_site_id_{}.csv".format(site)
    test_file_name = "/data/site_id/test/test_site_id_{}.csv".format(site)

    # 2. Read the test file
    test = pd.read_csv(test_file_name)

    # "Unnamed: 0" is used as the row identifier.
    RowID = test["Unnamed: 0"]
    y_test = test.meter_reading
    # NOTE(review): "Unnamed: 0" is still a column of X_test and is therefore
    # passed to the model as a feature - confirm the models were trained that way.
    X_test = test.drop("meter_reading", axis=1)
    del test
    gc.collect()

    # 3. Identify features / categorical features
    # Sorted list rather than a set: gives a stable order and a container that
    # pandas accepts as a column indexer.
    file_cols = set(X_test.columns)
    selected_cat_features = sorted(cat_features_sites.intersection(file_cols))

    # 4. Create CatBoost Pool
    X_pool = create_test_pool(X_test, cat_features=selected_cat_features)

    # 5. Read the model file
    model = CatBoostRegressor()
    model.load_model(model_file_name)

    # 6. Make prediction
    print("Predicting for site {}".format(site))
    raw_preds = model.predict(X_pool)

    # expm1 maps the raw model output back to the original scale
    # (presumably the target was log1p-transformed for training - TODO confirm).
    preds = np.expm1(raw_preds)

    # 7. Keep row id, true value and prediction side by side for this site
    site_frames.append(pd.DataFrame({"RowID": RowID,
                                     "y_true": y_test,
                                     "y_pred_sites": preds}))

    gc.collect()

# One concatenation for all sites; the per-file indices are kept, as before.
y = pd.concat(site_frames)

Predicting for site 0
Predicting for site 1


  interactivity=interactivity, compiler=compiler, result=result)


Predicting for site 2
Predicting for site 3
Predicting for site 4
Predicting for site 5
Predicting for site 6
Predicting for site 7
Predicting for site 8
Predicting for site 9
Predicting for site 10
Predicting for site 11
Predicting for site 12
Predicting for site 13
Predicting for site 14
Predicting for site 15


In [6]:
# Persist the per-site predictions; the index is dropped because RowID already
# identifies each row.
y.to_csv("pred_results/preds_site_test.csv", index=False)

In [7]:
# Sanity check: (number of predicted rows, 3 columns).
y.shape

(3972960, 3)

# Meter Predictions 

In [11]:
# Empty frame carrying the result schema for the per-meter predictions
# (row id, true reading, meter-model prediction); the bare `y_meter` displays it.
y_meter = pd.DataFrame(columns=["RowID", "y_true", "y_pred_meter"])
y_meter

Unnamed: 0,RowID,y_true,y_pred_meter


In [6]:
# Categorical columns for the per-meter models, including "site_id".
# NOTE(review): the following cell assigns `cat_features_meter` again without
# "site_id"; when the notebook is run top to bottom that later assignment is
# the one the meter loop sees. Confirm which set the models were trained with
# and remove the other cell.
cat_features_meter = {'building_id', "site_id", 'primary_use',
                'air_temperature_was_missing',
                'cloud_coverage_was_missing', 'dew_temperature_was_missing',
                'precip_depth_1_hr_was_missing', 'sea_level_pressure_was_missing',
                'wind_direction_was_missing', 'wind_speed_was_missing',
                'day_of_month', 'day_of_week'}

In [5]:
# Categorical columns for the per-meter models: identifier / calendar columns
# plus the "<weather reading>_was_missing" indicator flags.
cat_features_meter = (
    {'building_id', 'primary_use', 'day_of_month', 'day_of_week'}
    | {'air_temperature_was_missing', 'cloud_coverage_was_missing',
       'dew_temperature_was_missing', 'precip_depth_1_hr_was_missing',
       'sea_level_pressure_was_missing', 'wind_direction_was_missing',
       'wind_speed_was_missing'}
)

In [12]:

# Per-meter result frames are collected in a list and concatenated once after
# the loop. DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration. Building `y_meter` from scratch here
# also makes the cell safe to re-run without duplicating rows.
meter_frames = []

for meter in [0, 1, 2, 3]:

    # 1. Set model and test file names
    # NOTE(review): this loop reads the holdout files, whereas the site loop
    # above reads the test files - confirm which split is intended.
    model_file_name = "/data/meter_type/final_models/model_meter_{}".format(meter)
    test_file_name = "/data/meter_type/holdout/holdout_meter_{}.csv".format(meter)

    # 2. Read the test file
    test = pd.read_csv(test_file_name)

    # "Unnamed: 0" is used as the row identifier.
    RowID = test["Unnamed: 0"]
    y_test = test.meter_reading
    # NOTE(review): "Unnamed: 0" is still a column of X_test and is therefore
    # passed to the model as a feature - confirm the models were trained that way.
    X_test = test.drop("meter_reading", axis=1)
    del test
    gc.collect()

    # 3. Identify features / categorical features
    # Sorted list rather than a set: gives a stable order and a container that
    # pandas accepts as a column indexer.
    file_cols = set(X_test.columns)
    selected_cat_features = sorted(cat_features_meter.intersection(file_cols))

    # 4. Create CatBoost Pool
    X_pool = create_test_pool(X_test, cat_features=selected_cat_features)

    # 5. Read the model file
    model = CatBoostRegressor()
    model.load_model(model_file_name)

    # 6. Make prediction
    print("Predicting meter {}".format(meter))
    raw_preds = model.predict(X_pool)

    # expm1 maps the raw model output back to the original scale
    # (presumably the target was log1p-transformed for training - TODO confirm).
    preds = np.expm1(raw_preds)

    # 7. Keep row id, true value and prediction side by side for this meter
    meter_frames.append(pd.DataFrame({"RowID": RowID,
                                      "y_true": y_test,
                                      "y_pred_meter": preds}))

    gc.collect()

# One concatenation for all meters; the per-file indices are kept, as before.
y_meter = pd.concat(meter_frames)

Predicting meter 0
Predicting meter 1
Predicting meter 2
Predicting meter 3


In [None]:
# Peek at the first rows of the per-meter predictions.
y_meter.head()

In [14]:
# Sanity check: (number of predicted rows, 3 columns).
y_meter.shape

(3972960, 3)

In [13]:
# Persist the per-meter predictions; the index is dropped because RowID already
# identifies each row.
# NOTE(review): the meter loop reads "holdout_meter_*.csv" but the output is
# named "*_test.csv" - confirm the file name matches the split that was scored.
y_meter.to_csv("pred_results/preds_meter_test.csv", index=False)

# Baseline Prediction 

In [13]:
# Baseline model: predict the mean training meter reading for every row.
# Only the target column is needed, so only that column is parsed instead of
# loading the whole training file into memory.
train_df = pd.read_csv("/data/preprocessed/train.csv", usecols=["meter_reading"])

y_train = train_df["meter_reading"]
baseline_pred = np.around(y_train.mean(), 4)

print(baseline_pred)

468.2848


In [14]:
# The training data was only needed for the baseline mean; release it.
del train_df, y_train
gc.collect()

5334

# Final Combination: Averaging 

In [10]:
# Load the saved per-site and per-meter predictions.
pred_site = pd.read_csv("pred_results/preds_site.csv")
pred_meter = pd.read_csv("pred_results/preds_meter.csv")

# Line the two models up row by row (inner join on row id and true value).
y_preds = pd.merge(pred_site, pred_meter, on=["RowID", "y_true"])

# Add baseline prediction column (the scalar is broadcast to every row)
y_preds["y_pred_baseline"] = baseline_pred

# Clip prediction values so that the lower value is 0.
# Only a lower bound is needed: capping at the column's own maximum was a no-op.
y_preds["y_pred_sites"] = y_preds["y_pred_sites"].clip(lower=0)
y_preds["y_pred_meter"] = y_preds["y_pred_meter"].clip(lower=0)

# Add the average model prediction. The vectorised row mean gives the same
# values as a row-wise apply(np.mean) without a Python call per row.
y_pred_average = y_preds[["y_pred_sites", "y_pred_meter"]].mean(axis=1)
y_preds["y_pred_average"] = y_pred_average
y_preds.sample(5)

In [21]:
# Put the columns in reporting order and sort the rows by RowID, then save
# everything to a single file.
y_preds = (
    y_preds
    .loc[:, ['RowID', 'y_true', 'y_pred_baseline', 'y_pred_sites', 'y_pred_meter', 'y_pred_average']]
    .sort_values(by="RowID")
)

y_preds.to_csv("pred_results/all_preds.csv", index=False)

# Error Metrics 

## Train, Validation, Test Set Errors

### Train

In [16]:
# Load the saved train-set predictions of both model families.
train_meter = pd.read_csv("pred_results/preds_meter_train.csv")
train_site = pd.read_csv("pred_results/preds_site_train.csv")

# Merge the two dfs (inner join on row id and true value)
y_preds = pd.merge(train_meter, train_site, on=["RowID", "y_true"])

# Clip prediction values so that the lower value is 0.
# Only a lower bound is needed: capping at the column's own maximum was a no-op.
y_preds["y_pred_sites"] = y_preds["y_pred_sites"].clip(lower=0)
y_preds["y_pred_meter"] = y_preds["y_pred_meter"].clip(lower=0)

# Add the average model prediction. The vectorised row mean gives the same
# values as a row-wise apply(np.mean) without a Python call per row.
y_pred_average = y_preds[["y_pred_sites", "y_pred_meter"]].mean(axis=1)
y_preds["y_pred_average"] = y_pred_average
y_preds.sample(5)

Unnamed: 0,RowID,y_true,y_pred_meter,y_pred_sites
0,0,1579.22,496.958014,549.675422
1,1,43.69,24.880304,47.551085
2,2,8.9927,9.459577,14.32339
3,3,109.0,197.550625,55.044439
4,5,11.65,10.521703,9.050824


In [17]:
# Clip prediction values so that the lower value is 0.
# Only a lower bound is needed: capping at the column's own maximum was a no-op.
y_preds["y_pred_sites"] = y_preds["y_pred_sites"].clip(lower=0)
y_preds["y_pred_meter"] = y_preds["y_pred_meter"].clip(lower=0)

# Add the average model prediction. The vectorised row mean gives the same
# values as a row-wise apply(np.mean) without a Python call per row.
y_pred_average = y_preds[["y_pred_sites", "y_pred_meter"]].mean(axis=1)
y_preds["y_pred_average"] = y_pred_average
y_preds.sample(5)

Unnamed: 0,RowID,y_true,y_pred_meter,y_pred_sites,y_pred_average
11160986,1854,1358.76,226.328993,325.739441,276.034217
6144942,10420998,5.9,10.849287,10.714578,10.781932
4796646,8135067,146.48,78.594047,71.484202,75.039124
872919,1479018,202.0,154.98738,200.278166,177.632773
11461040,4732532,0.0,3.070102,3.170422,3.120262


In [19]:
# Error metrics of the averaged prediction in the current `y_preds` frame.
show_me_errors(y_true=y_preds["y_true"], y_preds=y_preds["y_pred_average"])

RMSE:  4127.8104
RMSLE: 1.1378
MAE:   316.7129


### Validation  

In [20]:
# Load the saved "*_test" predictions of both model families.
test_meter = pd.read_csv("pred_results/preds_meter_test.csv")
test_site = pd.read_csv("pred_results/preds_site_test.csv")

# Merge the two dfs (inner join on row id and true value)
y_preds = pd.merge(test_meter, test_site, on=["RowID", "y_true"])

# Clip prediction values so that the lower value is 0.
# Only a lower bound is needed: capping at the column's own maximum was a no-op.
y_preds["y_pred_sites"] = y_preds["y_pred_sites"].clip(lower=0)
y_preds["y_pred_meter"] = y_preds["y_pred_meter"].clip(lower=0)

# Add the average model prediction. The vectorised row mean gives the same
# values as a row-wise apply(np.mean) without a Python call per row.
y_pred_average = y_preds[["y_pred_sites", "y_pred_meter"]].mean(axis=1)
y_preds["y_pred_average"] = y_pred_average
y_preds.sample(5)

Unnamed: 0,RowID,y_true,y_pred_meter,y_pred_sites,y_pred_average
167953,284803,9.96,8.766757,7.628923,8.19784
1123488,1905000,1216.39,569.149101,461.087515,515.118308
629889,1068402,247.0,179.060372,173.891121,176.475746
44429,75318,13.64,5.482703,7.779976,6.63134
1270553,2154317,20.142,16.4019,24.264638,20.333269


In [21]:
# Error metrics of the averaged prediction in the current `y_preds` frame.
show_me_errors(y_true=y_preds["y_true"], y_preds=y_preds["y_pred_average"])

RMSE:  4485.6728
RMSLE: 1.13
MAE:   311.352


### Test (holdout)  

## Different Models

In [22]:
# Reload the combined prediction file (baseline, site, meter and averaged
# predictions next to the true values) and show a random sample of rows.
# Note: `preds` was a NumPy array inside the prediction loops; from here on it
# is this DataFrame.
preds = pd.read_csv("pred_results/all_preds.csv")
preds.sample(5)

Unnamed: 0,RowID,y_true,y_pred_baseline,y_pred_sites,y_pred_meter,y_pred_average
1622327,1622327,116.97,468.2848,190.712234,181.927439,186.319836
1146578,1146578,30.67,468.2848,24.537648,18.064901,21.301274
1933730,1933730,0.0,468.2848,10.079644,29.107347,19.593496
1618315,1618315,389.808,468.2848,212.79343,161.312495,187.052962
2406164,2406164,5938.11,468.2848,2349.170036,1227.020929,1788.095482


In [23]:
# Pull each model's prediction column, and the true values, out of `preds`
# so the comparisons below can refer to them by short names.
pred_baseline = preds["y_pred_baseline"]
pred_site = preds["y_pred_sites"]
pred_meter = preds["y_pred_meter"]
pred_combined = preds["y_pred_average"]

true = preds["y_true"]

### Baseline 

In [34]:
# Errors of the constant baseline prediction.
show_me_errors(true, pred_baseline)

RMSE:  4145.9845
RMSLE: 2.8482
MAE:   615.5186


### Site 

In [35]:
# Errors of the per-site models.
show_me_errors(true, pred_site)

RMSE:  4049.4
RMSLE: 1.1344
MAE:   303.5662


### Meter 

In [36]:
# Errors of the per-meter models.
show_me_errors(true, pred_meter)

RMSE:  4060.5841
RMSLE: 1.1875
MAE:   319.4877


### Combined 

In [37]:
# Errors of the averaged (site + meter) prediction.
show_me_errors(true, pred_combined)

RMSE:  4053.6748
RMSLE: 1.1313
MAE:   305.8833
