In [70]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, Dropout, GRU, Conv1D, MaxPooling1D, Input, concatenate
from keras.datasets import imdb
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [20]:
def rmse(y_true,y_pred):
    return np.sqrt(mse(y_true,y_pred))

In [6]:
train = pd.read_pickle('features.pkl')
predict = pd.read_csv('Train.csv')

In [7]:
feature_cols = ['sum','mean','elevation','poly_rainfall','poly_elevation',
                'elev_diff','rainfall_diff','water_count','water_dist']
features = train[feature_cols]
target = train['target']

In [11]:
# train-test split (80-20)
x_train, x_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

In [29]:
# Lasso regression
best_lasso_model = None
best_lasso_score = float('inf')
best_alpha = 0
for a in tqdm(range(0,51)):
    lasso_model = Lasso(alpha=a/10).fit(x_train, y_train)
    val_pred_lasso = lasso_model.predict(x_val)
    val_score_lasso = rmse(val_pred_lasso,y_val)
    if val_score_lasso < best_lasso_score:
        best_alpha = a
        best_lasso_score = val_score_lasso
        best_lasso_model = lasso_model
print(f'RMSE for lasso: {best_lasso_score}, alpha = {best_alpha}')

100%|██████████| 51/51 [00:00<00:00, 107.93it/s]

RMSE for lasso: 0.2183069118585445, alpha = 0





In [32]:
# Ridge regression
best_ridge_model = None
best_ridge_score = float('inf')
best_alpha = 1
for a in tqdm(range(1,51)):
    ridge_model = Ridge(alpha=a/10).fit(x_train,y_train)
    val_pred_ridge = ridge_model.predict(x_val)
    val_score_ridge = rmse(val_pred_ridge,y_val)
    if val_score_ridge < best_ridge_score:
        best_alpha = a
        best_ridge_score = val_score_ridge
        best_ridge_model = ridge_model
print(f'RMSE for ridge: {best_ridge_score}, alpha = {best_alpha}')

100%|██████████| 50/50 [00:00<00:00, 175.59it/s]

RMSE for ridge: 0.21830698469763316, alpha = 1





In [49]:
# RandomForest 
rf_model = RandomForestRegressor(n_estimators = 500,
                                min_samples_split = 2,
                                min_samples_leaf = 1,
                                max_samples = 0.8
                                )

# param_grid = [
#     { 'min_samples_split':[2,4,8,16],
#      'max_features': [2, 4, 6, 8, 9],
#      'max_samples': [0.6,0.7,0.8],
#      'min_impurity_decrease':[0.01,0.02,0.05]
#     }
# ]
# grid_search = GridSearchCV(rf_model, param_grid, cv=5,scoring='neg_root_mean_squared_error', n_jobs=-1)
# grid_search.fit(x_train, y_train)
# rf_model = grid_search.best_estimator_
rf_model.fit(x_train,y_train)
val_pred_rf = rf_model.predict(x_val)
val_score_rf = rmse(val_pred_rf,y_val)
print(f'RMSE for Random Forest: {val_score_rf}')

RMSE for Random Forest: 0.12262997851269453


In [69]:
# LGBM
lgb_params = {
        'boosting_type': 'gbdt',         
        'objective': 'regression',       
        'metric': ['rmse'],             
        'subsample': 0.5,                
        'subsample_freq': 1,
        'learning_rate': 0.05,           
        'num_leaves': 2**8,            
        'min_data_in_leaf': 2**4,      
        'feature_fraction': 0.5,
        'n_estimators': 5000,            
        'early_stopping_rounds': 30,     
        'verbose': -1,
            } 
train_set = lgb.Dataset(x_train, y_train)
val_set = lgb.Dataset(x_val, y_val)
lgb_model = lgb.train(lgb_params, train_set, num_boost_round = 2000, valid_sets = [train_set, val_set], verbose_eval = 100)

Training until validation scores don't improve for 30 rounds
[100]	training's rmse: 0.102463	valid_1's rmse: 0.118202
[200]	training's rmse: 0.0965204	valid_1's rmse: 0.116839
Early stopping, best iteration is:
[237]	training's rmse: 0.0951186	valid_1's rmse: 0.116717


In [134]:
# training configurations
from keras import backend as K

def keras_rmse(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 
    
num_epochs = 100
batch_size = 4
num_nodes = 256
num_layers = 3
dropout = 0.2
loss_fn = keras_rmse
optimizer = 'adagrad'

In [135]:
# MLP
mlp_model = Sequential()
mlp_model.add(Dense(num_nodes, input_dim=x_train.shape[1], activation='sigmoid'))
mlp_model.add(Dropout(dropout))
for i in range(num_layers-1):
    mlp_model.add(Dense(num_nodes, activation='sigmoid'))
    mlp_model.add(Dropout(dropout))
mlp_model.add(Dense(1, activation='sigmoid'))
mlp_model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
mc = ModelCheckpoint('mlp_best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
mlp_model.fit(x_train, y_train,
                batch_size=batch_size,
                epochs=num_epochs,
                validation_data=(x_val, y_val),
                callbacks=[es,mc],
                verbose=1)

Train on 13172 samples, validate on 3294 samples
Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.83910, saving model to mlp_best_model.h5
Epoch 2/100

Epoch 00002: val_accuracy improved from 0.83910 to 0.84274, saving model to mlp_best_model.h5
Epoch 3/100

Epoch 00003: val_accuracy improved from 0.84274 to 0.84366, saving model to mlp_best_model.h5
Epoch 4/100

Epoch 00004: val_accuracy did not improve from 0.84366
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.84366
Epoch 6/100

Epoch 00006: val_accuracy did not improve from 0.84366
Epoch 7/100

Epoch 00007: val_accuracy did not improve from 0.84366
Epoch 00007: early stopping


<keras.callbacks.callbacks.History at 0x7fc890db6ac8>

In [154]:
# stacking ensemble

# creating stacked dataset
lasso_pred = best_lasso_model.predict(x_val)
ridge_pred = best_ridge_model.predict(x_val)
rf_pred = rf_model.predict(x_val)
lgb_pred = lgb_model.predict(x_val)
mlp_pred = mlp_model.predict(x_val)
mlp_pred = mlp_pred.reshape(mlp_pred.shape[0])
preds = [lasso_pred, ridge_pred, rf_pred, lgb_pred, mlp_pred]
stacked_X = None
for pred in preds:
    if stacked_X is None:
        stacked_X = pred
    else:
        stacked_X = np.dstack((stacked_X, pred))

stacked_X = stacked_X.reshape(stacked_X.shape[1],stacked_X.shape[2])

In [157]:
# train-test split
cut = stacked_X.shape[0]//5
stacked_x_val = stacked_X[:cut]
stacked_x_train = stacked_X[cut:]
stacked_y_val = y_val[:cut]
stacked_y_train = y_val[cut:]

In [161]:
# use MLP for stacking ensemble
model = Sequential()
model.add(Dense(num_nodes, input_dim=stacked_x_train.shape[1], activation='sigmoid'))
model.add(Dropout(dropout))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss=keras_rmse, optimizer='adam', metrics=['accuracy'])
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('ensemble_best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
history=model.fit(stacked_x_train, stacked_y_train,
                batch_size=batch_size,
                epochs=100,
                validation_data=(stacked_x_val, stacked_y_val),
               callbacks=[es,mc],
                 verbose=1)
# score, accuracy = model.evaluate(stacked_x_test, stacked_y_test,batch_size=batch_size)
# # print(f'Test score: {score}')
# print(f'Final test accuracy: {accuracy}')

Train on 2636 samples, validate on 658 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 00014: early stopping


In [200]:
# prepare features for prediction
predict = pd.read_csv('Train.csv')
predict = predict.reset_index()
predict.columns = ["square_idx" if x == "index" else x for x in predict.columns]
cols_2019 = [col for col in predict.columns if '2019' in col] + ['square_idx']
predict = predict[cols_2019]
fixed = train[['id','square_idx','poly_idx','poly_elevation','elevation','elev_diff','water_count','water_dist']]
precip = [i for i in range(1,18)]
predict.columns = precip + ['square_idx']
predict['sum'] = predict[precip].apply(lambda x: x.sum(),axis=1)
predict['mean'] = predict[precip].apply(lambda x: x.mean(),axis=1)
predict = predict.merge(fixed,on='square_idx')
poly_rainfall = predict.groupby('poly_idx')['sum'].mean().to_frame().reset_index()
poly_rainfall = poly_rainfall[poly_rainfall.poly_idx!=-1]
poly_rainfall.columns = ['poly_idx','poly_rainfall']
predict = predict.merge(poly_rainfall,on='poly_idx',how='left')
predict.loc[predict.poly_rainfall.isna(),'poly_rainfall'] = predict['sum']
predict['rainfall_diff'] = predict['sum'] - predict['poly_rainfall']

In [206]:
# individual model prediction
pred_features = predict[feature_cols]
lasso_pred = best_lasso_model.predict(pred_features)
ridge_pred = best_ridge_model.predict(pred_features)
rf_pred = rf_model.predict(pred_features)
lgb_pred = lgb_model.predict(pred_features)
mlp_pred = mlp_model.predict(pred_features)
mlp_pred = mlp_pred.reshape(mlp_pred.shape[0])
preds = [lasso_pred, ridge_pred, rf_pred, lgb_pred, mlp_pred]
stacked_X = None
for pred in preds:
    if stacked_X is None:
        stacked_X = pred
    else:
        stacked_X = np.dstack((stacked_X, pred))

stacked_X = stacked_X.reshape(stacked_X.shape[1],stacked_X.shape[2])

In [214]:
# final prediction
predict['target_2019'] = model.predict(stacked_X)
submission = predict[['id','target_2019']]
submission.columns = ['Square_ID','target_2019']

In [223]:
submission.to_csv('submission.csv',index=False)