# Using blending to make predictions

Using models from the following open-source notebooks:
- https://www.kaggle.com/hongpeiyi/tuning-xgboost-with-optuna
- https://www.kaggle.com/stevenrferrer/30-days-of-ml-optimized-xgboost-5folds
- https://www.kaggle.com/aditidutta/tutorial-30days-rf-xgb-lgbm-catboost-eda
- https://www.kaggle.com/nitinrajput47/only-notebook-you-need-to-read
- https://www.kaggle.com/abhishek/competition-part-5-blending-101

In [None]:
# Imports
import pandas as pd
import numpy as np
import os

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.linear_model import LinearRegression

In [None]:
# Load the training data (with its fold assignment), the competition test
# set and the sample submission template.
df, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30days-folds/train_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [None]:
# Model inputs are every training column except the row id, the label and
# the fold marker; categorical columns are the ones with 'cat' in their name.
_non_feature_columns = ("id", "target", "kfold")
useful_features = [name for name in df.columns if name not in _non_feature_columns]
object_cols = [name for name in useful_features if 'cat' in name]
# Keep only the model inputs in the test frame (this drops its id column).
df_test = df_test.loc[:, useful_features]

In [None]:
# params
# Keyword arguments for the first XGBRegressor ('xgb1' in the models dict).
xgb_1_params = {
    # GPU training / prediction settings
    "tree_method": 'gpu_hist',
    "gpu_id": 0,
    "predictor": "gpu_predictor",
    # many very shallow trees with heavy regularisation
    'n_estimators': 9800,
    'max_depth': 2,
    'learning_rate': 0.07363768077193145,
    'gamma': 0.4,
    'min_child_weight': 1,
    'subsample': 0.7912492436244456,
    'colsample_bytree': 0.1613480080803224,
    'reg_alpha': 12.65778876193281,
    'reg_lambda': 50.25603582806218
}

# Keyword arguments for the second XGBRegressor ('xgb2' in the models dict).
# NOTE(review): 'n_estimators' only appears in the commented-out CPU lines
# below, so this configuration relies on the library default number of
# trees -- confirm that this is intended.
xgb_2_params = {
    'random_state': 1, 
    # gpu
    'tree_method': 'gpu_hist', 
    'gpu_id': 0, 
    'predictor': 'gpu_predictor',
    # cpu
#     'n_jobs': 4,
#     'booster': 'gbtree',
#     'n_estimators': 10000,
    # optimized params
    'learning_rate': 0.03628302216953097,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3
}

# Keyword arguments for the third XGBRegressor ('xgb3' in the models dict).
xgb_3_params = {
    # GPU training / prediction settings
    "tree_method": 'gpu_hist',
    "gpu_id": 0,
    "predictor": "gpu_predictor",
    # tuned booster parameters
    'learning_rate': 0.07853392035787837, 
    'reg_lambda': 1.7549293092194938e-05, 
    'reg_alpha': 14.68267919457715, 
    'subsample': 0.8031450486786944, 
    'colsample_bytree': 0.170759104940733, 
    'max_depth': 3,
    'n_estimators': 5000
}

# Keyword arguments for the fourth XGBRegressor ('xgb4' in the models dict).
# The learning rate, regularisation, sampling and depth values are the same
# as in xgb_2_params; this variant sets n_estimators explicitly and uses a
# different random_state.
xgb_4_params = {
    'tree_method':'gpu_hist',         ## parameters for gpu
    'gpu_id':0,                       #
    'predictor':'gpu_predictor',      #
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3,
    'booster': 'gbtree', 
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'n_jobs':-1,
    'random_state':40
}

# Keyword arguments for the CatBoostRegressor ('cat' in the models dict).
catb_1_params = {    
    'iterations':1600,
    'learning_rate':0.024,
    'l2_leaf_reg':20,
    'random_strength':1.5,
    'grow_policy':'Depthwise',
    'leaf_estimation_method':'Newton', 
    'bootstrap_type':'Bernoulli',
    'thread_count':4,
    'verbose':False,            # silence per-iteration training output
    'loss_function':'RMSE',     # optimise and report RMSE
    'eval_metric':'RMSE',
    'od_type':'Iter'
}

# Keyword arguments for an LGBMRegressor.
# NOTE(review): in the visible code this dict is only referenced from
# commented-out lines, so the LightGBM model is not part of the blend.
# The very large 'num_trees' combined with the small learning rate
# presumably relies on early stopping in fit_predict(lgb=True) -- confirm.
lgbm_params = {
    'metric': 'RMSE',
    'feature_pre_filter': False,
    'lambda_l1': 0.45,
    'lambda_l2': 4.8,
    'learning_rate': 0.005,
    'num_trees': 80000,
    'num_leaves': 10, 
    'feature_fraction': 0.4, 
    'bagging_fraction': 1.0, 
    'bagging_freq': 0, 
    'min_child_samples': 100,
    'num_threads': 4
}

In [None]:
def fit_predict(model, X_train:pd.DataFrame, y_train:pd.DataFrame, 
                X_val:pd.DataFrame, y_val:pd.DataFrame, X_test:pd.DataFrame, lgb:bool=False) -> tuple:
    """Fit ``model`` on one fold and predict the validation and test sets.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and target for this fold.
        X_val, y_val: Held-out features and target for this fold.
        X_test: Test features to predict with the fitted model.
        lgb: When true, fit the LightGBM way: with the validation fold as
            ``eval_set``, early stopping and the module-level ``object_cols``
            as categorical features.

    Returns:
        ``(preds_valid, test_preds, rmse)`` where ``rmse`` is the root mean
        squared error of ``preds_valid`` against ``y_val``.
    """
    if lgb:
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=-1,
            early_stopping_rounds=1000,
            categorical_feature=object_cols,
        )
    else:
        # Exactly one fit per call: an unconditional second fit here would
        # throw away the early-stopped LightGBM model fitted above.
        model.fit(X_train, y_train)
    preds_valid = model.predict(X_val)
    test_preds = model.predict(X_test)
    # Take the square root explicitly instead of passing ``squared=False``,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_val, preds_valid))
    return preds_valid, test_preds, rmse

In [None]:
def ordinal_encode(X_train, X_valid, X_test, object_cols=object_cols) -> tuple:
    """Ordinal-encode the categorical columns of one fold's frames.

    The encoder is fitted on ``X_train`` only and then applied to
    ``X_valid`` and ``X_test``.

    Args:
        X_train, X_valid, X_test: Feature frames containing ``object_cols``.
        object_cols: Names of the categorical columns to encode.

    Returns:
        ``(X_train, X_valid, X_test)`` as encoded copies; the frames passed
        in are left untouched.
    """
    ordinal_encoder = preprocessing.OrdinalEncoder()
    # Work on copies: the inputs are usually column slices of a larger frame,
    # and writing into them would modify the caller's data and trigger
    # pandas' chained-assignment warning.
    X_train, X_valid, X_test = X_train.copy(), X_valid.copy(), X_test.copy()
    X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
    # NOTE(review): a category that is absent from the training fold is
    # presumably rejected by the encoder here -- confirm for this data.
    X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])
    X_test[object_cols] = ordinal_encoder.transform(X_test[object_cols])
    return X_train, X_valid, X_test

In [None]:
def get_fold_data(fold, return_valid_ids=True, df=df) -> tuple:
    """Split ``df`` into training and validation parts for one fold.

    Rows whose ``kfold`` equals ``fold`` form the validation part; all other
    rows form the training part. Both parts get a fresh 0..n-1 index.

    Returns:
        ``(xtrain, ytrain, xvalid, yvalid)`` restricted to the module-level
        ``useful_features`` for the x frames, followed by the list of
        validation row ids when ``return_valid_ids`` is true.
    """
    is_valid_row = df.kfold == fold
    train_part = df[~is_valid_row].reset_index(drop=True)
    valid_part = df[is_valid_row].reset_index(drop=True)

    valid_ids = valid_part.id.values.tolist()

    fold_data = (
        train_part[useful_features],
        train_part.target,
        valid_part[useful_features],
        valid_part.target,
    )
    if not return_valid_ids:
        return fold_data
    return fold_data + (valid_ids,)

In [None]:
def train_loop(model, model_nickname, verbose=True, lgb=False) -> list:
    """Train ``model`` on each of the 5 folds and save its predictions.

    For every fold the same ``model`` instance is refitted on the training
    part, scored on the validation part and used to predict the test set.
    Two CSV files are written into ``VALID_PREDS_DIR``:

    * ``{model_nickname}_valid_pred.csv`` -- out-of-fold prediction per
      training row id;
    * ``{model_nickname}_test_pred.csv`` -- test prediction averaged over
      the folds, keyed by the ids of ``sample_submission``.

    Args:
        model: Estimator handed to ``fit_predict``.
        model_nickname: Prefix for the output file and column names.
        verbose: Print the per-fold RMSE and the mean/std summary.
        lgb: Forwarded to ``fit_predict`` to select the LightGBM fit path.

    Returns:
        The list of per-fold validation RMSE values.
    """
    final_test_predictions = []
    final_valid_predictions = {}
    scores = []
    
    for fold in range(5):
        xtrain, ytrain, xvalid, yvalid, valid_ids = get_fold_data(fold)
        xtest = df_test.copy()
        
        xtrain, xvalid, xtest = ordinal_encode(xtrain, xvalid, xtest)
        
        preds_valid, test_preds, rmse = fit_predict(model, xtrain, ytrain, xvalid, yvalid, xtest, lgb=lgb)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
        
        if verbose: print(f"Fold: {fold}, RMSE: {rmse}")
        scores.append(rmse)
        
    if verbose : print(f"SCORE: mean {np.mean(scores)} | std {np.std(scores)}")
    final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    final_valid_predictions.columns = ["id", f"{model_nickname}_valid_pred"]
    final_valid_predictions.to_csv(f"{VALID_PREDS_DIR}{model_nickname}_valid_pred.csv", index=False)

    # Build a fresh frame instead of renaming the columns of the shared
    # ``sample_submission``: once its "target" column has been renamed, a
    # later ``sample_submission.target = ...`` no longer updates any column,
    # so every following model would save the first model's predictions.
    test_frame = pd.DataFrame({
        "id": sample_submission["id"].to_numpy(),
        f"{model_nickname}_test_pred": np.mean(np.column_stack(final_test_predictions), axis=1),
    })
    # Test predictions also go to VALID_PREDS_DIR; the blending cell reads
    # them back from that directory.
    test_frame.to_csv(f"{VALID_PREDS_DIR}{model_nickname}_test_pred.csv", index=False)
    
    return scores

In [None]:
# paths for data
# Output directories (relative to the working directory) for the per-model
# prediction files. The trailing slash matters: file names are appended to
# these strings directly.
VALID_PREDS_DIR = 'valid_preds/'
TEST_PREDS_DIR = 'test_preds/'

# exist_ok=True keeps the cell re-runnable; a plain os.mkdir raises
# FileExistsError once the directories are there.
os.makedirs(VALID_PREDS_DIR, exist_ok=True)
os.makedirs(TEST_PREDS_DIR, exist_ok=True)

In [None]:
# Base models to train, keyed by the nickname used in their output files.
models = {
    nickname: XGBRegressor(**params)
    for nickname, params in (
        ('xgb1', xgb_1_params),
        ('xgb2', xgb_2_params),
        ('xgb3', xgb_3_params),
        ('xgb4', xgb_4_params),
    )
}
models['cat'] = CatBoostRegressor(**catb_1_params)
# The LightGBM model ('lgb', built from lgbm_params) is left out of this run.

In [None]:
# train_loop(XGBRegressor(**xgb_1_params), model_nickname='xgb1')
# train_loop(LGBMRegressor(**lgbm_params), 'lgb', lgb=True)

In [None]:
# Run the 5-fold training loop for every registered model; nicknames that
# contain 'lgb' use the LightGBM-specific fit path.
for nickname in models:
    model = models[nickname]
    print("training " + nickname)
    train_loop(model, model_nickname=nickname, lgb='lgb' in nickname)

In [None]:
# Reload the raw inputs, then attach every base model's saved predictions:
# out-of-fold predictions to the training frame and fold-averaged test
# predictions to the test frame, joined on the row id.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# os.listdir returns entries in arbitrary order. Sorting makes the column
# order reproducible and identical (by model nickname) in df and df_test.
for pred_file in sorted(os.listdir(VALID_PREDS_DIR)):
    if pred_file.endswith('_valid_pred.csv'):
        df1 = pd.read_csv(VALID_PREDS_DIR + pred_file)
        df = df.merge(df1, on="id", how="left")

# The test predictions were written to VALID_PREDS_DIR as well.
for test_file in sorted(os.listdir(VALID_PREDS_DIR)):
    if test_file.endswith('_test_pred.csv'):
        df_test1 = pd.read_csv(VALID_PREDS_DIR + test_file)
        df_test = df_test.merge(df_test1, on="id", how="left")

df.head()

In [None]:
# Preview the test frame with the base models' prediction columns merged in.
df_test.head()

In [None]:
# Show the training-frame columns, including the merged prediction columns.
df.columns

In [None]:
# Names of all float columns of the training frame.
alp=list(df.select_dtypes(include=['float']).columns)
# df_copy keeps the frame as it was before any difference column is added.
df_copy = df.copy()
df = df_copy.copy()
def diff(columns):
    """Add pairwise difference columns to the module-level ``df`` in place.

    For every ordered pair of distinct names in ``columns`` a column
    ``diffbetween_<col1>_and_<col2>`` holding ``df[col1] - df[col2]`` is
    added. A column is never paired with itself, because that difference
    is zero in every row.
    """
    for col1 in columns:
        for col2 in columns:
            if col1 == col2:
                continue
            df['diffbetween_' + col1 + "_and_" + col2] = df[col1] - df[col2]

print(alp)
diff(alp)

In [None]:
# Names of all float columns of the training frame.
alp=list(df.select_dtypes(include=['float']).columns)
# df_copy keeps the frame as it was before any difference column is added.
df_copy=df.copy()
def diff(columns):
    """Add pairwise difference columns to the module-level ``df`` in place.

    For every ordered pair of distinct names in ``columns`` a column
    ``diffbetween<col1>and<col2>`` holding ``df[col1] - df[col2]`` is added.
    The number of columns about to be created is printed once up front.
    """
    columns = list(columns)
    print(len(columns) * (len(columns) - 1))
    for col1 in columns:
        for col2 in columns:
            if col1 == col2:
                # the difference of a column with itself is all zeros
                continue
            # Store the computed difference itself; assigning a lambda here
            # would fill the column with function objects instead.
            df['diffbetween' + col1 + "and" + col2] = df[col1] - df[col2]
print(alp)
diff(alp)

In [None]:
# `difference` is not defined anywhere in this notebook, so calling it raises
# NameError; compute the element-wise difference directly instead.
# NOTE(review): confirm that a plain subtraction is what was intended.
a1 = df.cont0 - df.cont1

In [None]:
# Absolute differences between selected pairs of continuous features.
# Each Series is printed for inspection as soon as it is computed.
a1 = (df.cont0 - df.cont1).abs()
print(a1)
a2 = (df.cont2 - df.cont3).abs()
print(a2)
a3 = (df.cont4 - df.cont5).abs()
print(a3)
a4 = (df.cont6 - df.cont7).abs()
print(a4)
a5 = (df.cont8 - df.cont9).abs()
print(a5)
a6 = (df.cont9 - df.cont10).abs()
print(a6)
a7 = (df.cont11 - df.cont12).abs()
print(a7)
a8 = (df.cont0 - df.cont13).abs()
print(a8)

In [None]:
# Append the eight absolute-difference Series to the training frame as
# columns named after diff_col. pd.merge joins exactly two objects (its
# remaining positional parameters are options such as `how` and `on`), so
# the Series are combined column-wise with pd.concat instead; rows are
# aligned on the index.
diff_col = ['a1','a2','a3','a4','a5','a6','a7','a8']
df_merge = pd.concat(
    [df] + [
        series.rename(name)
        for name, series in zip(diff_col, (a1, a2, a3, a4, a5, a6, a7, a8))
    ],
    axis=1,
)

In [None]:
# Blending: fit a second-level XGBRegressor on the base models'
# out-of-fold predictions and use it to predict the test set.
#
# Features are selected by exact suffix and derived from the test columns,
# so that (a) train and test features refer to the same models in the same
# order and (b) other columns that merely contain 'pred' in their name
# (e.g. engineered difference columns) are not picked up.
useful_features_test = [c for c in df_test.columns if c.endswith("_test_pred")]
useful_features = [c[: -len("_test_pred")] + "_valid_pred" for c in useful_features_test]
missing_features = [c for c in useful_features if c not in df.columns]
if missing_features:
    raise KeyError(f"no out-of-fold predictions found for: {missing_features}")
df_test = df_test[useful_features_test]

final_predictions = []
scores = []
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()
    # Give the test features the names the model is trained on; df_test
    # itself keeps its "*_test_pred" names for the later cells.
    xtest.columns = useful_features

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    model = XGBRegressor()
    model.fit(xtrain, ytrain)
    
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # Explicit square root instead of ``squared=False``, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Preview the test frame, now reduced to the base models' prediction columns.
df_test.head()

In [None]:
# One NumPy array per column of df_test (one per base model's test
# predictions), displayed as the cell output.
test = [np.array(df_test[col]) for col in df_test.columns]

test

In [None]:
# submission_test.csv: plain average of the base models' test predictions.
base_model_matrix = np.column_stack(test)
sample_submission.target = base_model_matrix.mean(axis=1)
sample_submission.to_csv("submission_test.csv", index=False)

# submission.csv: blender predictions averaged over the 5 folds.
blender_matrix = np.column_stack(final_predictions)
sample_submission.target = blender_matrix.mean(axis=1)
sample_submission.to_csv("submission.csv", index=False)