In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import warnings
#h20
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

##Sklearn Imports
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#Catboost
from catboost import CatBoostRegressor
#Xgboost
from xgboost import XGBRegressor
import xgboost as xgb
#lightgbm
import lightgbm as lgb


import matplotlib.pyplot as plt
import os
# Walk the read-only Kaggle input directory and print the full path of each file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# Up to 20GB can be written to the current directory (/kaggle/working/); it is preserved as output when a version is created with "Save & Run All"
# Temporary files can also be written to /kaggle/temp/, but they are not saved outside of the current session
sns.set_theme(style="whitegrid")

In [None]:
# Read the competition's training and test CSV files into pandas DataFrames
train_csv_path = "../input/tabular-playground-series-aug-2021/train.csv"
test_csv_path = "../input/tabular-playground-series-aug-2021/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Show the training DataFrame as this cell's output
train

In [None]:
# Show the dtype of every column in the training DataFrame
train.dtypes

In [None]:
# Column names of the training DataFrame as a plain Python list
list(train.columns)

In [None]:
# Keep the test ids aside, then remove the "id" column from both frames.
# The column is dropped by keyword: DataFrame.drop no longer accepts `axis`
# as a positional argument in pandas 2.x, so drop("id", 1) raises a TypeError there.
# Rebinding (instead of inplace=True) leaves the frames read from disk untouched.
test_ids = test["id"].values
train = train.drop(columns="id")
test = test.drop(columns="id")

In [None]:
# Heatmap of the pairwise correlations between the columns of `train`
correlation_matrix = train.corr()
sns.heatmap(correlation_matrix)

In [None]:
# Absolute correlation of every column with "loss", sorted from weakest to strongest.
# Series.abs() is the vectorised equivalent of mapping abs() over each element.
train.corr()["loss"].abs().sort_values()

In [None]:
# Separate features from the "loss" target. The column is dropped by keyword
# because pandas 2.x rejects the positional `axis` form drop("loss", 1).
X, y = train.drop(columns="loss"), train["loss"]

# Hold out a validation set (default test_size), stratified on the target.
# NOTE(review): stratifying on y presumes "loss" takes a limited set of
# discrete values, each occurring at least twice - confirm against the data.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=0)

# Creating Base Models and Checking Their Raw Performance (No Feature Engineering)

In [None]:
# Collects validation RMSE scores together with the names of the models that produced them
Score_dict = dict(scores=[], model_names=[])

In [None]:
def predict(model, model_name):
    """Fit `model` on the training split and report its validation RMSE.

    Reads the notebook-level variables X_train, y_train, X_val and y_val.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X)
        Fitted in place on (X_train, y_train).
    model_name : str
        Label used only in the printed message.

    Returns
    -------
    float
        Root mean squared error of the predictions on (X_val, y_val).
    """
    model.fit(X_train, y_train)
    y_val_predict = model.predict(X_val)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated and then removed in newer scikit-learn
    # releases, whereas sqrt(MSE) gives the same RMSE on every version.
    score = np.sqrt(mean_squared_error(y_val, y_val_predict))
    print("The RMSE for {} is {}".format(model_name, score))
    return score

# Boosting Models

## Gradient Boosting Regression

In [None]:
# Baseline: scikit-learn gradient boosting with default hyper-parameters
GBR = GradientBoostingRegressor()
gbr_rmse = predict(model=GBR, model_name="GBR")
Score_dict["scores"].append(gbr_rmse)
Score_dict["model_names"].append("GBR")

## CatBoost Regressor

In [None]:
# CatBoost regressor configured to train on GPU device 0
CBR = CatBoostRegressor(task_type="GPU", devices='0')
cbr_rmse = predict(model=CBR, model_name="CBR")
Score_dict["scores"].append(cbr_rmse)
Score_dict["model_names"].append("CBR")

## XGBoost

In [None]:
# XGBoost regressor using the GPU histogram tree method, with verbose logging
XGB = XGBRegressor(tree_method='gpu_hist', verbosity=2)
xgb_rmse = predict(model=XGB, model_name="XGB")
Score_dict["scores"].append(xgb_rmse)
Score_dict["model_names"].append("XGB")

## LightGBM

In [None]:
# LightGBM regressor with the plain regression objective
lgbm_regressor = lgb.LGBMRegressor(objective='regression')
lgbm_rmse = predict(model=lgbm_regressor, model_name="lgbm_regressor")
Score_dict["scores"].append(lgbm_rmse)
Score_dict["model_names"].append("lgbm_regressor")

## H2O Modelling - AutoML

In [None]:
# Start (or connect to) the H2O cluster
h2o.init()

# Load the competition training CSV as an H2OFrame.
# Note: from here on `train` refers to an H2OFrame, no longer the pandas DataFrame.
train = h2o.import_file("../input/tabular-playground-series-aug-2021/train.csv")

# Seeded 75% / 25% split into a training part and a validation part
ss = train.split_frame([0.75], seed=0)
train, valid = ss[0], ss[1]

In [None]:
# Response is "loss"; predictors are every other column except the row identifier
y = "loss"
x = train.columns
for excluded_column in (y, "id"):
    x.remove(excluded_column)


# Run AutoML with at most 20 base models.
# NOTE(review): the original comment claimed a 1 hour default runtime cap; per the
# H2O AutoML docs that default appears to apply only when max_models is not set - confirm.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

In [None]:
# Predict on the held-out H2O validation frame with the trained AutoML object
preds = aml.predict(valid)

In [None]:
# Convert the H2O prediction frame to a 1-D numpy array (its first column)
preds = h2o.as_list(preds, use_pandas=True).values[:, 0]

# Ground truth must be the "loss" column of the validation frame. Select it by
# name: taking column 0 of the whole frame picks whatever column comes first in
# the CSV (not the target), which makes the reported RMSE meaningless.
y_val_h20 = h2o.as_list(valid["loss"], use_pandas=True).values[:, 0]

# sqrt(MSE) instead of squared=False, which newer scikit-learn releases removed
auto_ml_score = np.sqrt(mean_squared_error(y_val_h20, preds))
print("The RMSE for Auto ML-h20 is {}".format(auto_ml_score))
Score_dict["scores"].append(auto_ml_score)
Score_dict["model_names"].append("auto_ml")

# Simple Regression Models Performance

## Linear Regression 

In [None]:
# Ordinary least squares wrapped in a single-step pipeline
linear_regression = make_pipeline(LinearRegression())
linear_regression_rmse = predict(model=linear_regression, model_name="linear_regression")
Score_dict["scores"].append(linear_regression_rmse)
Score_dict["model_names"].append("linear_regression")

## Lasso Regression 

In [None]:
# Robust scaling followed by a Lasso regression (alpha=0.0005)
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005))
lasso_rmse = predict(model=lasso, model_name="lasso")
Score_dict["scores"].append(lasso_rmse)
Score_dict["model_names"].append("lasso")

## ElasticNet Regression

In [None]:
# Robust scaling followed by an ElasticNet regression (alpha=0.0005, l1_ratio=0.9)
elastic_net = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9))
elastic_net_rmse = predict(model=elastic_net, model_name="elastic_net")
Score_dict["scores"].append(elastic_net_rmse)
Score_dict["model_names"].append("elastic_net")

# Bagging of Simple Regression Models

In [None]:
def bagging_predictions(estimator):
    """Bag `estimator` over bootstrap samples and predict on the validation set.

    Fits a BaggingRegressor of 10 copies of `estimator` on the notebook-level
    (X_train, y_train) and returns its predictions for X_val.

    Parameters
    ----------
    estimator : scikit-learn regressor
        Base model to be bagged.

    Returns
    -------
    numpy.ndarray
        Predictions of the bagged ensemble for X_val.
    """
    # The base model is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` (and the old name later removed) in
    # scikit-learn, while the first positional slot works on every version.
    regr = BaggingRegressor(estimator,
                            n_estimators=10,
                            max_samples=1.0,
                            bootstrap=True,  # Samples are drawn with replacement
                            n_jobs=-1,
                            random_state=0)
    regr.fit(X_train, y_train)
    return regr.predict(X_val)


# One column of validation predictions per bagged simple regression model
predictions = np.column_stack((bagging_predictions(linear_regression),
                               bagging_predictions(lasso),
                               bagging_predictions(elastic_net)))
print(f"Bagged predictions shape: {predictions.shape}")

# Average the three bagged models row-wise and score the blend.
# sqrt(MSE) replaces squared=False, which newer scikit-learn releases removed.
y_val_predict = np.mean(predictions, axis=1)
bagging_sr_score = np.sqrt(mean_squared_error(y_val, y_val_predict))
print("The RMSE for bagging_sr is {}".format(bagging_sr_score))
Score_dict["scores"].append(bagging_sr_score)
Score_dict["model_names"].append("bagging_sr")

# Stacking Regressor

In [None]:
# Base learners for the stack; the lasso pipeline combines their outputs.
# passthrough=True also feeds the original features to the final estimator.
estimators = [
    ('elastic_net', elastic_net),
    ('xgb_regressor', XGB),
    ('lgbm_regressor', lgbm_regressor),
    ("GBR", GBR),
]
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=lasso,
    cv=5,
    n_jobs=-1,
    passthrough=True,
)
stack_rmse = predict(model=stack, model_name="stack")
Score_dict["scores"].append(stack_rmse)
Score_dict["model_names"].append("stack")

# Final Results

In [None]:
# Tabulate the collected scores: one row per model
df = pd.DataFrame(data=Score_dict)

In [None]:
# Rank the models by validation RMSE, lowest (best) first
df.sort_values("scores")

# Stack is the Winner

In [None]:
# Fit the stack on the entire training data and predict the test set for submission.
# The CSVs are re-read because `train` was reassigned to an H2OFrame earlier in the notebook.
train = pd.read_csv("../input/tabular-playground-series-aug-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-aug-2021/test.csv")

# Columns are dropped by keyword: pandas 2.x rejects the positional `axis`
# form drop([...], 1) used previously.
X, y = train.drop(columns=["loss", "id"]), train["loss"]

estimators = [('elastic_net', elastic_net), ('xgb_regressor', XGB), ('lgbm_regressor', lgbm_regressor), ("GBR", GBR)]
stack = StackingRegressor(estimators=estimators, final_estimator=lasso, cv=5, n_jobs=-1, passthrough=True)
stack.fit(X, y)
# "id" is kept in `test` for the submission file, so it is only removed for prediction
y_test_pred = stack.predict(test.drop(columns="id"))

In [None]:
# Pair every test id with its predicted loss
submission_columns = {"id": test["id"], "loss": y_test_pred}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission to the working directory, without the DataFrame index column
submission.to_csv("submission.csv",index=False)