In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error

# Directory holding the competition CSV files; edit this one line to relocate the data.
DATA_DIR = "../input/tabular-playground-series-aug-2021"

# Training frame, test frame, and the submission template that is filled in
# and written out at the end of the notebook.
df = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Remove pandas' limits on how many columns and rows are rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# (rows, columns) of the training frame and of the test frame.
(df.shape, df_test.shape)

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
df_test.head(n=5)

In [None]:
# Preview the first five rows of the submission template.
sample_submission.head(n=5)

In [None]:
# Number of cross-validation folds; also drives the training loop further down.
n_folds = 10

# Shuffled split with a fixed seed so the fold assignment is reproducible.
kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=0)

# KFold.split yields *positional* row indices. Collect the assignment in a
# plain array and attach it in one step, rather than routing those positions
# through label-based `df.loc`, which is only correct for a default RangeIndex.
fold_ids = np.full(len(df), -1, dtype=np.int64)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X=df)):
    # Each row appears in exactly one validation split; tag it with that fold.
    fold_ids[valid_idx] = fold

df["kfold"] = fold_ids

In [None]:
# Number of rows assigned to each fold label.
df["kfold"].value_counts()

In [None]:
# Every column except the row id, the target and the fold label is a feature.
useful_features = [c for c in df.columns if c not in ("id", "loss", "kfold")]

# The test features are the same for every fold, so select them once up front
# (column selection already returns a new frame; no explicit copy is needed).
xtest = df_test[useful_features]

# One array of test-set predictions per fold.
final_predictions = []

for fold in range(n_folds):
    # Hold out the rows labelled with this fold; train on all the others.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    # Pull the target out before narrowing the frames to feature columns.
    ytrain = xtrain.loss
    yvalid = xvalid.loss

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = BayesianRidge()
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE on the held-out fold. The square root is taken explicitly because
    # the `squared=False` flag of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)

In [None]:
# Lay each fold's test predictions out as one column, then average across the
# columns to get a single blended prediction per test row.
stacked_test_preds = np.column_stack(final_predictions)
preds = stacked_test_preds.mean(axis=1)

In [None]:
# Write the fold-averaged predictions into the submission template. Bracket
# indexing is used because attribute-style assignment only updates a column
# that already exists; otherwise it silently sets a plain Python attribute
# and the CSV would be written without the predictions.
sample_submission["loss"] = preds
sample_submission.to_csv("submission.csv", index=False)