In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import lightgbm as lgbm

In [None]:
# Base tables: training data with fold labels, the test set and the sample submission
df = pd.read_csv("../input/train-folds-5/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Directories holding the predictions saved by the base-level models
STACK_DIR = "../input/1st-layer-model-stacking"
LINEAR_DIR = "../input/ridge-and-lasso-preds-30-days-of-ml"


def load_predictions(path, pred_col):
    """Read a two-column prediction CSV and name its columns ``id`` and ``pred_col``.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    pred_col : str
        Name given to the second (prediction) column.

    Returns
    -------
    pandas.DataFrame
        The file contents with columns ``["id", pred_col]``.
    """
    preds = pd.read_csv(path)
    preds.columns = ["id", pred_col]
    return preds


# Predictions of the base-level models on the training rows
xgb_df = load_predictions(f"{STACK_DIR}/xgb_train_pred (1).csv", "xgbpred")
lgb_df = load_predictions(f"{STACK_DIR}/lgb_train_pred (1).csv", "lgbpred")
gbr_df = load_predictions(f"{STACK_DIR}/gbr_train_pred (1).csv", "gbrpred")
rf_df = load_predictions(f"{STACK_DIR}/rf_train_pred (2).csv", "rfpred")
lass_df = load_predictions(f"{LINEAR_DIR}/lass_train_pred.csv", "lasspred")
ridge_df = load_predictions(f"{LINEAR_DIR}/ridge_train_pred.csv", "ridgepred")

# Predictions of the base-level models on the test rows
xgb_df_test = load_predictions(f"{STACK_DIR}/xgb_test_pred (1).csv", "xgbpred")
rf_df_test = load_predictions(f"{STACK_DIR}/rf_test_pred (2).csv", "rfpred")
lgb_df_test = load_predictions(f"{STACK_DIR}/lgb_test_pred (1).csv", "lgbpred")
gbr_df_test = load_predictions(f"{STACK_DIR}/gbr_test_pred (2).csv", "gbrpred")
lass_df_test = load_predictions(f"{LINEAR_DIR}/lass_test_pred.csv", "lasspred")
# Fixed: this previously loaded ridge_train_pred.csv, i.e. the train-set predictions,
# into the test frame.
ridge_df_test = load_predictions(f"{LINEAR_DIR}/ridge_test_pred.csv", "ridgepred")

# Attach every prediction column to its frame, matching rows on "id"
for preds in (xgb_df, lgb_df, rf_df, gbr_df, lass_df, ridge_df):
    df = df.merge(preds, on="id", how="left")

for preds in (xgb_df_test, lgb_df_test, rf_df_test, gbr_df_test, lass_df_test, ridge_df_test):
    df_test = df_test.merge(preds, on="id", how="left")


In [None]:
# Preview the first five rows of the training frame
df.head(n=5)

In [None]:
# Preview the first five rows of the test frame
df_test.head(n=5)

In [None]:
# List the columns of df that contain at least one missing value
# (the author's run reported that none were found).
has_missing = df.isnull().any()
df.columns[has_missing]

In [None]:
# Histogram of each variable: 10 bins per plot on a 10x10 inch figure
df_hist = df.hist(figsize=(10, 10), bins=10)

In [None]:
# Heatmap of the pairwise correlations between the numeric variables.
# The frame also holds text (object) columns; they are dropped explicitly because
# DataFrame.corr() on mixed dtypes fails on recent pandas versions.
numeric_df = df.select_dtypes(include=["number", "bool"])
fig, axes = plt.subplots(1, 1, figsize=(16, 14))
# draw on the axes created above rather than relying on the implicit current axes
sns.heatmap(numeric_df.corr(), annot=True, cmap="RdYlGn", ax=axes)
plt.show()

In [None]:
# Keep only the object-dtype (categorical) variables, as an independent copy
df_cat = df.select_dtypes(include=['object']).copy()

# Number of distinct values taken by each categorical variable
df_cat.nunique()

In [None]:
# Frequency of every distinct value within each categorical variable,
# reshaped into one long (variable, value) -> count series
value_frequencies = df_cat.apply(pd.Series.value_counts)
value_frequencies.T.stack()


In [None]:
# Names of the first-layer prediction columns; the test frame is reduced to them
kept_features = [f"{model_name}pred" for model_name in ("xgb", "lgb", "rf", "gbr", "lass", "ridge")]
df_test = df_test.loc[:, kept_features]

In [None]:
# Keyword arguments passed to XGBRegressor in the XGBoost cell
xgb_params = dict(
    random_state=1,
    booster='gbtree',
    n_estimators=10000,
    learning_rate=0.03628302216953097,
    reg_lambda=0.0008746338866473539,
    reg_alpha=23.13181079976304,
    subsample=0.7875490025178415,
    colsample_bytree=0.11807135201147481,
    max_depth=3,
)

In [None]:
# Reload the raw tables; this discards the prediction columns merged in earlier cells
df = pd.read_csv("../input/train-folds-5/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the identifier, the fold label and the target is a model input
non_feature_cols = {'id', 'kfold', 'target'}
kept_features = [name for name in df.columns if name not in non_feature_cols]
# Columns whose name contains 'cat' are ordinal-encoded by the model cells
object_cols = [name for name in kept_features if 'cat' in name]
# Columns whose name contains 'cont' are standardised by the model cells
numerical_cols = [name for name in kept_features if 'cont' in name]
df_test = df_test[kept_features]

In [None]:
# Building an XGBoost model for stacking.
# Each of the 5 folds is held out once: the model is fitted on the other folds,
# early-stopped on the held-out fold, and then predicts both the held-out rows
# (out-of-fold predictions) and the test set.

xgb_final_test_predictions = []
xgb_final_valid_predictions = {}
scores = []
for fold in range(5):
    # rows outside the current fold train the model, rows inside it validate it
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    # ids of the held-out rows; they key the out-of-fold predictions
    valid_id = valid_rows.id.values.tolist()
    y_train = train_rows.target
    y_valid = valid_rows.target

    # explicit copies: the transformers below assign into these frames column by column
    x_train = train_rows[kept_features].copy()
    x_valid = valid_rows[kept_features].copy()
    x_test = df_test.copy()

    # ordinal-encode categorical columns: fit on the training rows only,
    # then apply the same mapping to the validation and test rows
    OE = preprocessing.OrdinalEncoder()
    x_train[object_cols] = OE.fit_transform(x_train[object_cols])
    x_valid[object_cols] = OE.transform(x_valid[object_cols])
    x_test[object_cols] = OE.transform(x_test[object_cols])

    # standardise numerical columns with statistics fitted on the training rows only
    scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

    xgb_model = XGBRegressor(**xgb_params, n_jobs=4)
    xgb_model.fit(x_train, y_train, early_stopping_rounds=300, eval_set=[(x_valid, y_valid)], verbose=1000)
    xgb_preds_valid = xgb_model.predict(x_valid)
    xgb_test_preds = xgb_model.predict(x_test)
    xgb_final_test_predictions.append(xgb_test_preds)
    xgb_final_valid_predictions.update(dict(zip(valid_id, xgb_preds_valid)))
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike squared=False
    RMSE = np.sqrt(mean_squared_error(y_valid, xgb_preds_valid))
    print(fold, RMSE)
    scores.append(RMSE)
# mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

# out-of-fold predictions for the training rows, keyed by id
xgb_final_valid_predictions = pd.DataFrame.from_dict(xgb_final_valid_predictions, orient="index").reset_index()
xgb_final_valid_predictions.columns = ["id", "xgbpred_1"]
xgb_final_valid_predictions.to_csv("xgb_train_pred_1.csv", index=False)

# Test predictions averaged over the folds. They are written from a new frame:
# renaming the columns of the shared `submission` frame would remove its "target"
# column, and later `submission.target = ...` assignments would then only set a
# Python attribute instead of updating the data.
xgb_test_frame = pd.DataFrame({
    "id": submission["id"],
    "xgbpred_1": np.mean(np.column_stack(xgb_final_test_predictions), axis=1),
})
xgb_test_frame.to_csv("xgb_test_pred_1.csv", index=False)

In [None]:
# Parameters passed to lightgbm.train in the LightGBM cell
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "n_estimators": 10000,
    "early_stopping_round": 300,
    # Fixed: the key was spelled 'colsample_tree', which is not a LightGBM parameter
    # name, so the tuned column-sampling rate was never applied.
    "colsample_bytree": 0.9966937316093348,
    "learning_rate": 0.23205382167938451,
    "max_depth": 2,
    "reg_alpha": 21.312138571025006,
    "reg_lambda": 4.4379149320083925e-08,
    # NOTE(review): LightGBM documents that row subsampling only takes effect when a
    # bagging frequency is also set; none is set here - confirm this is intended.
    "subsample": 0.21475469764965427,
}

In [None]:
# Building a LightGBM model for stacking.
# Each of the 5 folds is held out once: the model is fitted on the other folds,
# validated on the held-out fold, and then predicts both the held-out rows
# (out-of-fold predictions) and the test set.

lgb_final_test_predictions = []
lgb_final_valid_predictions = {}
scores = []
for fold in range(5):
    # rows outside the current fold train the model, rows inside it validate it
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    # ids of the held-out rows; they key the out-of-fold predictions
    valid_id = valid_rows.id.values.tolist()
    y_train = train_rows.target
    y_valid = valid_rows.target

    # explicit copies: the transformers below assign into these frames column by column
    x_train = train_rows[kept_features].copy()
    x_valid = valid_rows[kept_features].copy()
    x_test = df_test.copy()

    # ordinal-encode categorical columns: fit on the training rows only,
    # then apply the same mapping to the validation and test rows
    OE = preprocessing.OrdinalEncoder()
    x_train[object_cols] = OE.fit_transform(x_train[object_cols])
    x_valid[object_cols] = OE.transform(x_valid[object_cols])
    x_test[object_cols] = OE.transform(x_test[object_cols])

    # standardise numerical columns with statistics fitted on the training rows only
    scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

    lgb_train = lgbm.Dataset(x_train, y_train)
    lgb_valid = lgbm.Dataset(x_valid, y_valid, reference=lgb_train)

    lgb_model = lgbm.train(lgb_params, lgb_train,
                           valid_sets=[lgb_valid],
                           verbose_eval=1000)

    lgb_preds_valid = lgb_model.predict(x_valid)
    lgb_test_preds = lgb_model.predict(x_test)
    lgb_final_test_predictions.append(lgb_test_preds)
    lgb_final_valid_predictions.update(dict(zip(valid_id, lgb_preds_valid)))
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike squared=False
    RMSE = np.sqrt(mean_squared_error(y_valid, lgb_preds_valid))
    print(fold, RMSE)
    scores.append(RMSE)

# mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

# out-of-fold predictions for the training rows, keyed by id
lgb_final_valid_predictions = pd.DataFrame.from_dict(lgb_final_valid_predictions, orient="index").reset_index()
lgb_final_valid_predictions.columns = ["id", "lgbpred_1"]
lgb_final_valid_predictions.to_csv("lgb_train_pred_1.csv", index=False)

# Test predictions averaged over the folds, written from a new frame.
# Fixed: `submission.target = ...` only sets a Python attribute once an earlier cell
# has renamed the "target" column, so the saved file kept that earlier model's
# predictions under the new header.
lgb_test_frame = pd.DataFrame({
    "id": submission["id"],
    "lgbpred_1": np.mean(np.column_stack(lgb_final_test_predictions), axis=1),
})
lgb_test_frame.to_csv("lgb_test_pred_1.csv", index=False)


In [None]:
# Keyword arguments passed to RandomForestRegressor in the random-forest cell
rf_params = dict(
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=None,
    bootstrap=False,
)

In [None]:
# Building a random forest model for stacking.
# Each of the 5 folds is held out once: the model is fitted on the other folds and
# then predicts both the held-out rows (out-of-fold predictions) and the test set.

rf_final_test_predictions = []
rf_final_valid_predictions = {}
scores = []

for fold in range(5):
    # rows outside the current fold train the model, rows inside it validate it
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    # ids of the held-out rows; they key the out-of-fold predictions
    valid_id = valid_rows.id.values.tolist()
    y_train = train_rows.target
    y_valid = valid_rows.target

    # explicit copies: the transformers below assign into these frames column by column
    x_train = train_rows[kept_features].copy()
    x_valid = valid_rows[kept_features].copy()
    x_test = df_test.copy()

    # ordinal-encode categorical columns: fit on the training rows only,
    # then apply the same mapping to the validation and test rows
    OE = preprocessing.OrdinalEncoder()
    x_train[object_cols] = OE.fit_transform(x_train[object_cols])
    x_valid[object_cols] = OE.transform(x_valid[object_cols])
    x_test[object_cols] = OE.transform(x_test[object_cols])

    # standardise numerical columns with statistics fitted on the training rows only
    scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

    rf_model = RandomForestRegressor(**rf_params, random_state=1)
    rf_model.fit(x_train, y_train)
    rf_preds_valid = rf_model.predict(x_valid)
    rf_test_preds = rf_model.predict(x_test)
    rf_final_test_predictions.append(rf_test_preds)
    # map each held-out id to its prediction
    rf_final_valid_predictions.update(dict(zip(valid_id, rf_preds_valid)))
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike squared=False
    RMSE = np.sqrt(mean_squared_error(y_valid, rf_preds_valid))
    print(fold, RMSE)
    scores.append(RMSE)
# mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

# out-of-fold predictions for the training rows, keyed by id
rf_final_valid_predictions = pd.DataFrame.from_dict(rf_final_valid_predictions, orient='index').reset_index()
rf_final_valid_predictions.columns = ['id', 'rfpred_1']
rf_final_valid_predictions.to_csv('rf_train_pred_1.csv', index=False)

# Test predictions averaged over the folds, written from a new frame.
# Fixed: `submission.target = ...` only sets a Python attribute once an earlier cell
# has renamed the "target" column, so the saved file kept that earlier model's
# predictions under the new header.
rf_test_frame = pd.DataFrame({
    'id': submission['id'],
    'rfpred_1': np.mean(np.column_stack(rf_final_test_predictions), axis=1),
})
rf_test_frame.to_csv("rf_test_pred_1.csv", index=False)

In [None]:
# Keyword arguments passed to GradientBoostingRegressor in the gradient-boosting cell
gbr_params = dict(
    learning_rate=0.021030330096493244,
    max_depth=4,
    n_estimators=968,
    subsample=0.579236968137102,
)

In [None]:
# Building a Gradient Boosting Regressor for stacking.
# Each of the 5 folds is held out once: the model is fitted on the other folds and
# then predicts both the held-out rows (out-of-fold predictions) and the test set.

gbr_final_test_predictions = []
gbr_final_valid_predictions = {}
scores = []
for fold in range(5):
    # rows outside the current fold train the model, rows inside it validate it
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    # ids of the held-out rows; they key the out-of-fold predictions
    valid_id = valid_rows.id.values.tolist()
    y_train = train_rows.target
    y_valid = valid_rows.target

    # explicit copies: the transformers below assign into these frames column by column
    x_train = train_rows[kept_features].copy()
    x_valid = valid_rows[kept_features].copy()
    x_test = df_test.copy()

    # ordinal-encode categorical columns: fit on the training rows only,
    # then apply the same mapping to the validation and test rows
    OE = preprocessing.OrdinalEncoder()
    x_train[object_cols] = OE.fit_transform(x_train[object_cols])
    x_valid[object_cols] = OE.transform(x_valid[object_cols])
    x_test[object_cols] = OE.transform(x_test[object_cols])

    # standardise numerical columns with statistics fitted on the training rows only
    scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

    gbr_model = GradientBoostingRegressor(**gbr_params, random_state=1)
    gbr_model.fit(x_train, y_train)

    gbr_preds_valid = gbr_model.predict(x_valid)
    gbr_test_preds = gbr_model.predict(x_test)
    gbr_final_test_predictions.append(gbr_test_preds)
    gbr_final_valid_predictions.update(dict(zip(valid_id, gbr_preds_valid)))
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike squared=False
    RMSE = np.sqrt(mean_squared_error(y_valid, gbr_preds_valid))
    print(fold, RMSE)
    scores.append(RMSE)

# mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

# out-of-fold predictions for the training rows, keyed by id
gbr_final_valid_predictions = pd.DataFrame.from_dict(gbr_final_valid_predictions, orient="index").reset_index()
gbr_final_valid_predictions.columns = ["id", "gbrpred_1"]
gbr_final_valid_predictions.to_csv("gbr_train_pred_1.csv", index=False)

# Test predictions averaged over the folds, written from a new frame.
# Fixed: `submission.target = ...` only sets a Python attribute once an earlier cell
# has renamed the "target" column, so the saved file kept that earlier model's
# predictions under the new header.
gbr_test_frame = pd.DataFrame({
    "id": submission["id"],
    "gbrpred_1": np.mean(np.column_stack(gbr_final_test_predictions), axis=1),
})
gbr_test_frame.to_csv("gbr_test_pred_1.csv", index=False)

In [None]:
# Lasso regression for stacking.
# Each of the 5 folds is held out once: the model is fitted on the other folds and
# then predicts both the held-out rows (out-of-fold predictions) and the test set.
from sklearn.linear_model import Lasso
lass_final_test_predictions = []
lass_final_valid_predictions = {}
scores = []
for fold in range(5):
    # rows outside the current fold train the model, rows inside it validate it
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    # ids of the held-out rows; they key the out-of-fold predictions
    valid_id = valid_rows.id.values.tolist()
    y_train = train_rows.target
    y_valid = valid_rows.target

    # explicit copies: the transformers below assign into these frames column by column
    x_train = train_rows[kept_features].copy()
    x_valid = valid_rows[kept_features].copy()
    x_test = df_test.copy()

    # ordinal-encode categorical columns: fit on the training rows only,
    # then apply the same mapping to the validation and test rows
    OE = preprocessing.OrdinalEncoder()
    x_train[object_cols] = OE.fit_transform(x_train[object_cols])
    x_valid[object_cols] = OE.transform(x_valid[object_cols])
    x_test[object_cols] = OE.transform(x_test[object_cols])

    # standardise numerical columns with statistics fitted on the training rows only
    scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

    # NOTE(review): alpha=0.0 switches the L1 penalty off; scikit-learn's Lasso
    # documentation advises against alpha=0 for numerical reasons - confirm the
    # intended regularisation strength.
    lass_model = Lasso(alpha=0.0, random_state=0)
    lass_model.fit(x_train, y_train)
    lass_preds_valid = lass_model.predict(x_valid)
    lass_test_preds = lass_model.predict(x_test)
    lass_final_test_predictions.append(lass_test_preds)
    lass_final_valid_predictions.update(dict(zip(valid_id, lass_preds_valid)))
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike squared=False
    RMSE = np.sqrt(mean_squared_error(y_valid, lass_preds_valid))
    print(fold, RMSE)
    scores.append(RMSE)

# mean and spread of the per-fold validation RMSE (reported once, after all folds)
print(np.mean(scores), np.std(scores))

# out-of-fold predictions for the training rows, keyed by id
lass_final_valid_predictions = pd.DataFrame.from_dict(lass_final_valid_predictions, orient="index").reset_index()
lass_final_valid_predictions.columns = ["id", "lasspred_1"]
lass_final_valid_predictions.to_csv("lass_train_pred_1.csv", index=False)

# Test predictions averaged over the folds, written from a new frame.
# Fixed: `submission.target = ...` only sets a Python attribute once an earlier cell
# has renamed the "target" column, so the saved file kept that earlier model's
# predictions under the new header.
lass_test_frame = pd.DataFrame({
    "id": submission["id"],
    "lasspred_1": np.mean(np.column_stack(lass_final_test_predictions), axis=1),
})
lass_test_frame.to_csv("lass_test_pred_1.csv", index=False)

In [None]:
# Ridge regression for stacking.
# Each of the 5 folds is held out once: the model is fitted on the other folds and
# then predicts both the held-out rows (out-of-fold predictions) and the test set.
from sklearn.linear_model import Ridge,RidgeCV
ridge_final_test_predictions = []
ridge_final_valid_predictions = {}
scores = []
for fold in range(5):
    # rows outside the current fold train the model, rows inside it validate it
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    # ids of the held-out rows; they key the out-of-fold predictions
    valid_id = valid_rows.id.values.tolist()
    y_train = train_rows.target
    y_valid = valid_rows.target

    # explicit copies: the transformers below assign into these frames column by column
    x_train = train_rows[kept_features].copy()
    x_valid = valid_rows[kept_features].copy()
    x_test = df_test.copy()

    # ordinal-encode categorical columns: fit on the training rows only,
    # then apply the same mapping to the validation and test rows
    OE = preprocessing.OrdinalEncoder()
    x_train[object_cols] = OE.fit_transform(x_train[object_cols])
    x_valid[object_cols] = OE.transform(x_valid[object_cols])
    x_test[object_cols] = OE.transform(x_test[object_cols])

    # standardise numerical columns with statistics fitted on the training rows only
    scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = scaler.transform(x_test[numerical_cols])

    # using Ridge
    # NOTE(review): alpha=0.0 switches the L2 penalty off, so this fit carries no
    # regularisation - confirm the intended regularisation strength.
    ridge_model = Ridge(alpha=0.0, random_state=0)
    ridge_model.fit(x_train, y_train)
    ridge_preds_valid = ridge_model.predict(x_valid)
    ridge_test_preds = ridge_model.predict(x_test)
    ridge_final_test_predictions.append(ridge_test_preds)
    ridge_final_valid_predictions.update(dict(zip(valid_id, ridge_preds_valid)))
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike squared=False
    RMSE = np.sqrt(mean_squared_error(y_valid, ridge_preds_valid))
    print(fold, RMSE)
    scores.append(RMSE)

# mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

# out-of-fold predictions for the training rows, keyed by id
ridge_final_valid_predictions = pd.DataFrame.from_dict(ridge_final_valid_predictions, orient="index").reset_index()
ridge_final_valid_predictions.columns = ["id", "ridgepred_1"]
ridge_final_valid_predictions.to_csv("ridge_train_pred_1.csv", index=False)

# Test predictions averaged over the folds, written from a new frame.
# Fixed: `submission.target = ...` only sets a Python attribute once an earlier cell
# has renamed the "target" column, so the saved file kept that earlier model's
# predictions under the new header.
ridge_test_frame = pd.DataFrame({
    "id": submission["id"],
    "ridgepred_1": np.mean(np.column_stack(ridge_final_test_predictions), axis=1),
})
ridge_test_frame.to_csv("ridge_test_pred_1.csv", index=False)

In [None]:
# Creating submission: reload the base tables and attach the predictions that the
# model cells above saved to disk.
df = pd.read_csv("../input/train-folds-5/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Predictions for the training rows, one file per model
xgb_df = pd.read_csv('xgb_train_pred_1.csv')
lgb_df = pd.read_csv('lgb_train_pred_1.csv')
rf_df = pd.read_csv('rf_train_pred_1.csv')
gbr_df = pd.read_csv('gbr_train_pred_1.csv')
lass_df = pd.read_csv('lass_train_pred_1.csv')
ridge_df = pd.read_csv('ridge_train_pred_1.csv')

# Predictions for the test rows, one file per model
xgb_df_test = pd.read_csv('xgb_test_pred_1.csv')
lgb_df_test = pd.read_csv('lgb_test_pred_1.csv')
rf_df_test = pd.read_csv('rf_test_pred_1.csv')
gbr_df_test = pd.read_csv('gbr_test_pred_1.csv')
lass_df_test = pd.read_csv('lass_test_pred_1.csv')
ridge_df_test = pd.read_csv('ridge_test_pred_1.csv')

# Merge every prediction column into its frame, matching rows on "id"
for train_preds in (xgb_df, lgb_df, rf_df, gbr_df, lass_df, ridge_df):
    df = df.merge(train_preds, on="id", how="left")

for test_preds in (xgb_df_test, lgb_df_test, rf_df_test, gbr_df_test, lass_df_test, ridge_df_test):
    df_test = df_test.merge(test_preds, on='id', how="left")

In [None]:
# Building a Linear Regression model on the saved "*_1" prediction columns.
# For each fold the model is fitted on the other folds, scored on the held-out fold,
# and used to predict the test set; the per-fold test predictions are kept for averaging.

kept_features = ['xgbpred_1', 'lgbpred_1', 'rfpred_1', 'gbrpred_1', 'lasspred_1', 'ridgepred_1']
df_test = df_test[kept_features]
final_predictions = []
scores = []

for fold in range(5):
    # rows outside the current fold train the model, rows inside it validate it
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    x_test = df_test.copy()

    y_train = train_rows.target
    y_valid = valid_rows.target
    x_train = train_rows[kept_features]
    x_valid = valid_rows[kept_features]

    model = LinearRegression()
    model.fit(x_train, y_train)
    test_pred = model.predict(x_test)
    valid_pred = model.predict(x_valid)
    final_predictions.append(test_pred)
    # RMSE as sqrt(MSE): works on every scikit-learn version, unlike squared=False
    RMSE = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(fold, RMSE)
    scores.append(RMSE)
# mean and spread of the per-fold validation RMSE
print(np.mean(scores), np.std(scores))

In [None]:
# Average the per-fold test predictions and write the submission file
preds = np.mean(np.column_stack(final_predictions), axis=1)

# Item assignment always writes the "target" column; attribute assignment
# (`submission.target = ...`) would only set a Python attribute if that column
# were missing, leaving the saved values unchanged.
submission["target"] = preds
submission.to_csv('submission.csv', index=False)