In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
#Importing required libraries

from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
#Load the raw training data.
df_train = pd.read_csv("../input/30-days-of-ml/train.csv")

#Every row starts with the placeholder fold -1; the loop below overwrites it
#with the index (0 .. folds_count - 1) of the fold in which the row is held out.
df_train["kfold"] = -1

#Number of cross-validation folds.
folds_count = 5

#Shuffled K-fold splitter with a fixed seed so the assignment is repeatable.
kf = model_selection.KFold(n_splits=folds_count, shuffle=True, random_state=42)

for fold, split in enumerate(kf.split(X=df_train)):
    #split is (train positions, validation positions); only the validation
    #side is needed to label the rows belonging to this fold.
    holdout_rows = split[1]
    df_train.loc[holdout_rows, "kfold"] = fold

#Persist the fold-annotated training data for the later cells.
df_train.to_csv("30days-train_5folds.csv", index=False)

In [None]:
#Load the inputs used by the rest of the notebook.

#Competition files: the test features and the submission template.
df_test, sample_submission = (
    pd.read_csv("../input/30-days-of-ml/test.csv"),
    pd.read_csv("../input/30-days-of-ml/sample_submission.csv"),
)

#Fold-annotated training data written earlier from train.csv (KFold, K=5).
df = pd.read_csv("30days-train_5folds.csv")

In [None]:
#Re-reads the raw train.csv into df_train. This rebinds the name, so the frame
#that an earlier cell extended with the "kfold" column is no longer reachable
#through df_train (the fold-annotated data is available as df instead).
#NOTE(review): df_train is not referenced by the later cells visible here --
#confirm whether this reload is still needed.
df_train = pd.read_csv("../input/30-days-of-ml/train.csv")

In [None]:
#Columns that are bookkeeping or the label rather than model inputs.
non_feature_cols = {"id", "target", "kfold"}

#Model input columns, in the order they appear in df.
required_features = [name for name in df.columns if name not in non_feature_cols]

#Categorical columns are the features whose name contains 'cat'.
cat_cols = [name for name in required_features if 'cat' in name]

#Restrict the test data to the model input columns (same order as training).
df_test = df_test.loc[:, required_features]

In [None]:
#Test-set predictions collected from each fold's model (one array per fold).
final_preds = []

for fold in range(folds_count):
    #Training data: every row that is NOT in the current fold.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    #Validation data: the rows of the current fold.
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    #Fresh copy of the test data, because it is re-encoded with this fold's encoder.
    xtest = df_test.copy()

    #Targets must be taken before the frames are reduced to feature columns.
    ytrain = xtrain.target
    yvalid = xvalid.target

    #Keep only the feature columns. The explicit .copy() makes these frames
    #independent objects, so the in-place column assignments below cannot
    #trigger pandas' SettingWithCopyWarning.
    xtrain = xtrain[required_features].copy()
    xvalid = xvalid[required_features].copy()

    #By looking into the data we are not sure whether to select one-hot encoding or ordinal encoding.
    #We decided to use ordinal encoding for now to transform the categorical features.
    #The encoder is fitted on the training rows only and then applied to validation and test.
    encoder = OrdinalEncoder()

    xtrain[cat_cols] = encoder.fit_transform(xtrain[cat_cols])
    xvalid[cat_cols] = encoder.transform(xvalid[cat_cols])
    xtest[cat_cols] = encoder.transform(xtest[cat_cols])

    #Creating XGBoost model (seeded with the fold number).
    model = XGBRegressor(random_state=fold, n_jobs=4)
    model.fit(xtrain, ytrain)

    #Predicting validation and test data.
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)

    #Adding test predictions into final_preds array.
    final_preds.append(test_preds)

    #Displaying root mean squared error for this fold.
    #The square root is taken explicitly because the `squared=False` argument of
    #mean_squared_error is deprecated and removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, valid_preds))
    print(fold, rmse)
    
    

In [None]:
#Place each fold's test predictions side by side as columns, then average
#across the columns to get one blended prediction per test row.
stacked_preds = np.column_stack(final_preds)
preds = stacked_preds.mean(axis=1)

In [None]:
#Store the blended predictions in the "target" column. Bracket assignment is
#used because attribute-style assignment (sample_submission.target = ...) only
#sets a plain Python attribute, not a column, when the column does not exist.
sample_submission["target"] = preds

#Write the submission file without the DataFrame index.
sample_submission.to_csv("submission.csv", index=False)