In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


# Read the three competition CSVs in one pass: the training set (the later
# cells rely on its `kfold` column), the test set, and the submission template.
data_train, data_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30days-folds/train_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)


In [None]:
# Model inputs: every training column except the row id, the label and the
# fold marker.
useful_features = [c for c in data_train.columns if c not in ('id', 'target', 'kfold')]
# Columns whose name contains 'cat' are the ones passed through the ordinal
# encoder in the training loop.
obj_cols = [col for col in useful_features if 'cat' in col]
# Take an explicit copy: the training loop assigns encoded columns into X_test,
# and writing into a bare column selection of data_test triggers pandas'
# SettingWithCopyWarning.
X_test = data_test[useful_features].copy()

In [None]:
# Train one XGBoost regressor per cross-validation fold and keep each model's
# test-set predictions so they can be averaged afterwards.
final_predictions = []
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation;
    # all remaining rows are used for fitting.
    train_rows = data_train[data_train.kfold != fold].reset_index(drop=True)
    valid_rows = data_train[data_train.kfold == fold].reset_index(drop=True)

    y_train = train_rows.target
    y_valid = valid_rows.target

    # Explicit copies: the encoded columns are assigned back into these frames,
    # and writing into a bare column selection raises SettingWithCopyWarning.
    # X_test is rebuilt from the raw test data every fold so each fold's
    # encoder always sees the original category labels.
    X_train = train_rows[useful_features].copy()
    X_valid = valid_rows[useful_features].copy()
    X_test = data_test[useful_features].copy()

    print("encoding")
    # Fit the encoder on the training part only, then reuse the same mapping
    # for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    X_train[obj_cols] = ordinal_encoder.fit_transform(X_train[obj_cols])
    X_valid[obj_cols] = ordinal_encoder.transform(X_valid[obj_cols])
    X_test[obj_cols] = ordinal_encoder.transform(X_test[obj_cols])

    print("training")
    # The fold index doubles as the seed, so every fold's model is seeded
    # differently but reproducibly.
    model = XGBRegressor(random_state=fold, n_jobs=4)
    model.fit(X_train, y_train)
    predictions_valid = model.predict(X_valid)
    test_predictions = model.predict(X_test)
    final_predictions.append(test_predictions)

    # The reported metric is the *root* mean squared error. It is computed with
    # np.sqrt rather than `squared=False`, which newer scikit-learn releases
    # deprecate and then remove.
    rmse = np.sqrt(mean_squared_error(y_valid, predictions_valid))
    print("Root Mean Squared Error :", rmse)

In [None]:
# Average the per-fold test predictions row by row: column_stack puts one
# column per fold, and the mean over axis 1 collapses them to one value per row.
preds = np.mean(np.column_stack(final_predictions), axis = 1)
# Bracket assignment always writes a DataFrame column; attribute assignment
# (`sample_submission.target = ...`) only does so when the column already
# exists and otherwise just sets a Python attribute that to_csv ignores.
sample_submission["target"] = preds
sample_submission.to_csv('submission.csv', index = False)