In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Read the three input tables in one pass: the training data (which carries a
# KFold column used later to split folds), the test data, and the submission
# template.
df, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/30-days-folds/train_folds.csv',
        '../input/30-days-of-ml/test.csv',
        '../input/30-days-of-ml/sample_submission.csv',
    )
)

In [3]:
# Peek at the first five rows of the submission template to see its layout.
sample_submission.head(n = 5)

Unnamed: 0,id,target
0,0,0.5
1,5,0.5
2,15,0.5
3,16,0.5
4,17,0.5


In [4]:
# Columns that are never fed to the model: row id, label, and fold assignment.
non_feature_cols = {"id", "target", "KFold"}
useful_features = [name for name in df.columns if name not in non_feature_cols]

# Columns whose name contains 'cat' are the ones passed through the ordinal
# encoder later on.
object_cols = [name for name in useful_features if 'cat' in name]

# Restrict the test frame to the model inputs, in the same column order as training.
df_test = df_test.loc[:, useful_features]

In [5]:
# Train one XGBoost regressor per cross-validation fold. Each model predicts
# the full test set; the per-fold predictions are collected in
# `final_predictions` so they can be combined afterwards.
final_predictions = []

for fold in range(5):
    # Rows whose KFold value differs from `fold` train the model; the rows
    # that match it form the validation set.
    xtrain = df[df.KFold != fold].reset_index(drop = True)
    xvalid = df[df.KFold == fold].reset_index(drop = True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Take explicit copies: the encoded columns are written back into these
    # frames below, and assigning into a bare column selection of another
    # frame is what triggers pandas' SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training fold only, then reuse the same mapping
    # for the validation and test frames.
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # The fold index doubles as the random seed, so every fold's model is
    # seeded differently but reproducibly.
    # NOTE(review): newer xgboost / scikit-learn releases are reported to
    # deprecate `early_stopping_rounds` in fit(), `tree_method='gpu_hist'`
    # and `squared=False` -- confirm against the installed versions.
    model = XGBRegressor(learning_rate = 0.01, n_estimators = 1000, random_state = fold, n_jobs = 4, tree_method = 'gpu_hist')
    model.fit(xtrain, ytrain, early_stopping_rounds = 10,
             eval_set = [(xvalid, yvalid)], verbose = False)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # Report the validation RMSE for this fold.
    print(fold, mean_squared_error(yvalid, preds_valid, squared = False))

0 0.7247923811142797
1 0.7241133589849816
2 0.7263524460695452
3 0.7263178053613838
4 0.7255365896297232


In [6]:
# Show the list of test-set prediction arrays collected by the training loop
# (one array appended per fold).
final_predictions

[array([8.069618 , 8.256509 , 8.376792 , ..., 8.334432 , 8.085208 ,
        8.1107025], dtype=float32),
 array([8.050384, 8.285192, 8.368636, ..., 8.376536, 8.109329, 8.078075],
       dtype=float32),
 array([7.9667954, 8.284657 , 8.379866 , ..., 8.353053 , 8.117113 ,
        8.08416  ], dtype=float32),
 array([7.966408, 8.28086 , 8.377582, ..., 8.358804, 8.087908, 8.108522],
       dtype=float32),
 array([8.052264, 8.307722, 8.346991, ..., 8.350653, 8.102838, 8.160831],
       dtype=float32)]

In [7]:
# Toy illustration of np.column_stack: each inner list becomes one column of
# the result, so two lists of four values give a (4, 2) array, and summing
# along axis 1 adds the two lists element by element.
a = [[1, 2, 5, 9], [4, 8, 6, 12]]
b = np.column_stack(a)

print(a)
print(b)
print(b.sum(axis = 1))

[[1, 2, 5, 9], [4, 8, 6, 12]]
[[ 1  4]
 [ 2  8]
 [ 5  6]
 [ 9 12]]
[ 5 10 11 21]


In [8]:
# Sanity check: stacking the per-fold prediction arrays as columns should
# yield one row per test sample and one column per fold.
fold_prediction_matrix = np.column_stack(final_predictions)
fold_prediction_matrix.shape

(200000, 5)

In [9]:
# Blend the fold models: place each fold's predictions in its own column and
# average across the columns to get one value per test row.
predictions_submit = np.column_stack(final_predictions).mean(axis = 1)

In [10]:
# Replace the template's target values with the fold-averaged predictions and
# write the submission file.
# Bracket assignment is used on purpose: attribute-style assignment only
# updates a column that already exists. If 'target' were missing, it would set
# a plain Python attribute and the CSV would be written without predictions.
sample_submission["target"] = predictions_submit
sample_submission.to_csv("submission.csv", index = False)

In [11]:
# Display the submission frame after its target column was overwritten with
# the averaged predictions.
sample_submission

Unnamed: 0,id,target
0,0,8.021094
1,5,8.282988
2,15,8.369974
3,16,8.398341
4,17,8.212493
...,...,...
199995,499987,8.138060
199996,499990,8.356192
199997,499991,8.354695
199998,499994,8.100479
