In [1]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

# For training models (random forest, XGBoost) and scoring predictions
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Load the training data (with its fold assignments) and the test data.
train_csv = '../input/30days-folds/train_folds.csv'
test_csv = '../input/30-days-of-ml/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,0
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,1


In [4]:
# Feature columns: every column of `train` except the row id, the label
# and the fold index.
columns = list(
    filter(lambda name: name not in ('id', 'target', 'kfold'), train.columns)
)
# Categorical features are the ones whose name contains 'cat'.
cat_col = list(filter(lambda name: 'cat' in name, columns))
# Keep only the feature columns in the test frame, in the same order as `columns`.
test = test[columns]

In [5]:
# Preview the test frame after restricting it to the feature columns.
test.head()

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
0,B,B,B,C,B,B,A,E,E,I,...,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702
1,A,B,A,C,B,C,A,E,C,H,...,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394
2,B,A,A,A,B,B,A,E,D,K,...,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099
3,B,B,A,C,B,D,A,E,A,N,...,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372
4,B,B,A,C,B,C,A,E,C,F,...,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412


In [6]:
# Keyword arguments for the XGBoost regressor built in the training loop.
xgb_params = dict(
    # fixed settings
    random_state=1,
    n_jobs=4,
    booster='gbtree',
    n_estimators=10000,
    # optimized params
    learning_rate=0.03628302216953097,
    reg_lambda=0.0008746338866473539,
    reg_alpha=23.13181079976304,
    subsample=0.7875490025178415,
    colsample_bytree=0.11807135201147481,
    max_depth=3,
)

In [7]:
# Inspect the rows of `train` whose kfold label is not 0.
train[train.kfold != 0]

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.346010,0.714610,0.540150,0.280682,8.049253,3
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.972260,1
5,7,A,B,A,C,B,D,A,E,G,...,0.658169,0.997473,0.569874,0.960864,0.238050,0.316065,0.731729,0.694719,8.028558,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,B,B,A,A,B,D,A,E,A,...,0.450538,0.934360,1.005077,0.853726,0.422541,1.063463,0.697685,0.506404,7.945605,4
299996,499996,A,B,A,C,B,B,A,E,E,...,0.508502,0.358247,0.257825,0.433525,0.301015,0.268447,0.577055,0.823611,7.326118,3
299997,499997,B,B,A,C,B,C,A,E,G,...,0.372425,0.364936,0.383224,0.551825,0.661007,0.629606,0.714139,0.245732,8.706755,1
299998,499998,A,B,A,C,B,B,A,E,E,...,0.424243,0.382028,0.468819,0.351036,0.288768,0.611169,0.380254,0.332030,7.229569,3


In [8]:
# Cross-validation over the 5 precomputed folds: for each fold, train an
# XGBoost regressor on the other folds, score it on the held-out fold, and
# keep its predictions for the test set.
final_prediction = []  # one array of test-set predictions per fold
scores = []            # validation RMSE per fold
for fold in range(5):
    # Rows labelled with the current fold are held out for validation.
    X_train = train[train.kfold != fold].reset_index(drop=True)
    X_valid = train[train.kfold == fold].reset_index(drop=True)
    X_test = test.copy()

    y_train = X_train.target
    y_valid = X_valid.target

    # Copy the column subsets so the encoded values below are written into
    # independent frames (avoids pandas' chained-assignment warning).
    X_train = X_train[columns].copy()
    X_valid = X_valid[columns].copy()

    # Fit the encoder on the training fold ONLY and reuse that mapping for the
    # validation and test frames. Re-fitting on each frame would assign
    # different integer codes whenever a frame lacks a category, so the model
    # would see inconsistent features. With the default encoder settings, a
    # category that never appears in the training fold raises an error here
    # instead of being silently mis-encoded.
    ordinal_encoder = OrdinalEncoder()
    X_train[cat_col] = ordinal_encoder.fit_transform(X_train[cat_col])
    X_valid[cat_col] = ordinal_encoder.transform(X_valid[cat_col])
    X_test[cat_col] = ordinal_encoder.transform(X_test[cat_col])

    model = XGBRegressor(**xgb_params)
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit() — confirm against the installed
    # version before moving it.
    model.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=1000
    )

    preds_valid = model.predict(X_valid)
    preds_test = model.predict(X_test)
    final_prediction.append(preds_test)
    # Root mean squared error, computed explicitly so it does not rely on the
    # `squared` keyword of mean_squared_error.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))
    scores.append(rmse)
    print(fold , rmse)
print('mean: {} and std: {}'.format(np.mean(scores), np.std(scores)))

[0]	validation_0-rmse:7.50023
[1000]	validation_0-rmse:0.72306
[2000]	validation_0-rmse:0.71902
[3000]	validation_0-rmse:0.71735
[4000]	validation_0-rmse:0.71647
[5000]	validation_0-rmse:0.71603
[6000]	validation_0-rmse:0.71583
[7000]	validation_0-rmse:0.71568
[7846]	validation_0-rmse:0.71565
0 0.715638919876566
[0]	validation_0-rmse:7.49700
[1000]	validation_0-rmse:0.72274
[2000]	validation_0-rmse:0.71867
[3000]	validation_0-rmse:0.71704
[4000]	validation_0-rmse:0.71631
[5000]	validation_0-rmse:0.71596
[6000]	validation_0-rmse:0.71581
[7000]	validation_0-rmse:0.71574
[7130]	validation_0-rmse:0.71574
1 0.7157314328843097
[0]	validation_0-rmse:7.49484
[1000]	validation_0-rmse:0.72454
[2000]	validation_0-rmse:0.72045
[3000]	validation_0-rmse:0.71893
[4000]	validation_0-rmse:0.71814
[5000]	validation_0-rmse:0.71778
[6000]	validation_0-rmse:0.71763
[7000]	validation_0-rmse:0.71757
[7125]	validation_0-rmse:0.71756
2 0.7175556080736123
[0]	validation_0-rmse:7.49707
[1000]	validation_0-rmse:0

In [9]:
# Average the per-fold test predictions: stack them as columns (one column
# per fold) and take the mean of each row.
fold_matrix = np.column_stack(final_prediction)
predictions = fold_matrix.mean(axis=1)


# Write the averaged predictions into the sample submission and save it as CSV.
sample_submission = pd.read_csv('../input/30-days-of-ml/sample_submission.csv')
sample_submission.target = predictions
sample_submission.to_csv("submission.csv", index=False)