In [1]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# For training the random forest and XGBoost models and scoring their predictions
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the fold-annotated training set, the test set and the sample
# submission file, in that order, from the Kaggle input directory.
df, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30days-folds/train_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [3]:
# Columns that are bookkeeping / label rather than model inputs.
non_feature_cols = {"id", "target", "kfold"}

# Every remaining training column is used as a feature.
useful_features = [name for name in df.columns if name not in non_feature_cols]

# Features whose name contains 'cat' are the ones passed to the ordinal encoder.
object_cols = [name for name in useful_features if 'cat' in name]

# Restrict the test frame to the same feature columns, in the same order.
df_test = df_test[useful_features]

In [4]:
# Cross-validated training: for each fold, fit a random forest and an XGBoost
# regressor on the other folds, blend their predictions with a simple average,
# report the validation RMSE and keep the blended test predictions.
final_predictions = []
for fold in range(5):
    # Rows labelled with the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Take explicit copies of the feature subsets: the encoded columns are
    # assigned back into these frames below, and writing into a derived
    # subset can otherwise trigger pandas' chained-assignment warning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    print(fold, "encoding")
    # Fit the encoder on the training rows only, then reuse the fitted
    # mapping for the validation and test rows.
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    print(fold, "training")
    # Both models are seeded with the fold number.
    rf_model = RandomForestRegressor(random_state=fold, n_jobs=-1)
    rf_model.fit(xtrain, ytrain)
    xgb_model = XGBRegressor(random_state=fold, n_jobs=-1)
    xgb_model.fit(xtrain, ytrain)

    rf_preds_valid = rf_model.predict(xvalid)
    rf_preds_test = rf_model.predict(xtest)

    xgb_preds_valid = xgb_model.predict(xvalid)
    xgb_preds_test = xgb_model.predict(xtest)

    # Equal-weight blend of the two models.
    preds_valid = (rf_preds_valid + xgb_preds_valid) / 2
    preds_test = (rf_preds_test + xgb_preds_test) / 2

    final_predictions.append(preds_test)

    # RMSE computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every version.
    print(fold, np.sqrt(mean_squared_error(yvalid, preds_valid)))

0 encoding
0 training
0 0.7256159896306812
1 encoding
1 training
1 0.724865787292573
2 encoding
2 training
2 0.7263758004558226
3 encoding
3 training
3 0.7265575202706804
4 encoding
4 training
4 0.7257019338736931


In [5]:
# Arrange the per-fold test predictions as columns (one column per fold) and
# average across the columns to get a single prediction per test row.
fold_test_preds = np.column_stack(final_predictions)
preds = fold_test_preds.mean(axis=1)

In [6]:
# Store the predictions in the submission frame and write it out.
# Bracket assignment is used deliberately: attribute-style assignment only
# updates a column that already exists; if it did not, pandas would set a plain
# Python attribute instead and the CSV would silently lack the predictions.
sample_submission["target"] = preds
sample_submission.to_csv('submission.csv', index=False)