A quick extension. It was reported on the forums that the first 10k rows of the train set have more missing values (likely some part of the data source is missing for these?). Removing them from the comparison might be useful. Thanks to Gerard Toonstra for pointing this out in this comment: https://www.kaggle.com/c/sberbank-russian-housing-market/discussion/32312#179239

The second thing that seems interesting is to see whether the first 35% of the test set can be differentiated from the last 65%, which correspond to the public and private leaderboard splits, respectively.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.metrics import roc_auc_score

from subprocess import check_output
# Show which files are present in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

In [None]:
# Read the raw training table and keep its id column aside.
# Most of the minimal preprocessing applied in the following cells is
# taken from this kernel:
# https://www.kaggle.com/bguberfain/naive-xgb-lb-0-317
xtrain = pd.read_csv(filepath_or_buffer='../input/train.csv')
id_train = xtrain.loc[:, 'id']

In [None]:
# Display the (rows, columns) dimensions of the training table just loaded.
xtrain.shape

In [None]:
# Plot, row by row, whether num_room is missing, to see where the gaps
# sit along the table.
num_room_missing = xtrain['num_room'].isna()
num_room_missing.plot()

In [None]:
# Lazy check: tally missing vs. present num_room values for the rows
# labelled 11000 and onward.
xtrain['num_room'].loc[11000:].isna().value_counts()

In [None]:
# Discard the first 11000 rows (the early part of the train set reported
# to have more missing values) and keep the rest.
# The explicit .copy() makes xtrain an independent frame: the in-place
# drop/fillna and the column assignment applied to it later would otherwise
# operate on a slice of the loaded table, which triggers pandas'
# SettingWithCopyWarning and has ambiguous view-vs-copy semantics.
xtrain = xtrain.iloc[11000:, :].copy()

In [None]:
# Keep the timestamp and the target aside before they are removed from
# the feature table.
time_train, ytrain = xtrain['timestamp'], xtrain['price_doc']

# Strip the identifier, timestamp and target columns, then mark every
# remaining missing value with the sentinel -1.
xtrain.drop(columns=['id', 'timestamp', 'price_doc'], inplace=True)
xtrain.fillna(value=-1, inplace=True)

In [None]:
# Read the raw test table and keep its id and timestamp columns aside.
xtest = pd.read_csv(filepath_or_buffer='../input/test.csv')
id_test, time_test = xtest['id'], xtest['timestamp']

In [None]:
# Total number of missing cells in the test table (author's note: there
# are still nulls in the test set).
nulls_per_column = xtest.isna().sum()
nulls_per_column.sum()

In [None]:
# Impute the test table with the same -1 sentinel used for the train
# table, then remove the identifier and timestamp columns.
xtest.fillna(value=-1, inplace=True)
xtest.drop(columns=['id', 'timestamp'], inplace=True)

In [None]:
# Tag every row with its origin (1 = train, 0 = test) and stack the two
# tables on top of each other.
xtrain['istrain'] = 1
xtest['istrain'] = 0
xdat = pd.concat([xtrain, xtest])

# Split the columns by dtype so the object-typed ones can be turned into
# integer codes.
df_numeric = xdat.select_dtypes(exclude='object')
df_obj = xdat.select_dtypes(include='object').copy()

# pd.factorize returns (codes, uniques); only the integer codes are kept.
for c in df_obj.columns:
    df_obj[c] = pd.factorize(df_obj[c])[0]

# Put the numeric and the encoded columns back side by side, then pull the
# origin flag out of the features to serve as the classification target.
xdat = pd.concat([df_numeric, df_obj], axis=1)
y = xdat.pop('istrain')

Define the cross-validation split and the model (xgboost, what else? :-)

In [None]:
# Five shuffled, label-stratified folds; the fixed random_state keeps the
# fold assignment repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=44)

# Settings for the binary train-vs-test classifier.
# NOTE(review): 'silent' and 'seed' are older xgboost parameter names —
# confirm they are still accepted by the installed xgboost version.
xgb_params = dict(
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='binary:logistic',
    silent=1,
    n_estimators=100,
    gamma=1,
    min_child_weight=4,
)
clf = xgb.XGBClassifier(seed=10, **xgb_params)

Calculate the AUC for each fold

In [None]:
# For each fold: fit on the remaining folds, use the held-out fold as the
# early-stopping evaluation set, and print the AUC on that same fold.
for train_index, test_index in skf.split(xdat, y):
    x0 = xdat.iloc[train_index]
    y0 = y.iloc[train_index]
    x1 = xdat.iloc[test_index]
    y1 = y.iloc[test_index]
    print(x0.shape)

    clf.fit(
        x0, y0,
        eval_set=[(x1, y1)],
        eval_metric='logloss',
        early_stopping_rounds=10,
        verbose=False,
    )

    # Second column of predict_proba: probability of the label-1 class.
    prval = clf.predict_proba(x1)[:, 1]
    print(roc_auc_score(y1, prval))

Compared to the full train set, there is only a small change in AUC.

In [None]:
# Row position that separates the first 35% of the test table from the rest.
split_ind = int(0.35 * len(xtest))

In [None]:
# I have not verified that the public/private split is time based. It should be though.
# Rows before split_ind stand in for the public part, the rest for the
# private part. Each half is an explicit copy so that later column
# assignments on either frame cannot write through to (or warn about)
# the underlying xtest table.
public_test = xtest.iloc[:split_ind, :].copy()
private_test = xtest.iloc[split_ind:, :].copy()

In [None]:
# Compare the train rows against the public part of the test set.
xdat = pd.concat([xtrain, public_test])

# Split the columns by dtype so the object-typed ones can be turned into
# integer codes.
df_numeric = xdat.select_dtypes(exclude='object')
df_obj = xdat.select_dtypes(include='object').copy()

# pd.factorize returns (codes, uniques); only the integer codes are kept.
for c in df_obj.columns:
    df_obj[c] = pd.factorize(df_obj[c])[0]

# Reassemble the feature table and pull the origin flag out as the target.
xdat = pd.concat([df_numeric, df_obj], axis=1)
y = xdat.pop('istrain')

# For each fold: fit on the remaining folds, early-stop on the held-out
# fold, and print the AUC on that same fold.
for train_index, test_index in skf.split(xdat, y):
    x0 = xdat.iloc[train_index]
    y0 = y.iloc[train_index]
    x1 = xdat.iloc[test_index]
    y1 = y.iloc[test_index]
    print(x0.shape)

    clf.fit(
        x0, y0,
        eval_set=[(x1, y1)],
        eval_metric='logloss',
        early_stopping_rounds=10,
        verbose=False,
    )

    # Second column of predict_proba: probability of the label-1 class.
    prval = clf.predict_proba(x1)[:, 1]
    print(roc_auc_score(y1, prval))



In [None]:
# Now compare the train rows against the private part of the test set.
xdat = pd.concat([xtrain, private_test])

# Split the columns by dtype so the object-typed ones can be turned into
# integer codes.
df_numeric = xdat.select_dtypes(exclude='object')
df_obj = xdat.select_dtypes(include='object').copy()

# pd.factorize returns (codes, uniques); only the integer codes are kept.
for c in df_obj.columns:
    df_obj[c] = pd.factorize(df_obj[c])[0]

# Reassemble the feature table and pull the origin flag out as the target.
xdat = pd.concat([df_numeric, df_obj], axis=1)
y = xdat.pop('istrain')

# For each fold: fit on the remaining folds, early-stop on the held-out
# fold, and print the AUC on that same fold.
for train_index, test_index in skf.split(xdat, y):
    x0 = xdat.iloc[train_index]
    y0 = y.iloc[train_index]
    x1 = xdat.iloc[test_index]
    y1 = y.iloc[test_index]
    print(x0.shape)

    clf.fit(
        x0, y0,
        eval_set=[(x1, y1)],
        eval_metric='logloss',
        early_stopping_rounds=10,
        verbose=False,
    )

    # Second column of predict_proba: probability of the label-1 class.
    prval = clf.predict_proba(x1)[:, 1]
    print(roc_auc_score(y1, prval))

In [None]:
# Reuse the istrain flag as the public-vs-private label: public rows get 1,
# private rows keep the 0 they were given as test rows.
# assign() builds a new frame instead of setting a column on a slice of
# xtest, so the underlying test table is never modified and pandas does not
# emit a SettingWithCopyWarning.
public_test = public_test.assign(istrain=1)

# compare public and private test
xdat = pd.concat([public_test, private_test], axis=0)

# Split the columns by dtype so the object-typed ones can be turned into
# integer codes.
df_numeric = xdat.select_dtypes(exclude=['object'])
df_obj = xdat.select_dtypes(include=['object']).copy()

# pd.factorize returns (codes, uniques); only the integer codes are kept.
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]

# Reassemble the feature table and pull the label out of the features.
xdat = pd.concat([df_numeric, df_obj], axis=1)
y = xdat.pop('istrain')

# For each fold: fit on the remaining folds, early-stop on the held-out
# fold, and print the AUC on that same fold.
for train_index, test_index in skf.split(xdat, y):
    x0, x1 = xdat.iloc[train_index], xdat.iloc[test_index]
    y0, y1 = y.iloc[train_index], y.iloc[test_index]
    print(x0.shape)
    clf.fit(x0, y0, eval_set=[(x1, y1)],
            eval_metric='logloss', verbose=False, early_stopping_rounds=10)

    # Second column of predict_proba: probability of the label-1 class.
    prval = clf.predict_proba(x1)[:, 1]
    print(roc_auc_score(y1, prval))



Some of the variation is very likely accounted for by differences in the size of the datasets (and the content of the folds) in each case. Checking against similar-sized subsets of the full train set would be a good additional validation. Replicating with additional seeds would also be helpful.