A naive model

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns # visualization
import xgboost as xgb # GBDT
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor # randomForest, GradientBoosting

from sklearn import preprocessing # categorical values
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from statsmodels.stats.outliers_influence import variance_inflation_factor # VIF

from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from scipy import stats # PCA

import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load the train/test sets and the macro-economic table; `timestamp` is parsed
# as a datetime in all three so it can be used as the join key later on.
df_train = pd.read_csv("train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("macro.csv", parse_dates=['timestamp'])

# Truncate the extreme values in price_doc to the [1st, 99th] percentile range.
# `clip` replaces the former `df['price_doc'].ix[mask] = limit` pattern: `.ix`
# no longer exists in current pandas, and assigning through a chained
# selection is not guaranteed to write back into the DataFrame.
llimit, ulimit = np.percentile(df_train['price_doc'].values, [1, 99])
df_train['price_doc'] = df_train['price_doc'].clip(lower=llimit, upper=ulimit)

In [None]:
# Separate the target and the submission ids from the feature columns.
y_train = df_train['price_doc'].values
id_test = df_test['id']

df_train = df_train.drop(['id', 'price_doc'], axis=1)
df_test = df_test.drop(['id'], axis=1)

# build df_all = (df_train+df_test).join(df_macro)
# The first `num_train` rows of df_all must stay the training rows, in their
# original order, because X_train / X_test are later cut out positionally and
# paired with y_train / id_test.
num_train = len(df_train)
num_test = len(df_test)
df_all = pd.concat([df_train, df_test], ignore_index=True)

# A plain left merge keeps the left frame's row order. `pd.merge_ordered`
# returns rows sorted by the join key instead, which only matches the
# train-then-test layout if the inputs already happen to be sorted by timestamp.
df_all = pd.merge(df_all, df_macro, on='timestamp', how='left')

# Duplicate timestamps in df_macro would multiply rows and silently break the
# positional split, so fail loudly if the row count changed.
assert len(df_all) == num_train + num_test, "macro join changed the number of rows"
print(df_all.shape)

In [None]:
# Calendar components of the transaction date, extracted once and reused below.
# NOTE(review): `Series.dt.weekofyear` is deprecated in pandas >= 1.1 in favour
# of `dt.isocalendar().week`; it is kept to match the pandas version the rest
# of this notebook targets.
timestamp = df_all['timestamp']
year = timestamp.dt.year
month = timestamp.dt.month
week = timestamp.dt.weekofyear

# add month-year count (number of rows sharing the same year+month)
month_year = (month + year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# add week-year count (number of rows sharing the same year+week)
week_year = (week + year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# add year
df_all['year'] = year

# add month of year
df_all['month_of_year'] = month

# add week of year
df_all['week_of_year'] = week

# add day-of-week
df_all['day_of_week'] = timestamp.dt.dayofweek

# age of building
# Values that look like typos are mapped to the intended year.
build_year_typos = {20052009: 2005, 20: 2000, 215: 2015}
# Values that cannot be interpreted as a year are marked missing. The earlier
# code filled them with `df_all['build_year'][0]`, i.e. whatever the first row
# contained; NaN states the "unknown" intent explicitly.
# NOTE(review): this assumes the first row's build_year was missing -- confirm.
build_year_invalid = [0, 1, 3, 71, 4965]

build_year = df_all['build_year'].replace(build_year_typos)
# `mask` returns a new Series (NaN where the condition holds), avoiding the
# chained `df[col][mask] = value` assignment that may not write back.
df_all['build_year'] = build_year.mask(build_year.isin(build_year_invalid))
df_all['age_of_building'] = df_all['year'] - df_all['build_year']

# remove timestamp column (may overfit the model in train)
df_all = df_all.drop(['timestamp'], axis=1)
print(df_all.shape)

In [None]:
# Ratios that are fractions by construction are capped to [0, 1] with `clip`.
# This replaces the former `df[col].ix[mask] = bound` assignments: `.ix` no
# longer exists in current pandas, and writing through a chained selection is
# not guaranteed to reach the DataFrame. NaN ratios are left as NaN.

# floor/max_floor ratio
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)

# num of floor from top
df_all["floor_from_top"] = df_all["max_floor"] - df_all["floor"]

# kitchen-full ratio
kitchen_full = df_all['kitch_sq'] / df_all['full_sq'].astype(float)
df_all['kitchen_full'] = kitchen_full.clip(lower=0, upper=1)

# kitchen-living ratio
kitchen_living = df_all['kitch_sq'] / df_all['life_sq'].astype(float)
df_all['kitchen_living'] = kitchen_living.clip(lower=0, upper=1)

# add living ratio
living_ratio = df_all['life_sq'] / df_all['full_sq']
df_all['living_ratio'] = living_ratio.clip(lower=0, upper=1)

# add non-living area
df_all['non_living'] = df_all['full_sq'] - df_all['life_sq']

# add living area per room
df_all['living_room'] = df_all['life_sq'] / df_all['num_room']

# apartment condition: 33 is recoded to 3
df_all['state'] = df_all['state'].replace(33, 3)

# add preschool ratio
df_all["ratio_preschool"] = df_all["children_preschool"] / df_all["preschool_quota"].astype("float")

# add school ratio
df_all["ratio_school"] = df_all["children_school"] / df_all["school_quota"].astype("float")

print(df_all.shape)

In [None]:
# Inspect how many numerical vs. categorical (object) columns df_all holds.
df_all_dtype = df_all.dtypes                  # per-column dtype
df_all_dtype_dict = df_all_dtype.to_dict()    # {column name: dtype}
display_nvar = df_all.shape[1]                # total number of columns
# Last expression of the cell: number of columns per dtype.
df_all_dtype.value_counts()

In [None]:
# deal with categorical values
# Every object-typed column except 'sub_area' is replaced by integer codes.
# Values are cast to str first, so missing values are encoded as their own
# string category rather than raising inside the encoder.
for f in df_all.columns:
    if df_all[f].dtype == 'object' and f not in ['sub_area']:
        print(f) # there should be 18 categorical variables
        lbl = preprocessing.LabelEncoder()
        # Fit and encode in one pass. The previous version fitted on the
        # column's values concatenated with themselves, which produces the
        # same classes while building an unnecessary double-length list.
        df_all[f] = lbl.fit_transform(df_all[f].values.astype('str'))

In [None]:
# Turn the feature frame into plain arrays for the learners.
# 'sub_area' is removed from df_all in place before the conversion.
del df_all['sub_area']

X_all = df_all.values
print(X_all.shape)

# The first num_train rows are the training rows, the remainder the test rows.
X_train, X_test = X_all[:num_train], X_all[num_train:]

# Column labels, kept for naming the features passed to xgboost.
df_columns = df_all.columns

In [None]:
# use VIF

class ReduceVIF(BaseEstimator, TransformerMixin):
    """Remove multicollinear columns by iteratively dropping the highest-VIF one.

    Parameters
    ----------
    thresh : float, default 5.0
        Columns are dropped, one at a time, while the largest variance
        inflation factor exceeds this value.
    impute : bool, default True
        When true, missing values are filled before the VIF computation
        (the statsmodels VIF function fails on NaN input).
    impute_strategy : str, default 'median'
        Strategy passed to the imputer.

    Notes
    -----
    The column selection is recomputed on whatever frame is passed to
    ``transform``; ``fit`` only fits the imputer.
    """

    def __init__(self, thresh=5.0, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh

        # Every constructor argument is stored under its own name: this is
        # what BaseEstimator.get_params (and therefore clone and repr) reads.
        self.impute = impute
        self.impute_strategy = impute_strategy

        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        if impute:
            self.imputer = Imputer(strategy=impute_strategy)

    def fit(self, X, y=None):
        """Fit the imputer (if any) on ``X`` and return ``self``."""
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        """Impute ``X`` (if configured) and return it without high-VIF columns.

        ``X`` must be a DataFrame; the returned frame keeps its row index.
        """
        print('ReduceVIF transform')
        if hasattr(self, 'imputer'):
            # Keep the caller's index so the result stays aligned with the
            # input rows; the imputer itself returns a bare array.
            X = pd.DataFrame(self.imputer.transform(X),
                             columns=X.columns.tolist(),
                             index=X.index)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=5.0):
        """Drop the highest-VIF column repeatedly until none exceeds ``thresh``.

        Parameters
        ----------
        X : DataFrame without missing values.
        thresh : float, VIF value above which a column is dropped.

        Returns
        -------
        DataFrame containing the surviving columns of ``X``.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        # A VIF needs at least one other column to regress against, so stop
        # once fewer than two columns remain (this also covers empty input).
        while X.shape[1] > 1:
            # The design matrix is the same for every column in this pass.
            values = X.values
            vif = []
            # Columns are addressed by position, which also works when
            # column labels are not unique.
            for position in range(values.shape[1]):
                current_vif = variance_inflation_factor(values, position)
                vif.append(current_vif)
                if np.isinf(current_vif):
                    # An infinite VIF is already the maximum; no need to
                    # compute the remaining columns in this pass.
                    break
            max_vif = max(vif)
            if not max_vif > thresh:
                # All remaining columns have an acceptable VIF.
                break
            maxloc = vif.index(max_vif)
            print('Dropping %s with vif= %f' % (X.columns[maxloc],max_vif))
            X = X.drop([X.columns.tolist()[maxloc]], axis=1)
        return X
    
#transformer = ReduceVIF()
#X = transformer.fit_transform(df_train[df_train.columns[-10:]], y_train)
#X.head()

In [None]:
# Booster settings shared by the cross-validation run and the final fit.
xgb_params = dict(
    eta=0.05,
    max_depth=5,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

# Wrap the arrays in DMatrix objects, labelling features with the column names.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=df_columns)
dtest = xgb.DMatrix(X_test, feature_names=df_columns)

In [None]:
# Cross-validate with early stopping, reporting progress every 50 rounds.
cv_output = xgb.cv(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=200,
    early_stopping_rounds=20,
    verbose_eval=50,
    show_stdv=False,
)

# Plot train vs. held-out RMSE per boosting round.
rmse_columns = ['train-rmse-mean', 'test-rmse-mean']
cv_output[rmse_columns].plot()
plt.show()

In [None]:
# One row of cv_output per boosting round that was kept; reuse that count below.
num_boost_rounds = cv_output.shape[0]

In [None]:
# Final fit on the full training matrix: same settings as the CV run, but with
# 'silent' switched to 0.
train_params = dict(xgb_params)
train_params['silent'] = 0
model = xgb.train(train_params, dtrain, num_boost_round=num_boost_rounds)

In [None]:
# Score the test matrix and write the submission file (id, price_doc).
y_pred = model.predict(dtest)

df_sub = pd.DataFrame({'id': id_test}).assign(price_doc=y_pred)

df_sub.to_csv('sub.csv', index=False)

In [None]:
# Carve a validation set out of the training data with a fixed seed.
# X_train / y_train are rebound to the reduced training part, so running this
# cell again splits the already-reduced arrays.
holdout_parts = train_test_split(X_train, y_train, random_state=1848)
X_train, X_validation, y_train, y_validation = holdout_parts

In [None]:
# sklearn_boost = GradientBoostingRegressor(random_state=1849)
# sklearn_boost.fit(X_train, y_train)
# print('Training Error: {:.3f}'.format(1 - sklearn_boost.score(X_train, y_train)))
# print('Validation Error: {:.3f}'.format(1 - sklearn_boost.score(X_validation, y_validation)))
# %timeit sklearn_boost.fit(X_train, y_train.values.ravel())

In [None]:
# random_forest = RandomForestRegressor(random_state=1852)
# random_forest.fit(X_train, y_train)
# print('Training Error: {:.3f}'.format(1 - random_forest.score(X_train, y_train)))
# print('Validation Error: {:.3f}'.format(1 - random_forest.score(X_validation, y_validation)))
# %timeit random_forest.fit(X_train, y_train.values.ravel())