In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split

from sklearn.model_selection import cross_val_score

import xgboost as xgb
import sklearn.preprocessing as pre



In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
#df_y = np.log1p(df_train.price_doc) 
#df_y = df_train.price_doc
y = df_train.price_doc
train_index = df_train.shape[0]
train = df_train.append(df_test, ignore_index = True)
#df_timestamp = pd.to_datetime(train['timestamp']).apply(lambda x: x.date())
train = train.drop(['timestamp', 'price_doc', 'id'], axis=1)



### categorical variables

In [3]:
cat_variables = train.select_dtypes(include = ["object"]).columns

In [4]:
not_binary = ['ecology', 'product_type', 'sub_area']
for col in cat_variables:
    if col not in not_binary:
        train[col] = pd.Series(np.where(train[col].values == 'yes', 1, 0))

In [5]:
train['sub_area'] = pd.Series(np.where(train['sub_area'].value_counts() > 1000, 'areaA', 'areaB'))

In [6]:
categorical_features = train.select_dtypes(include = ["object"]).columns
print("Categorical features : " + str(len(categorical_features)))

train_cat = train[categorical_features]
train_cat.head()

Categorical features : 3


Unnamed: 0,ecology,product_type,sub_area
0,good,Investment,areaA
1,excellent,Investment,areaA
2,poor,Investment,areaA
3,good,Investment,areaA
4,excellent,Investment,areaB


In [7]:
# Create dummy features for categorical values via one-hot encoding
print("NAs for categorical fea0tures in train : " + str(train_cat.isnull().values.sum()))
train_cat = pd.get_dummies(train_cat)
print("Remaining NAs for categorical features in train : " + str(train_cat.isnull().values.sum()))

NAs for categorical fea0tures in train : 38020
Remaining NAs for categorical features in train : 0


In [8]:
train_cat.shape

(38133, 9)

## Numerical variables

In [9]:
numerical_features = train.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(numerical_features)))
train_num = train[numerical_features]

Numerical features : 286


In [10]:
# floor and max floor
def convert_floor(df):
    if df.max_floor == None:
        return df.floor
    if df.max_floor < df.floor:
        return df.floor
    return df.max_floor

train_num['floor'].fillna(train_num['floor'].median(), inplace=True)
train_num['max_floor'] = train_num.apply(convert_floor, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
# full_sq and life_sq
def convert_life_sq(df):
    if df.full_sq == None:
        return 0
    if df.full_sq < df.life_sq:
        return 0
    return df.life_sq

#train['floor'].fillna(train['floor'].median(), inplace=True)
train_num['life_sq'] = train_num.apply(convert_life_sq, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
#material
train_num['material'].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [13]:
#build_year
median = train_num[train_num['build_year'] < 2017.0]['build_year'].median()
train_num.ix[train_num.build_year > 2017.0, 'build_year'] = median
train_num['build_year'].fillna(median, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [14]:
# Handle remaining missing values for numerical features by using median as replacement
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))


NAs for numerical features in train : 283738
Remaining NAs for numerical features in train : 0


In [15]:
#detect outliers
# df = pd.DataFrame(train,columns={'life_sq'})
# df[(np.abs(df.life_sq-df.life_sq.mean())>(5*df.life_sq.std()))]
# df[np.abs(df.life_sq-df.life_sq.mean())<=(5*df.life_sq.std())]
# sr = pd.Series(df['life_sq'])
# sr[((sr-sr.mean()).abs()>30*sr.std())]


In [16]:
train_num.drop(train_num.index[[3527]], inplace=True)
train_num.drop(train_num.index[[13546]], inplace=True)

In [17]:
full = pd.concat([train_num, train_cat], axis = 1)
X_kaggle = full[train_index:]


In [18]:
X_train, X_test, y_train, y_test = train_test_split(full[:train_index], y, test_size = 0.3, random_state = 0)
print("X_train : " + str(X_train.shape))
print("X_test : " + str(X_test.shape))
print("y_train : " + str(y_train.shape))
print("y_test : " + str(y_test.shape))

print(X_kaggle.shape)
print(y.shape)


X_train : (21329, 295)
X_test : (9142, 295)
y_train : (21329,)
y_test : (9142,)
(7662, 295)
(30471,)


In [19]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [20]:
# regressor = xgb.XGBRegressor(colsample_bytree=0.4,
#                 gamma=0.045,
#                 learning_rate=0.07,
#                 max_depth=20,
#                 min_child_weight=1.5,
#                 n_estimators=150,
#                 reg_alpha=0.65,
#                 reg_lambda=0.45,
#                 subsample=0.95)

# regressor.fit(X_train, y_train)
# rmse_cv(regressor).mean()

xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_kaggle)
cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
    verbose_eval=50, show_stdv=False)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()


[0]	train-rmse:8.1662e+06	test-rmse:8.17304e+06
[50]	train-rmse:2.42454e+06	test-rmse:2.88149e+06
[100]	train-rmse:2.0973e+06	test-rmse:2.69624e+06
[150]	train-rmse:1.97072e+06	test-rmse:2.65131e+06
[200]	train-rmse:1.87853e+06	test-rmse:2.62566e+06
[250]	train-rmse:1.80265e+06	test-rmse:2.61627e+06
[300]	train-rmse:1.73356e+06	test-rmse:2.60823e+06
[350]	train-rmse:1.67407e+06	test-rmse:2.60476e+06


<matplotlib.axes._subplots.AxesSubplot at 0x104563470>

In [21]:
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round= num_boost_rounds)

In [22]:
# y_pred = regressor.predict(X_kaggle)
# y_pred

y_pred = model.predict(dtest)
y_pred

array([ 5321260. ,  8537232. ,  5325671. , ...,  5453746.5,  5419136.5,
        8099579.5], dtype=float32)

In [24]:
test2 = pd.read_csv("test.csv")
# submission = pd.DataFrame({
#     "id": test2["id"],
#     "price_doc": np.round( (np.e ** (np.e **y_pred - 1) - 1), 2)
# })

submission = pd.DataFrame({
    "id": test2["id"],
    "price_doc": np.round( y_pred, 2)
})

submission.to_csv("sn_submission.csv", index=False)