In [60]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [75]:
# Read the train and test tables, using the 'Id' column as the row index of each.
train, test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [76]:
# constants
# Skewness threshold: numeric features whose sample skewness is greater than
# this value are log1p-transformed in the preprocessing cell further down.
SKEW = 0.2

In [77]:
from scipy.stats import skew
# Stack the predictor columns (MSSubClass through SaleCondition, inclusive) of
# train and test so that feature engineering is applied to both at once.
all_data = pd.concat(
    [frame.loc[:, 'MSSubClass':'SaleCondition'] for frame in (train, test)]
)

# From here on the target is log(1 + price). This overwrites the column in
# place, so re-running this cell would transform the target a second time.
train["SalePrice"] = np.log1p(train["SalePrice"])

In [78]:
# new feature: Valid Area
# Combined first- and second-floor area, plus 70% of the basement area, plus
# the garage area. A NaN in any operand makes the result NaN for that row.
flrArea = all_data['1stFlrSF'].add(all_data['2ndFlrSF'])
validArea = (
    flrArea
    .add(all_data['TotalBsmtSF'].mul(0.7))
    .add(all_data['GarageArea'])
)
all_data['validArea'] = validArea

In [79]:
# Columns stored with object dtype (the categorical / string features).
object_feats = all_data.columns[(all_data.dtypes == object).values]

# Give missing categorical values their own explicit 'NA' level.
all_data[object_feats] = all_data[object_feats].fillna(value='NA')

In [80]:
# new feature: quality / condition score
# Columns holding a quality rating and columns holding a condition rating.
qual = ['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual']
cond = ['ExterCond', 'BsmtCond', 'GarageCond']

# Ordinal encoding of the rating labels; 'NA' (the missing-value filler) scores 0.
grade = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    'NA': 0,
}

# Replace the labels with their numeric grade, column by column. Any label
# that is not a key of `grade` becomes NaN.
all_data[qual + cond] = all_data[qual + cond].apply(pd.Series.map, args=(grade,))

In [81]:
# Every column that is not stored as object dtype is treated as numeric.
numeric_feats = all_data.columns[(all_data.dtypes != "object").values]

# Keep the numeric columns whose sample skewness (NaNs dropped) exceeds SKEW.
skewed_feats = (
    all_data[numeric_feats]
    .apply(lambda col: skew(col.dropna()))
    .pipe(lambda skewness: skewness[skewness > SKEW])
    .index
)

# Apply log(1 + x) to those skewed columns.
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode the remaining object columns, then fill the NaNs that are
# left with the mean of their column.
all_data = pd.get_dummies(all_data).pipe(
    lambda encoded: encoded.fillna(encoded.mean())
)

In [82]:
# all_data was built as train rows followed by test rows, so split it back
# by position: the first len(train) rows are train, the rest are test.
X_train = all_data.iloc[:len(train)]
X_test = all_data.iloc[len(train):]

# Target: the SalePrice column of train (already log1p-transformed above).
y = train['SalePrice']

In [83]:
# `sklearn.cross_validation` was removed from scikit-learn; `cross_val_score`
# now lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

def rmse(model):
    """Return the mean cross-validated RMSE of `model` on (X_train, y).

    Parameters
    ----------
    model : scikit-learn estimator
        Unfitted estimator; `cross_val_score` fits clones of it, so `model`
        itself is not fitted by this call.

    Returns
    -------
    float
        Mean over the CV folds of sqrt(MSE). Because `y` is the log1p of the
        sale price, this is an RMSE in log space.
    """
    # The scorer returns the *negated* MSE per fold (higher is better), so
    # flip the sign before taking the square root.
    neg_mse = cross_val_score(model, X_train, y=y, scoring='neg_mean_squared_error')
    fold_rmse = np.sqrt(-neg_mse)
    return fold_rmse.mean()

def get_result(model, desc):
    """Predict sale prices for the test set and write them to a CSV file.

    Parameters
    ----------
    model : fitted scikit-learn estimator
        Must have been trained on the log1p of the sale price; predictions
        are mapped back to prices with expm1.
    desc : str
        Base name of the output file, written as '../output/<desc>.csv'.
    """
    # The test feature matrix is `X_test`; the previously referenced
    # `test_df` is not defined anywhere in the notebook.
    y_pred = np.expm1(model.predict(X_test))
    output_name = '../output/' + desc + '.csv'
    output(output_name, y_pred, index=X_test.index)


def output(path, y_pred, index=None):
    """Write predictions to `path` as a CSV with a single 'SalePrice' column.

    Parameters
    ----------
    path : str
        Destination CSV file.
    y_pred : array-like
        Predicted sale prices, one per row.
    index : array-like or pandas.Index, optional
        Row labels for the predictions. Defaults to the index of `X_test`.
    """
    if index is None:
        index = X_test.index
    out_df = pd.DataFrame(y_pred, index=index, columns=['SalePrice'])
    out_df.to_csv(path)

In [84]:
# Summarize the training feature matrix: number of rows and columns, dtype
# counts and memory usage.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 283 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(273), int64(10)
memory usage: 3.2 MB


In [85]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit an extra-trees ensemble only to rank features by importance.
# random_state is fixed so the ranking is reproducible across runs.
rf = ExtraTreesRegressor(n_estimators=100, random_state=0)

# `train.SalePrice` was already log1p-transformed when the data was loaded,
# so fit on `y` directly; applying np.log1p again would log the target twice.
rf.fit(X_train, y)

# Feature importances labelled by column name, most important first.
importance = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)

In [92]:
from sklearn.linear_model import Lasso

# Lasso regression with a small L1 penalty. `model` is a second name for the
# same estimator object and is what the prediction cell fits.
model = lasso = Lasso(alpha=0.0004)

# Mean cross-validated RMSE of the lasso.
rmse(model)

0.12352478058288101

In [97]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with 300 estimators, scored with the same
# cross-validated RMSE helper as the lasso for comparison.
gbdt = GradientBoostingRegressor(n_estimators=300)

rmse(gbdt)

0.12513573449081108

In [103]:
# `rmse` only fits clones of the estimator inside cross_val_score, so `lasso`
# itself is still unfitted at this point and has no `coef_` attribute.
# Fit it on the full training set first (fit returns the estimator).
lasso.fit(X_train, y).coef_

In [106]:
# Fit the lasso on the full training set; `fit` returns the estimator itself,
# so `a` is another name for `lasso`.
a = lasso.fit(X_train, y)
# Label each coefficient with its feature name. X_train is a row slice of
# all_data, so all_data.columns lines up with the fitted coefficients.
coef = pd.Series(a.coef_, index=all_data.columns)
# Show the coefficients from most positive to most negative.
coef.sort_values(ascending=False)

validArea                0.291525
OverallCond              0.240400
GrLivArea                0.230246
Neighborhood_StoneBr     0.128182
Neighborhood_Crawfor     0.111007
Neighborhood_NridgHt     0.100705
Neighborhood_NoRidge     0.083459
LotArea                  0.072468
Functional_Typ           0.067375
Exterior1st_BrkFace      0.063181
RoofMatl_WdShngl         0.062597
KitchenQual              0.057000
OverallQual              0.054805
Neighborhood_BrkSide     0.050710
Neighborhood_Somerst     0.048859
Condition1_Norm          0.046994
BsmtExposure_Gd          0.041031
GarageCars               0.037912
BsmtFullBath             0.034102
HalfBath                 0.033396
Fireplaces               0.032968
BldgType_1Fam            0.032265
GarageQual               0.028676
SaleType_New             0.028290
LotConfig_CulDSac        0.027256
FireplaceQu_Ex           0.027055
HeatingQC_Ex             0.026734
FullBath                 0.021734
Heating_GasW             0.021691
BsmtQual      

In [123]:
# Inspect the Ids of the test rows; they are written to the 'Id' column of
# the submission file in the prediction cell.
test.index

Int64Index([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470,
            ...
            2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919],
           dtype='int64', name=u'Id', length=1459)

In [124]:
### prediction
# Train on every training row, predict the test rows, and undo the log1p
# transform of the target so the predictions are back on the price scale.
preds = np.expm1(model.fit(X_train, y).predict(X_test))

# Submission table: one row per test Id with its predicted sale price.
solution = pd.DataFrame({"Id": test.index, "SalePrice": preds})
solution.to_csv("../output/valid_area.csv", index=False)


In [57]:
# NOTE(review): no visible cell creates a 'Cond_Qual_score' column, and this
# cell carries an older execution count (In [57]) than the cells above it.
# It presumably depends on a feature built in a since-removed cell and would
# raise AttributeError on Restart & Run All -- confirm, then restore or drop.
X_train.Cond_Qual_score

Id
1       36
2       36
3       36
4       35
5       37
6       32
7       38
8       35
9       32
10      33
11      31
12      41
13      32
14      36
15      32
16      36
17      34
18      24
19      32
20      32
21      38
22      36
23      37
24      34
25      36
26      37
27      34
28      38
29      32
30      29
        ..
1431    34
1432    34
1433    29
1434    33
1435    31
1436    39
1437    31
1438    40
1439    34
1440    34
1441    36
1442    35
1443    42
1444    29
1445    36
1446    32
1447    33
1448    37
1449    30
1450    30
1451    26
1452    38
1453    32
1454    26
1455    36
1456    33
1457    34
1458    42
1459    33
1460    33
Name: Cond_Qual_score, dtype: int64