In [None]:
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of two sequences.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Actual values (list, tuple, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, same length as ``y_true``. Values are compared
        position by position.

    Returns
    -------
    float
        The RMSLE; ``0.0`` when both inputs are identical.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred)
    # Convert up front: plain Python lists do not support ``list + 1``, and
    # two pandas Series would otherwise be subtracted by index label rather
    # than by position.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

%matplotlib inline
# pandas is used to read and write the tabular (CSV) data

# Input CSV locations for the training and test sets.
# NOTE(review): these are hard-coded absolute Windows paths; adjust them to
# wherever the data lives on the machine running the notebook.
train_file = r"c:\house\train.csv"
test_file = r"c:\house\test.csv"

In [None]:
# Load the training data; the first CSV row supplies the column names.
hsData = pd.read_csv(train_file, header = 0)

# Keep rows with GrLivArea below 4000 (presumably outlier removal - TODO confirm).
# .copy() gives hsData its own data, so the column assignments below do not
# write into a slice of the frame returned by read_csv (SettingWithCopyWarning).
hsData = hsData[hsData["GrLivArea"] < 4000].copy()

# Replace the two per-floor areas by their sum, then keep only rows whose
# combined floor area lies strictly between 400 and 3300.
hsData['Total_FlrSF'] = hsData['1stFlrSF'] + hsData['2ndFlrSF']
hsData = hsData.drop(['1stFlrSF','2ndFlrSF'], axis=1)
hsData = hsData[(hsData["Total_FlrSF"] > 400) & (hsData["Total_FlrSF"] < 3300)].copy()

# Drop negatively correlated features (presumably correlation with SalePrice - TODO confirm).
hsData = hsData.drop(['KitchenAbvGr','EnclosedPorch'], axis=1)

# Missing numeric measurements become 0.
for col_name in ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']:
    hsData[col_name] = hsData[col_name].fillna(0)

# Missing quality grades become the explicit 'NA' level, which is encoded as 0 below.
for col_name in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu',
                 'GarageQual', 'GarageCond', 'PoolQC']:
    hsData[col_name] = hsData[col_name].fillna('NA')
#hsData['Fence'] = hsData['Fence'].fillna('NA')

# Shared grading scales: Po/Fa/TA/Gd/Ex -> 1..5, with an optional 'NA' -> 0.
quality_levels = ['Po','Fa','TA','Gd','Ex']
quality_codes = [1,2,3,4,5]
quality_levels_na = ['NA'] + quality_levels
quality_codes_na = [0] + quality_codes

# (column, original values, integer codes). Values that are not listed are
# left unchanged by replace().
ordinal_encodings = [
    ('MSSubClass', [180,30,45,190,50,90,85,40,160,70,20,75,80,120,60],
                   [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]),
    ('MSZoning', ['C (all)','RM','RH','RL','FV'], [1,2,3,4,5]),
    ('Street', ['Grvl','Pave'], [1,2]),
    ('Neighborhood', ['MeadowV','IDOTRR','BrDale','OldTown','Edwards','BrkSide','Sawyer','Blueste','SWISU','NAmes','NPkVill','Mitchel','SawyerW','Gilbert','NWAmes','Blmngtn','CollgCr','ClearCr','Crawfor','Veenker','Somerst','Timber','StoneBr','NoRidge','NridgHt'],
                     [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]),
    ('ExterQual', quality_levels, quality_codes),
    ('ExterCond', quality_levels, quality_codes),
    ('BsmtQual', quality_levels_na, quality_codes_na),
    ('BsmtCond', quality_levels_na, quality_codes_na),
    ('BsmtExposure', ['NA','No','Mn','Av','Gd'], [0,1,2,3,4]),
    ('HeatingQC', quality_levels, quality_codes),
    ('KitchenQual', quality_levels, quality_codes),
    ('FireplaceQu', quality_levels_na, quality_codes_na),
    ('GarageQual', quality_levels_na, quality_codes_na),
    ('GarageCond', quality_levels_na, quality_codes_na),
    ('PavedDrive', ['N','P','Y'], [0,1,2]),
    ('PoolQC', ['NA','Fa','TA','Gd','Ex'], [0,2,3,4,5]),
    #('Fence', ['NA','MnWw','GdWo','MnPrv','GdPrv'], [0,1,2,3,4]),
]

for col_name, levels, codes in ordinal_encodings:
    # Assign the result back instead of calling replace(..., inplace=True) on
    # hsData[col_name]: the in-place form acts on a temporary Series and is
    # not written back to the frame under pandas Copy-on-Write.
    # infer_objects() keeps a fully encoded column numeric on pandas releases
    # where replace() no longer downcasts object results (needs pandas >= 0.21).
    hsData[col_name] = hsData[col_name].replace(levels, codes).infer_objects()

# Binary flag: 1 when CentralAir is 'Y', 0 for anything else (including missing).
hsData['CentralAir'] = (hsData['CentralAir'] == 'Y').astype(int)

In [None]:
# Numeric columns only; columns that are still string-valued are left out.
num_train_features = hsData.select_dtypes(include=[np.number])
#num_train_features = preprocessing.scale(num_train_features)

# Pairwise correlations of the numeric columns; show up to 70 of them ranked
# by their correlation with SalePrice, highest first.
corr = num_train_features.corr()
print (corr.loc[:, 'SalePrice'].sort_values(ascending=False).iloc[:70], '\n')

# Model inputs: everything numeric except the Id column and the SalePrice column.
num_train_features = num_train_features.drop(labels=['Id', 'SalePrice'], axis='columns')
#print num_train_features.iloc[:5]

# Cell output: number of missing values per column, limited to the columns
# that still contain at least one.
null_columns = num_train_features.columns[num_train_features.isnull().any(axis=0)]
num_train_features.loc[:, null_columns].isnull().sum()

In [None]:
# Load the test data; the first CSV row supplies the column names.
hsTest = pd.read_csv(test_file, header = 0)

# Replace the two per-floor areas by their sum. No rows are filtered here,
# so every test row is kept.
hsTest['Total_FlrSF'] = hsTest['1stFlrSF'] + hsTest['2ndFlrSF']
hsTest = hsTest.drop(['1stFlrSF','2ndFlrSF'], axis=1)

# Drop the same two columns that the training cell drops.
hsTest = hsTest.drop(['KitchenAbvGr','EnclosedPorch'], axis=1)

# Categorical gaps filled with a concrete level before encoding.
hsTest['MSZoning'] = hsTest['MSZoning'].fillna('RM')
hsTest['KitchenQual'] = hsTest['KitchenQual'].fillna('Po')

# Missing numeric measurements become 0.
for col_name in ['LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'GarageCars',
                 'GarageArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']:
    hsTest[col_name] = hsTest[col_name].fillna(0)

# Missing quality grades become the explicit 'NA' level, which is encoded as 0 below.
for col_name in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu',
                 'GarageQual', 'GarageCond', 'PoolQC']:
    hsTest[col_name] = hsTest[col_name].fillna('NA')
#hsTest['Fence'] = hsTest['Fence'].fillna('NA')

# Shared grading scales: Po/Fa/TA/Gd/Ex -> 1..5, with an optional 'NA' -> 0.
quality_levels = ['Po','Fa','TA','Gd','Ex']
quality_codes = [1,2,3,4,5]
quality_levels_na = ['NA'] + quality_levels
quality_codes_na = [0] + quality_codes

# (column, original values, integer codes).
# NOTE(review): replace() leaves values that are not listed unchanged, so a
# category that only occurs in the test file keeps its raw value - verify
# that every test category is covered.
ordinal_encodings = [
    ('MSSubClass', [180,30,45,190,50,90,85,40,160,70,20,75,80,120,60],
                   [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]),
    ('MSZoning', ['C (all)','RM','RH','RL','FV'], [1,2,3,4,5]),
    ('Street', ['Grvl','Pave'], [1,2]),
    ('Neighborhood', ['MeadowV','IDOTRR','BrDale','OldTown','Edwards','BrkSide','Sawyer','Blueste','SWISU','NAmes','NPkVill','Mitchel','SawyerW','Gilbert','NWAmes','Blmngtn','CollgCr','ClearCr','Crawfor','Veenker','Somerst','Timber','StoneBr','NoRidge','NridgHt'],
                     [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]),
    ('ExterQual', quality_levels, quality_codes),
    ('ExterCond', quality_levels, quality_codes),
    ('BsmtQual', quality_levels_na, quality_codes_na),
    ('BsmtCond', quality_levels_na, quality_codes_na),
    ('BsmtExposure', ['NA','No','Mn','Av','Gd'], [0,1,2,3,4]),
    ('HeatingQC', quality_levels, quality_codes),
    ('KitchenQual', quality_levels, quality_codes),
    ('FireplaceQu', quality_levels_na, quality_codes_na),
    ('GarageQual', quality_levels_na, quality_codes_na),
    ('GarageCond', quality_levels_na, quality_codes_na),
    ('PavedDrive', ['N','P','Y'], [0,1,2]),
    ('PoolQC', ['NA','Fa','TA','Gd','Ex'], [0,2,3,4,5]),
    #('Fence', ['NA','MnWw','GdWo','MnPrv','GdPrv'], [0,1,2,3,4]),
]

for col_name, levels, codes in ordinal_encodings:
    # Assign the result back instead of calling replace(..., inplace=True) on
    # hsTest[col_name]: the in-place form acts on a temporary Series and is
    # not written back to the frame under pandas Copy-on-Write.
    # infer_objects() keeps a fully encoded column numeric on pandas releases
    # where replace() no longer downcasts object results (needs pandas >= 0.21).
    hsTest[col_name] = hsTest[col_name].replace(levels, codes).infer_objects()

# Binary flag: 1 when CentralAir is 'Y', 0 for anything else (including missing).
hsTest['CentralAir'] = (hsTest['CentralAir'] == 'Y').astype(int)
#hsTest.info()

In [None]:
# Numeric columns of the test frame, without the Id column.
num_test_features = hsTest.select_dtypes(include=[np.number]).drop(labels=['Id'], axis='columns')

# Cell output: number of missing values per column, limited to the columns
# that still contain at least one.
null_columns = num_test_features.columns[num_test_features.isnull().any(axis=0)]
num_test_features.loc[:, null_columns].isnull().sum()

In [None]:
#LassoCV Regressor
from sklearn.linear_model import LassoCV
try:
    # scikit-learn >= 0.18; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
try:
    # sklearn.externals.joblib was removed in scikit-learn 0.23.
    # joblib is not used in this cell; the import is kept in case later
    # cells rely on it.
    import joblib
except ImportError:
    from sklearn.externals import joblib


# Features are the numeric training columns; the target is log(SalePrice).
X = num_train_features
y = np.log(hsData['SalePrice'])

# Hold out part of the training data for validation. A fixed random_state
# makes the split, and so the reported RMSLE and the plot, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lscv = LassoCV(eps=10**-7, n_alphas=75)
lscv.fit(X_train, y_train)

X_test_predict = lscv.predict(X_test)

# The model works on log prices; convert predictions and targets back to
# prices before scoring (rmsle applies its own log(1 + x)).
X_test_predict = np.exp(X_test_predict)
y_test = np.exp(y_test)

print(lscv)
print("RMSLE: %s\n" % rmsle(y_test, X_test_predict))

# Price predictions for the separate test file.
Hs_test_predict = lscv.predict(num_test_features)
Hs_test_predict = np.exp(Hs_test_predict)

# Predicted vs. actual hold-out prices; the dashed red line is y = x.
plt.grid(True)
plt.scatter(X_test_predict, y_test, alpha=.25,color='b') #alpha helps to show overlapping data
plt.plot([50000,500000],[50000,500000],'r--')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('LassoCV Regressor')
plt.show()

# Compare the test-file predictions with the SalePrice column of the
# "almost perfect" reference file.
almost_perfect_file = "c:\\house\\almost_perfect.csv"
almostPref = pd.read_csv(almost_perfect_file, header = 0)
alpSlp = almostPref['SalePrice']

# NOTE(review): the subtracted constant is undocumented; presumably an
# empirical offset between this reference file and the Kaggle score - TODO confirm.
kg = rmsle(alpSlp, Hs_test_predict) - 0.0109768754926667
print("Kaggle Possible: %s\n" % kg)