In [329]:
%matplotlib inline
import pandas as pd
import numpy as np
import random

In [330]:
def rmsle(y, y_):
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_]))
    calc = (log1 - log2) ** 2
    
    return np.sqrt(np.mean(calc))

def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [331]:
# Determine Column Lists based on Feature Data Types
all_features = 'MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition'.split(',')

numeric_features = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','LowQualFinSF',
                    'GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr',
                    'KitchenAbvGr','TotRmsAbvGrd','TotalBsmtSF','Fireplaces', 'GarageCars', 'GarageArea',
                    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']

categorical_features = [f for f in all_features if not(f in numeric_features)]

(len(all_features), len(categorical_features), len(numeric_features))

(79, 53, 26)

In [332]:
# Load Data
df_train = pd.read_csv('data/train.csv')
df_train = df_train.set_index('Id')

df_test = pd.read_csv('data/test.csv')
df_test = df_test.set_index('Id')

# Merge Data
df_all = pd.concat([df_train, df_test])
print('All data({0[0]},{0[1]})'.format(df_all.shape))
df_all.head()

All data(2919,80)


Unnamed: 0_level_0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,856,854,0,,3,1Fam,TA,No,706.0,0.0,...,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
2,1262,0,0,,3,1Fam,TA,Gd,978.0,0.0,...,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
3,920,866,0,,3,1Fam,TA,Mn,486.0,0.0,...,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
4,961,756,0,,3,1Fam,Gd,No,216.0,0.0,...,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
5,1145,1053,0,,4,1Fam,TA,Av,655.0,0.0,...,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


In [333]:
# Split Columns based on data type
df_numeric = df_all[numeric_features]
numeric_column_names = df_numeric.columns
df_categorical = df_all[categorical_features]

In [334]:
# Show Continous Data
print('Numeric data({0[0]},{0[1]})'.format(df_numeric.shape))
df_numeric.head()

Numeric data(2919,26)


Unnamed: 0_level_0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,65.0,8450,196.0,706.0,0.0,150.0,0,1710,1.0,0.0,...,0,2.0,548.0,0,61,0,0,0,0,0
2,80.0,9600,0.0,978.0,0.0,284.0,0,1262,0.0,1.0,...,1,2.0,460.0,298,0,0,0,0,0,0
3,68.0,11250,162.0,486.0,0.0,434.0,0,1786,1.0,0.0,...,1,2.0,608.0,0,42,0,0,0,0,0
4,60.0,9550,0.0,216.0,0.0,540.0,0,1717,1.0,0.0,...,1,3.0,642.0,0,35,272,0,0,0,0
5,84.0,14260,350.0,655.0,0.0,490.0,0,2198,1.0,0.0,...,1,3.0,836.0,192,84,0,0,0,0,0


In [335]:
# Show Category Data
print('Categorical data({0[0]},{0[1]})'.format(df_categorical.shape))
df_categorical.head()

Categorical data(2919,53)


Unnamed: 0_level_0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,TA,TA,Y,,,,2,2008,WD,Normal
2,20,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,TA,TA,Y,,,,5,2007,WD,Normal
3,60,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,TA,TA,Y,,,,9,2008,WD,Normal
4,70,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,TA,TA,Y,,,,2,2006,WD,Abnorml
5,60,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,TA,TA,Y,,,,12,2008,WD,Normal


In [336]:
# Impute Missing Numeric Values
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
numeric_data = imp.fit_transform(df_numeric)

# Normalize Numeric Features
from sklearn.preprocessing import Normalizer
norm = Normalizer(copy=False)
numeric_data = norm.fit_transform(numeric_data)

df_numeric = pd.DataFrame(columns=numeric_column_names, data=numeric_data, index=df_numeric.index)

print('({0[0]},{0[1]})'.format(df_numeric.shape))
df_numeric.head()

(2919,26)


Unnamed: 0_level_0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.00746,0.969747,0.022494,0.081023,0.0,0.017214,0.0,0.196245,0.000115,0.0,...,0.0,0.00023,0.06289,0.0,0.007001,0.0,0.0,0.0,0.0,0.0
2,0.008136,0.976297,0.0,0.09946,0.0,0.028882,0.0,0.128342,0.0,0.000102,...,0.000102,0.000203,0.046781,0.030306,0.0,0.0,0.0,0.0,0.0,0.0
3,0.005932,0.981324,0.014131,0.042393,0.0,0.037857,0.0,0.155791,8.7e-05,0.0,...,8.7e-05,0.000174,0.053035,0.0,0.003664,0.0,0.0,0.0,0.0,0.0
4,0.006138,0.976978,0.0,0.022097,0.0,0.055243,0.0,0.175651,0.000102,0.0,...,0.000102,0.000307,0.065677,0.0,0.003581,0.027826,0.0,0.0,0.0,0.0
5,0.005782,0.981626,0.024093,0.045089,0.0,0.03373,0.0,0.151305,6.9e-05,0.0,...,6.9e-05,0.000207,0.057548,0.013217,0.005782,0.0,0.0,0.0,0.0,0.0


In [337]:
# Vectorize Categorical Features
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()

for column in df_categorical:
    dt = df_categorical[column].dtype
    colum_index = df_categorical.columns.get_loc(df_categorical[column].name)
    
    # Clean-up NaN's in the Categorical data
    if df_categorical[column].dtype == int or df_categorical[column].dtype == float:
        df_categorical[column].fillna(0, inplace=True)
    else:
        df_categorical[column].fillna("", inplace=True)

    df_categorical[column] = label_enc.fit_transform(df_categorical[column])

print('({0[0]},{0[1]})'.format(df_categorical.shape))
df_categorical.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(2919,53)


Unnamed: 0_level_0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,4,1,0,3,3,1,4,0,5,...,5,5,2,0,0,0,1,2,9,4
2,0,4,1,0,3,3,1,2,0,24,...,5,5,2,0,0,0,4,1,9,4
3,5,4,1,0,0,3,1,4,0,5,...,5,5,2,0,0,0,8,2,9,4
4,6,4,1,0,0,3,1,0,0,6,...,5,5,2,0,0,0,1,0,9,0
5,5,4,1,0,0,3,1,2,0,15,...,5,5,2,0,0,0,11,2,9,4


In [338]:
# Join Continuous and Categorical data into one set
df_processed = pd.concat([df_categorical, df_numeric], axis=1)

print('({0[0]},{0[1]})'.format(df_processed.shape))
df_processed.head()

(2919,79)


Unnamed: 0_level_0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,Fireplaces,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,4,1,0,3,3,1,4,0,5,...,0.0,0.00023,0.06289,0.0,0.007001,0.0,0.0,0.0,0.0,0.0
2,0,4,1,0,3,3,1,2,0,24,...,0.000102,0.000203,0.046781,0.030306,0.0,0.0,0.0,0.0,0.0,0.0
3,5,4,1,0,0,3,1,4,0,5,...,8.7e-05,0.000174,0.053035,0.0,0.003664,0.0,0.0,0.0,0.0,0.0
4,6,4,1,0,0,3,1,0,0,6,...,0.000102,0.000307,0.065677,0.0,0.003581,0.027826,0.0,0.0,0.0,0.0
5,5,4,1,0,0,3,1,2,0,15,...,6.9e-05,0.000207,0.057548,0.013217,0.005782,0.0,0.0,0.0,0.0,0.0


In [344]:
# Split back into Train and Test Datasets
df_processed_train = df_processed[:1460]
df_processed_test = df_processed[1460:]

# Add SalePrice to Train Dataset
#df_processed_train.join(df_train['SalePrice'])

print('({0[0]},{0[1]})'.format(df_processed_train.shape))
print('({0[0]},{0[1]})'.format(df_processed_test.shape))

(1460,80)
(1459,79)


In [345]:
# Divide the Training Data into Train and Validation sets
split_point = int(len(df_processed_train) * 0.8)
data = df_processed_train.reindex(np.random.permutation(df_processed_train.index))

train_data = data[:split_point]
val_data = data[split_point:]

print('Training data({0[0]},{0[1]})'.format(train_data.shape))
print(train_data.head())
print()
print('Validation data({0[0]},{0[1]})'.format(val_data.shape))
print(val_data.head())

Training data(1168,80)
      MSSubClass  MSZoning  Street  Alley  LotShape  LandContour  Utilities  \
Id                                                                            
915           13         2       1      2         3            3          1   
1066           5         4       1      0         3            3          1   
1153           0         4       1      0         0            3          1   
170            0         4       1      0         0            3          1   
1363           4         4       1      0         0            3          1   

      LotConfig  LandSlope  Neighborhood    ...      GarageCars  GarageArea  \
Id                                          ...                               
915           4          0            21    ...        0.000592    0.156376   
1066          4          1             4    ...        0.000140    0.037258   
1153          4          0             6    ...        0.000140    0.032873   
170           0          0  

In [353]:
# Select the K-Best Features

In [360]:
# Extract X(with 1 feature) and Y from the train and test sets
X_train = train_data
X_train = X_train.drop('SalePrice', axis = 1)
y_train = train_data['SalePrice']

print('Training X({0[0]},{0[1]})'.format(X_train.shape))
print('Training Y({0[0]})'.format(y_train.shape))

X_val = val_data
X_val = X_val.drop('SalePrice', axis = 1)
y_val = val_data['SalePrice']

print('Validation X({0[0]},{0[1]})'.format(X_val.shape))
print('Validation Y({0[0]})'.format(y_val.shape))

Training X(1168,79)
Training Y(1168)
Validation X(292,79)
Validation Y(292)


In [361]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [362]:
# Generate Metrics on Validation Set
from sklearn.metrics import mean_squared_error

print("Validation Metrics")
r2 = lin_reg.score(X_val, y_val)
print('R-Squared: ', r2)

y_pred = lin_reg.predict(X_val)
rmsle_val = rmsle(y_val, y_pred)
rmse_val = rmse(y_val, y_pred)
print('Root Mean Squared Logarithmic Error: ', rmsle_val)
print('Root Mean Squared Error: ', rmse_val)

Validation Metrics
R-Squared:  0.752147235226
Root Mean Squared Logarithmic Error:  0.194923045893
Root Mean Squared Error:  42002.8217127


In [368]:
# Load Test Dataset
X_test = df_processed_test
y_pred_test = lin_reg.predict(X_test)

In [369]:
# Generate Test Set Results
pred = pd.DataFrame()
pred['Id'] = X_test.index
pred['SalePrice'] = y_pred_test.tolist()

print('Results data({0[0]},{0[1]})'.format(pred.shape))
print(pred.head())

Results data(1459,2)
     Id      SalePrice
0  1461  118906.967643
1  1462  164725.171238
2  1463  164448.286756
3  1464  173047.090533
4  1465  200453.320843


In [370]:
# Save Output
pred.to_csv('submission.csv', index=False)