In [7]:
import collections

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

%run ../src/load_deskew.py

Index(['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'FirstFlrSF',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=383)


In [8]:
def calculate_r_2_for_feature(data, feature, test_size=0.25, random_state=None):
    """Score how well ``feature`` can be predicted from all other columns.

    ``feature`` is held out as the regression target, a default
    ``KNeighborsRegressor`` is fitted on a random training split of the
    remaining columns, and the regressor's ``score`` on the held-out test
    split is returned.

    Parameters
    ----------
    data : DataFrame
        Table that contains ``feature`` as one of its columns.
    feature : column label
        Column to treat as the target; it is removed from the predictors.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.25.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` leaves the
        split unseeded; pass an int to make the score reproducible.

    Returns
    -------
    float
        Value returned by ``KNeighborsRegressor.score`` on the test split.
    """
    predictors = data.drop(feature, axis=1)
    target = data[feature]

    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    regressor = KNeighborsRegressor()
    regressor.fit(X_train, y_train)

    return regressor.score(X_test, y_test)

def mean_r2_for_feature(data, feature, n_repeats=10):
    """Average the score of ``calculate_r_2_for_feature`` over repeated runs.

    Each run calls ``calculate_r_2_for_feature(data, feature)`` afresh, so the
    mean smooths out the variation between individual runs.

    Parameters
    ----------
    data : DataFrame
        Table that contains ``feature`` as one of its columns.
    feature : column label
        Column whose predictability from the other columns is scored.
    n_repeats : int, optional
        Number of runs to average. Defaults to 10.

    Returns
    -------
    float
        Arithmetic mean of the ``n_repeats`` scores.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1 (the mean would be undefined).
    """
    if n_repeats < 1:
        raise ValueError(
            "n_repeats must be at least 1, got {}".format(n_repeats)
        )

    scores = [
        calculate_r_2_for_feature(data, feature) for _ in range(n_repeats)
    ]
    return np.mean(scores)

In [9]:
whos DataFrame

Variable             Type         Data/Info
-------------------------------------------
housing_df           DataFrame         MSSubClass MSZoning <...>n[1451 rows x 80 columns]
housing_gelman_df    DataFrame          LotFrontage   LotAr<...>[1451 rows x 383 columns]
housing_log_df       DataFrame          LotFrontage    LotA<...>[1451 rows x 383 columns]
housing_one_hot_df   DataFrame          LotFrontage  LotAre<...>[1451 rows x 383 columns]
listing_df           DataFrame            Id Street Alley  <...>n[1460 rows x 62 columns]
sale_df              DataFrame            Id  MoSold  YrSol<...>\n[1460 rows x 6 columns]
zoning_df            DataFrame            Id  MSSubClass MS<...>n[1460 rows x 15 columns]


In [20]:
# Predictors used for every fit: housing_gelman_df without the target column.
housing_final_df = housing_gelman_df.drop('SalePrice', axis=1)

# Features to score: the non-categorical columns of housing_df (target excluded).
numeric_cols = (
    housing_df
    .drop('SalePrice', axis=1)
    .select_dtypes(exclude=['category'])
    .columns
)

print("Calculate R2 Score for {} features".format(len(numeric_cols)))

# Map each feature name to its mean R2 when predicted from the other columns.
# Each score is printed as soon as it is available so progress is visible.
r2_results = {}
for col in numeric_cols:
    r2 = mean_r2_for_feature(housing_final_df, col)
    print("{:24} {}".format(col, r2))
    r2_results[col] = r2

Calculate R2 Score for 23 features
LotFrontage              0.4902102266981682
LotArea                  0.5761973829020407
YearBuilt                0.7734672266705804
YearRemodAdd             0.4757391392873644
MasVnrArea               0.5271057387751288
BsmtFinSF1               0.3985476290524557
BsmtFinSF2               0.34418415758647253
BsmtUnfSF                0.3491859831335409
TotalBsmtSF              0.9316670058990933
FirstFlrSF               0.59631510716913
SecondFlrSF              0.6604871906069445
LowQualFinSF             -0.0372734505756959
GrLivArea                0.6051714217817106
GarageYrBlt              0.6570027155680227
GarageArea               0.7761418633183859
WoodDeckSF               0.028579978838880117
OpenPorchSF              0.22376461196591863
EnclosedPorch            0.028541079896101208
ThreeSsnPorch            -0.550267233098837
ScreenPorch              -0.1633613889705809
PoolArea                 0.26849220217671504
MiscVal                  0.2591835299963877
YrSold                   -0.14599955286409808

In [22]:
# (feature, mean R2) pairs ordered from the lowest score to the highest.
sorted_r2_results = sorted(r2_results.items(), key=lambda item: item[1])

An R2 score close to 1 means a feature can be predicted almost entirely from the other features, so it is largely redundant. Features with the smallest R2 scores carry the most independent information, so let's treat those as the useful ones.

In [24]:
# Show the ten features with the lowest mean R2 (rank, name, score).
# Slicing instead of indexing avoids an IndexError when fewer than ten
# features have been scored.
for i, (feature_name, score) in enumerate(sorted_r2_results[:10]):
    print("{}\t{}\t\t{}".format(i, feature_name, score))

0	ThreeSsnPorch		-0.550267233098837
1	ScreenPorch		-0.1633613889705809
2	YrSold		-0.14599955286409808
3	LowQualFinSF		-0.0372734505756959
4	EnclosedPorch		0.028541079896101208
5	WoodDeckSF		0.028579978838880117
6	OpenPorchSF		0.22376461196591863
7	MiscVal		0.2591835299963877
8	PoolArea		0.26849220217671504
9	BsmtFinSF2		0.34418415758647253


Instead, let's rank the features by the absolute value of their correlation with our target, "SalePrice".

In [42]:
# Pairwise correlation matrix over all columns of housing_gelman_df.
corr = housing_gelman_df.corr()

# Sign does not matter for ranking, so store |corr with SalePrice| as its own column.
corr['SalePrice-Abs'] = corr['SalePrice'].abs()

# Eleven largest absolute correlations; this includes the target's own row.
corr.sort_values(by='SalePrice-Abs', ascending=False)['SalePrice-Abs'].iloc[:11]

SalePrice           1.000000
GrLivArea           0.730620
FirstFlrSF          0.609835
ExterQual_TA        0.595146
YearBuilt           0.584085
FullBath_1          0.573606
YearRemodAdd        0.565244
KitchenQual_TA      0.537961
Foundation_PConc    0.529806
GarageCars_3        0.526388
ExterQual_Gd        0.511701
Name: SalePrice-Abs, dtype: float64