In [1]:
import collections

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

%run ../src/load_deskew.py

Index(['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'FirstFlrSF',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=383)


In [2]:
def calculate_r_2_for_feature(data, feature, test_size=0.25, random_state=None):
    """Score how well one column can be predicted from all the other columns.

    ``feature`` is removed from ``data`` and used as the regression target.
    The remaining columns are split into train and test partitions, a
    ``KNeighborsRegressor`` with default settings is fit on the training
    partition, and its score on the held-out partition is returned.

    Parameters
    ----------
    data : DataFrame-like
        Must support ``data.drop(feature, axis=1)`` and ``data[feature]``.
    feature : label
        Column of ``data`` to predict from the other columns.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to 0.25, the value that
        was previously hard-coded.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for a repeatable split.

    Returns
    -------
    float
        Whatever ``KNeighborsRegressor.score`` returns for the test
        partition (the coefficient of determination, R^2).
    """
    predictors = data.drop(feature, axis=1)
    target = data[feature]

    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    regressor = KNeighborsRegressor()
    regressor.fit(X_train, y_train)

    return regressor.score(X_test, y_test)

def mean_r2_for_feature(data, feature, n_runs=10):
    """Average the score from ``calculate_r_2_for_feature`` over repeated runs.

    Each call to ``calculate_r_2_for_feature`` draws a fresh train/test
    split, so a single score is noisy; the mean over several runs is
    returned instead.

    Parameters
    ----------
    data : DataFrame-like
        Passed through unchanged to ``calculate_r_2_for_feature``.
    feature : label
        Column of ``data`` to predict from the other columns.
    n_runs : int, optional
        Number of repetitions to average. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    float
        Arithmetic mean of the ``n_runs`` scores.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (the mean of zero scores would
        otherwise be returned as NaN).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1, got {}".format(n_runs))

    scores = [calculate_r_2_for_feature(data, feature) for _ in range(n_runs)]
    return np.array(scores).mean()

In [3]:
whos DataFrame

Variable             Type         Data/Info
-------------------------------------------
housing_df           DataFrame         MSSubClass MSZoning <...>n[1451 rows x 80 columns]
housing_gelman_df    DataFrame          LotFrontage   LotAr<...>[1451 rows x 383 columns]
housing_log_df       DataFrame          LotFrontage    LotA<...>[1451 rows x 383 columns]
housing_one_hot_df   DataFrame          LotFrontage  LotAre<...>[1451 rows x 383 columns]
listing_df           DataFrame            Id Street Alley  <...>n[1460 rows x 62 columns]
sale_df              DataFrame            Id  MoSold  YrSol<...>\n[1460 rows x 6 columns]
zoning_df            DataFrame            Id  MSSubClass MS<...>n[1460 rows x 15 columns]


In [4]:
# Seed the global NumPy RNG: train_test_split inside
# calculate_r_2_for_feature is called without an explicit random_state and
# therefore draws from it. Without a seed the scores printed below change on
# every Restart & Run All.
R2_SPLIT_SEED = 42
np.random.seed(R2_SPLIT_SEED)

r2_results = {}

# Column labels are taken from the raw frame (non-'category' columns, target
# excluded) but are scored against housing_gelman_df below.
# NOTE(review): assumes every one of these labels also exists as a column of
# housing_gelman_df -- confirm against load_deskew.py.
numeric_cols = housing_df.drop(['SalePrice'], axis=1).select_dtypes(exclude=['category']).columns

# The target is dropped so SalePrice is never used as a predictor.
housing_final_df = housing_gelman_df.drop(['SalePrice'], axis=1)

print("Calculate R2 Score for {} features".format(len(numeric_cols)))
for col in numeric_cols:
    # Mean R2 of predicting `col` from all the other columns.
    r2 = mean_r2_for_feature(housing_final_df, col)
    print("{:24} {}".format(col, r2))
    r2_results[col] = r2

Calculate R2 Score for 23 features
LotFrontage              0.4406799056999966
LotArea                  0.5725679569508305
YearBuilt                0.8178916013248297
YearRemodAdd             0.46353382703978
MasVnrArea               0.5186075570091666
BsmtFinSF1               0.408863374855904
BsmtFinSF2               0.3577390107748268
BsmtUnfSF                0.3485324565021652
TotalBsmtSF              0.9399899832741229
FirstFlrSF               0.5722219087974251
SecondFlrSF              0.6877499093738119
LowQualFinSF             0.09911993891189053
GrLivArea                0.6028002107111703
GarageYrBlt              0.6840393398871051
GarageArea               0.8404683636876626
WoodDeckSF               0.04789202204832863
OpenPorchSF              0.22759463266236893
EnclosedPorch            0.026737338616778593
ThreeSsnPorch            -0.5319608731282224
ScreenPorch              -0.14982363493973105
PoolArea                 0.3389204485823748
MiscVal                  0.195783924

In [5]:
# (feature, mean R2) pairs ordered from the lowest score to the highest.
sorted_r2_results = sorted(r2_results.items(), key=lambda pair: pair[1])

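An R2 score close to 1 means the feature can be predicted well from the remaining features, i.e. it is largely redundant. Conversely, we might assume that the features with the smallest (or even negative) R2 scores cannot be predicted from the others and therefore carry some unique information.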

In [6]:
# Print the (up to) ten features with the lowest mean R2. Slicing instead of
# indexing with range(10) avoids an IndexError when fewer than ten features
# were scored.
for i, (feature_name, mean_r2) in enumerate(sorted_r2_results[:10]):
    print("{}\t{}\t\t{}".format(i, feature_name, mean_r2))

0	ThreeSsnPorch		-0.5319608731282224
1	ScreenPorch		-0.14982363493973105
2	YrSold		-0.14591362216342585
3	EnclosedPorch		0.026737338616778593
4	WoodDeckSF		0.04789202204832863
5	LowQualFinSF		0.09911993891189053
6	MiscVal		0.19578392439569647
7	OpenPorchSF		0.22759463266236893
8	PoolArea		0.3389204485823748
9	BsmtUnfSF		0.3485324565021652


Instead, let's rank the features by the absolute value of their correlation with our target, "SalePrice".

In [7]:
# Pairwise correlations between all columns of the frame.
corr = housing_gelman_df.corr()

# Strength of each column's correlation with the target, ignoring its sign.
corr['SalePrice-Abs'] = corr['SalePrice'].abs()

# Eleven strongest entries; the first is SalePrice itself (correlation 1).
(
    corr
    .sort_values(by='SalePrice-Abs', ascending=False)
    .loc[:, 'SalePrice-Abs']
    .iloc[:11]
)

SalePrice           1.000000
GrLivArea           0.730620
FirstFlrSF          0.609835
ExterQual_TA        0.595146
YearBuilt           0.584085
FullBath_1          0.573606
YearRemodAdd        0.565244
KitchenQual_TA      0.537961
Foundation_PConc    0.529806
GarageCars_3        0.526388
ExterQual_Gd        0.511701
Name: SalePrice-Abs, dtype: float64