In [1]:
import collections

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

%matplotlib inline

%run ../src/load_deskew.py

In [2]:
def calculate_r_2_for_feature(data, feature, test_size=0.25, random_state=None):
    """Score how well `feature` can be predicted from the other columns of `data`.

    The column `feature` is removed from `data` and used as the target; the
    remaining columns are the predictors. A default `KNeighborsRegressor` is
    fitted on a random training split and scored on the held-out split.

    Parameters
    ----------
    data : DataFrame
        Frame containing `feature` plus the predictor columns.
    feature : column label
        Column of `data` to treat as the regression target.
    test_size : float or int, default 0.25
        Forwarded to `train_test_split`.
    random_state : int, RandomState instance or None, default None
        Forwarded to `train_test_split`. Pass an int for a reproducible split;
        the default keeps the original unseeded behaviour.

    Returns
    -------
    The value of `regressor.score` on the held-out split (the R^2 score for
    scikit-learn regressors).
    """
    predictors = data.drop(columns=feature)
    target = data[feature]

    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=test_size, random_state=random_state
    )

    regressor = KNeighborsRegressor()
    regressor.fit(X_train, y_train)

    return regressor.score(X_test, y_test)

def mean_r2_for_feature(data, feature, n_runs=10):
    """Average `calculate_r_2_for_feature(data, feature)` over repeated runs.

    Each call to `calculate_r_2_for_feature` draws a fresh random train/test
    split, so a single score is noisy; the mean over `n_runs` calls is returned.

    Parameters
    ----------
    data : DataFrame
        Passed through unchanged to `calculate_r_2_for_feature`.
    feature : column label
        Passed through unchanged to `calculate_r_2_for_feature`.
    n_runs : int, default 10
        Number of independent scores to average.

    Returns
    -------
    The arithmetic mean of the `n_runs` scores.

    Raises
    ------
    ValueError
        If `n_runs` is smaller than 1 (the mean of no scores is undefined).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    scores = [calculate_r_2_for_feature(data, feature) for _ in range(n_runs)]
    return np.array(scores).mean()

In [3]:
whos DataFrame

Variable             Type         Data/Info
-------------------------------------------
housing_df           DataFrame         MSSubClass MSZoning <...>n[1451 rows x 80 columns]
housing_num_df       DataFrame          LotFrontage  LotAre<...>n[1451 rows x 24 columns]
housing_one_hot_df   DataFrame          LotFrontage   LotAr<...>[1451 rows x 383 columns]
housing_orig_df      DataFrame         MSSubClass MSZoning <...>n[1451 rows x 80 columns]
listing_df           DataFrame            Id Street Alley  <...>n[1460 rows x 62 columns]
numeric_data_df      DataFrame          LotFrontage  LotAre<...>n[1451 rows x 24 columns]
numeric_log_df       DataFrame          LotFrontage    LotA<...>n[1451 rows x 24 columns]
numeric_scaled_df    DataFrame          LotFrontage   LotAr<...>n[1451 rows x 24 columns]
sale_df              DataFrame            Id  MoSold  YrSol<...>\n[1460 rows x 6 columns]
zoning_df            DataFrame            Id  MSSubClass MS<...>n[1460 rows x 15 columns]


In [4]:
# Candidate targets: every non-categorical column of housing_df except SalePrice.
numeric_cols = (
    housing_df
    .drop(columns=['SalePrice'])
    .select_dtypes(exclude=['category'])
    .columns
)

# Predictors come from the one-hot encoded frame, also without SalePrice.
housing_final_df = housing_one_hot_df.drop(columns=['SalePrice'])

print("Calculate R2 Score for {} features".format(len(numeric_cols)))

# Map each candidate column to the mean R2 obtained when predicting it
# from the remaining columns of housing_final_df.
r2_results = {}
for col in numeric_cols:
    r2 = mean_r2_for_feature(housing_final_df, col)
    r2_results[col] = r2
    print("{:24} {}".format(col, r2))

Calculate R2 Score for 23 features
LotFrontage              0.5093780376764045
LotArea                  0.5863224260278666
YearBuilt                0.8793809829006021
YearRemodAdd             0.47296660422379383
MasVnrArea               0.6037667257459874
BsmtFinSF1               0.5583438373590152
BsmtFinSF2               0.20174062083856442
BsmtUnfSF                0.40752723966851023
TotalBsmtSF              0.8774897114201746
FirstFlrSF               0.6634279688884662
SecondFlrSF              0.843154964033684
LowQualFinSF             -0.2499857053400143
GrLivArea                0.7070475560427545
GarageYrBlt              0.7030192545109573
GarageArea               0.8549521700289683
WoodDeckSF               0.031538033494748687
OpenPorchSF              0.23718816387045122
EnclosedPorch            0.09182654521345544
ThreeSsnPorch            -0.27326100227795813
ScreenPorch              -0.10572528954908829
PoolArea                 0.06131942922586968
MiscVal                  -0.0

In [5]:
# (feature, R2) pairs ordered from lowest to highest R2.
sorted_r2_results = sorted(r2_results.items(), key=lambda pair: pair[1])

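An R2 score close to 1 means a feature can be predicted well from the other features, i.e. it is largely redundant. Conversely, we might assume that the features with the smallest (or even negative) R2 scores cannot be reconstructed from the rest and therefore carry some unique information.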

In [6]:
# Print the ranking: position, feature name, R2 (lowest R2 first).
for i, (feature_name, feature_r2) in enumerate(sorted_r2_results):
    print("{}\t{}\t\t{}".format(i, feature_name, feature_r2))

0	ThreeSsnPorch		-0.27326100227795813
1	LowQualFinSF		-0.2499857053400143
2	YrSold		-0.14300685108151281
3	ScreenPorch		-0.10572528954908829
4	MiscVal		-0.08900988395681936
5	WoodDeckSF		0.031538033494748687
6	PoolArea		0.06131942922586968
7	EnclosedPorch		0.09182654521345544
8	BsmtFinSF2		0.20174062083856442
9	OpenPorchSF		0.23718816387045122
10	BsmtUnfSF		0.40752723966851023
11	YearRemodAdd		0.47296660422379383
12	LotFrontage		0.5093780376764045
13	BsmtFinSF1		0.5583438373590152
14	LotArea		0.5863224260278666
15	MasVnrArea		0.6037667257459874
16	FirstFlrSF		0.6634279688884662
17	GarageYrBlt		0.7030192545109573
18	GrLivArea		0.7070475560427545
19	SecondFlrSF		0.843154964033684
20	GarageArea		0.8549521700289683
21	TotalBsmtSF		0.8774897114201746
22	YearBuilt		0.8793809829006021


Next, let's look at each feature's correlation with our target, `SalePrice`, ranked by absolute value.

In [7]:
# Pairwise correlations of the one-hot encoded frame, plus a helper column
# holding the absolute correlation of every feature with SalePrice.
corr = housing_one_hot_df.corr()
corr['SalePrice-Abs'] = corr['SalePrice'].abs()

# Show the 20 strongest absolute correlations (SalePrice itself comes first).
(
    corr
    .sort_values(by=['SalePrice-Abs'], ascending=False)
    ['SalePrice-Abs']
    .head(20)
)

SalePrice           1.000000
GrLivArea           0.730620
FirstFlrSF          0.609835
ExterQual_TA        0.595146
YearBuilt           0.584085
FullBath_1          0.573606
YearRemodAdd        0.565244
KitchenQual_TA      0.537961
Foundation_PConc    0.529806
GarageCars_3        0.526388
ExterQual_Gd        0.511701
Fireplaces_0        0.511254
FireplaceQu_None    0.511254
FullBath_2          0.505694
GarageYrBlt         0.497614
BsmtQual_TA         0.477198
HeatingQC_Ex        0.466062
BsmtQual_Ex         0.462085
OpenPorchSF         0.459409
GarageArea          0.454535
Name: SalePrice-Abs, dtype: float64

In [8]:
# Features whose mean R2 exceeds 0.6, kept in ascending-R2 order.
redundant_features = [name for name, r2_score in sorted_r2_results if r2_score > 0.6]

In [9]:
def _saleprice_r2_without(dropped):
    """Mean R2 for predicting 'SalePrice' once the `dropped` columns are removed."""
    return mean_r2_for_feature(housing_one_hot_df.drop(dropped, axis=1), 'SalePrice')


# Baseline with every column, then progressively smaller sets of dropped features.
# redundant_features is in ascending-R2 order, so a trailing slice [-k:] picks
# the k features with the highest R2.
all_features_r2 = mean_r2_for_feature(housing_one_hot_df, 'SalePrice')
top8_redundant_features_r2 = _saleprice_r2_without(redundant_features)
top6_redundant_features_r2 = _saleprice_r2_without(redundant_features[-6:])
top3_redundant_features_r2 = _saleprice_r2_without(redundant_features[-3:])
top2_redundant_features_r2 = _saleprice_r2_without(redundant_features[-2:])
top1_redundant_features_r2 = _saleprice_r2_without(redundant_features[-1])

for template, value in [
    ("Original R2: {}", all_features_r2),
    ("Top 8 Redundant R2: {}", top8_redundant_features_r2),
    ("Top 6 Redundant R2: {}", top6_redundant_features_r2),
    ("Top 3 Redundant R2: {}", top3_redundant_features_r2),
    ("Top 2 Redundant R2: {}", top2_redundant_features_r2),
    ("Top 1 Redundant R2: {}", top1_redundant_features_r2),
]:
    print(template.format(value))

Original R2: 0.7947302802605359
Top 8 Redundant R2: 0.7870302060023142
Top 6 Redundant R2: 0.7805990947351924
Top 3 Redundant R2: 0.8073791343292573
Top 2 Redundant R2: 0.8039710376550513
Top 1 Redundant R2: 0.7957698365025647


### Top correlations with SalePrice after removing the six most redundant features

In [10]:
# Keep every column except the last six entries of redundant_features
# (the six with the highest R2).
most_redundant = set(redundant_features[-6:])
cols = [c for c in housing_one_hot_df.columns if c not in most_redundant]
no_redundant_features_df = housing_one_hot_df[cols]

# Recompute correlations on the reduced frame and rank by absolute
# correlation with SalePrice.
corr = no_redundant_features_df.corr()
corr['SalePrice-Abs'] = corr['SalePrice'].abs()
(
    corr
    .sort_values(by=['SalePrice-Abs'], ascending=False)
    ['SalePrice-Abs']
    .head(11)
)


SalePrice           1.000000
FirstFlrSF          0.609835
ExterQual_TA        0.595146
FullBath_1          0.573606
YearRemodAdd        0.565244
KitchenQual_TA      0.537961
Foundation_PConc    0.529806
GarageCars_3        0.526388
ExterQual_Gd        0.511701
FireplaceQu_None    0.511254
Fireplaces_0        0.511254
Name: SalePrice-Abs, dtype: float64