In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import operator
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# Load the pre-processed housing data and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a shared/relative data location.
housing_csv_path = '/Users/samuelclark/Documents/UMBC Data Science /Machine learning/Homework/housing_processed.csv'
housing = pd.read_csv(housing_csv_path)
housing.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,GarageType_NA,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,1,60,65.0,8450,7,5,2003,2003,196.0,Gd,...,0,0,0,0,0,0,0,0,0,1
1,2,20,80.0,9600,6,8,1976,1976,0.0,TA,...,0,0,0,0,0,0,0,0,0,1
2,3,60,68.0,11250,7,5,2001,2002,162.0,Gd,...,0,0,0,0,0,0,0,0,0,1
3,4,70,60.0,9550,7,5,1915,1970,0.0,TA,...,0,0,0,0,0,0,0,0,0,1
4,5,60,84.0,14260,8,5,2000,2000,350.0,Gd,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Data exploration: list the columns whose dtype is `object`.
column_dtypes = housing.dtypes
column_dtypes[column_dtypes == object]

ExterQual       object
ExterCond       object
BsmtQual        object
BsmtCond        object
BsmtExposure    object
BsmtFinType1    object
BsmtFinType2    object
HeatingQC       object
CentralAir      object
KitchenQual     object
Functional      object
FireplaceQu     object
GarageFinish    object
GarageQual      object
GarageCond      object
PavedDrive      object
dtype: object

In [4]:
# Keep only the columns whose dtype is not `object`; the modelling cells
# below convert this frame to a numeric NumPy array.
is_not_object_column = housing.dtypes != object
housing_ml = housing.loc[:, is_not_object_column]

In [5]:
# Split data into features (X) and target (y).
# The target is read from the untouched `housing` frame and removed from the
# feature frame without in-place mutation. The previous `housing_ml.pop(...)`
# raised KeyError when this cell was run a second time; this version is
# idempotent (`errors='ignore'` makes the drop a no-op on re-runs).
sale_price = housing['SalePrice']
housing_ml = housing_ml.drop(columns='SalePrice', errors='ignore')

# Convert from pandas objects to NumPy arrays.
# NOTE(review): dtype=int truncates any fractional values in float columns
# (e.g. LotFrontage, MasVnrArea) -- confirm that this is intended.
y = sale_price.to_numpy(dtype=int)
X = housing_ml.to_numpy(dtype=int)

In [6]:
# Train/test split: 80% of the rows for training, fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=21
)

# Sanity-check the shapes and dimensionality of each piece.
split_parts = (x_train, y_train, x_test, y_test)
print(*(part.shape for part in split_parts))
print(*(part.ndim for part in split_parts))


(1168, 221) (1168,) (292, 221) (292,)
2 1 2 1


In [7]:
# Linear regression using every feature; report the test-set RMSE.
model_all = LinearRegression().fit(x_train, y_train)
y_pred = model_all.predict(x_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
rmse_all = float(np.sqrt(mean_squared_error(y_test, y_pred)))
rmse_all

35623.30345000818

In [8]:
# 5-nearest-neighbours regression using every feature; report the test-set RMSE.
model_5nn = KNeighborsRegressor(n_neighbors=5).fit(x_train, y_train)
y_pred = model_5nn.predict(x_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
rmse_5nn = float(np.sqrt(mean_squared_error(y_test, y_pred)))
rmse_5nn

51966.07983368139

In [9]:
# 10-nearest-neighbours regression using every feature; report the test-set RMSE.
model_10nn = KNeighborsRegressor(n_neighbors=10).fit(x_train, y_train)
y_pred = model_10nn.predict(x_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
rmse_10nn = float(np.sqrt(mean_squared_error(y_test, y_pred)))
rmse_10nn

52392.16317256475

In [10]:
# Fit a one-feature linear regression for every column of the design matrix,
# then keep the ten features with the highest test-set score (R^2).
# Each record is [feature name, score, RMSE, column index].
# Assumes `housing_ml` no longer contains the target column, so its columns
# line up one-to-one with the columns of x_train / x_test.
feature_list = list(housing_ml.columns)

featureDict = {}

# Derive the column count from the data instead of hard-coding 221.
for i in range(x_train.shape[1]):

    # scikit-learn estimators need 2-D input: turn the single column into (n, 1).
    x_tr = x_train[:, i].reshape(-1, 1)
    x_tst = x_test[:, i].reshape(-1, 1)

    model = LinearRegression()
    model.fit(x_tr, y_train)
    y_pred = model.predict(x_tst)

    score = model.score(x_tst, y_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # the square root of the MSE is the same RMSE on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    featureDict[i] = [feature_list[i], score, rmse, i]

# Sort by score (position 1 of each record), best first, and keep the top 10.
results_sorted = sorted(featureDict.values(), key=operator.itemgetter(1), reverse=True)[:10]
results_sorted

[['OverallQual', 0.6454631197278844, 49018.43578812499, 4],
 ['ExterQual_Coded', 0.4991753471267155, 58260.148655350655, 36],
 ['GrLivArea', 0.444289666781406, 61369.55515927838, 16],
 ['KitchenQual_Coded', 0.43970293723342646, 61622.30113827634, 44],
 ['TotalBsmtSF', 0.42491266111423953, 62430.33167581401, 12],
 ['1stFlrSF', 0.4166679048480356, 62876.2562386846, 13],
 ['GarageCars', 0.41193116659665807, 63131.022100901646, 25],
 ['GarageArea', 0.4073263621599067, 63377.71018105261, 26],
 ['BsmtQual_Coded', 0.340970014826262, 66831.5181312807, 38],
 ['GarageFinish_Coded', 0.3162849544224505, 68071.65468394081, 47]]

In [11]:
# Collect the column index (last field of each record) of the top-10 features.
index_top_10 = [record[3] for record in results_sorted]
print(index_top_10)

[4, 36, 16, 44, 12, 13, 25, 26, 38, 47]


In [12]:
# Every unordered pair of column indices drawn from the ten best features.
pairs_of_two = [pair for pair in combinations(index_top_10, 2)]
print(pairs_of_two)

[(4, 36), (4, 16), (4, 44), (4, 12), (4, 13), (4, 25), (4, 26), (4, 38), (4, 47), (36, 16), (36, 44), (36, 12), (36, 13), (36, 25), (36, 26), (36, 38), (36, 47), (16, 44), (16, 12), (16, 13), (16, 25), (16, 26), (16, 38), (16, 47), (44, 12), (44, 13), (44, 25), (44, 26), (44, 38), (44, 47), (12, 13), (12, 25), (12, 26), (12, 38), (12, 47), (13, 25), (13, 26), (13, 38), (13, 47), (25, 26), (25, 38), (25, 47), (26, 38), (26, 47), (38, 47)]


In [16]:
# Fit a two-feature linear regression for every pair of top-10 features.
# Each record is ["nameA+nameB", score (R^2), RMSE, [column index A, column index B]],
# keyed by the same "nameA+nameB" label.
feature_list = housing_ml.columns

pairsFeatureDict = {}

for two in pairs_of_two:

    # A list (not a tuple) selects both columns in one fancy-index operation.
    i = list(two)

    x_tr = x_train[:, i]
    x_tst = x_test[:, i]

    model = LinearRegression()
    model.fit(x_tr, y_train)
    y_pred = model.predict(x_tst)

    score = model.score(x_tst, y_test)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # the square root of the MSE is the same RMSE on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    indx = "+".join(feature_list[i])

    pairsFeatureDict[indx] = [indx, score, rmse, i]

In [17]:
# Dump every pair's [label, score, rmse, column indices] record.
print(pairsFeatureDict)

{'OverallQual+ExterQual_Coded': ['OverallQual+ExterQual_Coded', 0.6680147343521529, 47433.82463056837, [4, 36]], 'OverallQual+GrLivArea': ['OverallQual+GrLivArea', 0.6921986779040228, 45673.46700931651, [4, 16]], 'OverallQual+KitchenQual_Coded': ['OverallQual+KitchenQual_Coded', 0.6730869536074611, 47070.072304423644, [4, 44]], 'OverallQual+TotalBsmtSF': ['OverallQual+TotalBsmtSF', 0.706791108152437, 44577.66409611639, [4, 12]], 'OverallQual+1stFlrSF': ['OverallQual+1stFlrSF', 0.7173882583039417, 43764.68831784155, [4, 13]], 'OverallQual+GarageCars': ['OverallQual+GarageCars', 0.6723247870365774, 47124.910051666164, [4, 25]], 'OverallQual+GarageArea': ['OverallQual+GarageArea', 0.6813946935673894, 46468.13490082075, [4, 26]], 'OverallQual+BsmtQual_Coded': ['OverallQual+BsmtQual_Coded', 0.6566660974647345, 48237.754631121876, [4, 38]], 'OverallQual+GarageFinish_Coded': ['OverallQual+GarageFinish_Coded', 0.6519020465859777, 48571.27166300009, [4, 47]], 'ExterQual_Coded+GrLivArea': ['Exte