In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
def to_numeric(df, col, target="", test = False):

    group = df.groupby(col)
    group = group.aggregate({ target :'median'})
    group = group.sort_values(target)

    names = list(group.index)
    #print(names)
    id_dict = {}
    i = 0
    for name in names:
        #print(name)
        id_dict.update({name: i})
        i += 1
    print("Encoded values for: ", col)
    print(id_dict)
    #print(df[col].unique())
    df[col] = [id_dict[x] for x in df[col]]

    return df, id_dict

In [3]:
def to_numeric_test(df, col, id_dict):
    
    dict_keys = list(id_dict.keys())
    
    names = list(df[col].unique())
    #print(col, "*"*50)
    #print(dict_keys)
    #print(names)
    missing_values = [x for x in names if x not in dict_keys]
    
    #print(missing_values)
    i = len(dict_keys) // 2
    for name in missing_values:
        #print(name)
        id_dict.update({name: i})
    #print("Encoded values for: ", col)
    #print(id_dict)
    #print(df[col].unique())
    df[col] = [id_dict[x] for x in df[col]]

    return df, id_dict

In [4]:
housing = pd.read_csv('../data/clean_train.csv')

In [5]:
housing.drop('Id', axis=1, inplace=True)

In [7]:
housing.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,Othr,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IRG,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,Othr,RL,60.0,9550,Pave,,IRG,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IRG,Lvl,AllPub,Othr,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
needs_numeric = housing.loc[:, housing.dtypes == "object"]
needs_numeric_names = needs_numeric.columns

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,60,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,20,RL,Pave,,Reg,Lvl,AllPub,Othr,Gtl,Othr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,60,RL,Pave,,IRG,Lvl,AllPub,Inside,Gtl,CollgCr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,Othr,RL,Pave,,IRG,Lvl,AllPub,Corner,Gtl,Othr,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,60,RL,Pave,,IRG,Lvl,AllPub,Othr,Gtl,Othr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [10]:
colname = list(needs_numeric_names)
#print(colname)
dict_dictonary = {}

for col in colname:
    housing.col, id_dictonary = to_numeric(housing, col, 'SalePrice')
    dict_dictonary.update({col : id_dictonary})
#print(dict_dictonary)
housing.head()

Encoded values for:  MSSubClass
{'50': 0, 'Othr': 1, '20': 2, '120': 3, '60': 4}
Encoded values for:  MSZoning
{'RM': 0, 'RL': 1, 'Othr': 2}
Encoded values for:  Street
{'Othr': 0, 'Pave': 1}
Encoded values for:  Alley
{'Othr': 0, 'None': 1}
Encoded values for:  LotShape
{'Reg': 0, 'IRG': 1}
Encoded values for:  LandContour
{'Lvl': 0, 'Othr': 1}
Encoded values for:  Utilities
{'Othr': 0, 'AllPub': 1}
Encoded values for:  LotConfig
{'Inside': 0, 'Corner': 1, 'Othr': 2, 'CulDSac': 3}
Encoded values for:  LandSlope
{'Gtl': 0, 'Othr': 1}
Encoded values for:  Neighborhood
{'OldTown': 0, 'Edwards': 1, 'Sawyer': 2, 'NAmes': 3, 'Othr': 4, 'Gilbert': 5, 'CollgCr': 6, 'Somerst': 7, 'NridgHt': 8}
Encoded values for:  Condition1
{'Feedr': 0, 'Othr': 1, 'Norm': 2}
Encoded values for:  Condition2
{'Othr': 0, 'Norm': 1}
Encoded values for:  BldgType
{'Othr': 0, '1Fam': 1, 'TwnhsE': 2}
Encoded values for:  HouseStyle
{'1.5Fin': 0, 'Othr': 1, '1Story': 2, '2Story': 3}
Encoded values for:  OverallQual
{

  


Encoded values for:  RoofStyle
{'Gable': 0, 'Hip': 1, 'Othr': 2}
Encoded values for:  RoofMatl
{'CompShg': 0, 'Othr': 1}
Encoded values for:  Exterior1st
{'Wd Sdng': 0, 'MetalSd': 1, 'HdBoard': 2, 'Othr': 3, 'Plywood': 4, 'VinylSd': 5}
Encoded values for:  Exterior2nd
{'Wd Sdng': 0, 'MetalSd': 1, 'Othr': 2, 'HdBoard': 3, 'Plywood': 4, 'VinylSd': 5}
Encoded values for:  MasVnrType
{'None': 0, 'BrkFace': 1, 'Stone': 2}
Encoded values for:  ExterQual
{'TA': 0, 'Gd': 1, 'Othr': 2}
Encoded values for:  ExterCond
{'Othr': 0, 'Gd': 1, 'TA': 2}
Encoded values for:  Foundation
{'Othr': 0, 'BrkTil': 1, 'CBlock': 2, 'PConc': 3}
Encoded values for:  BsmtQual
{'Othr': 0, 'TA': 1, 'Gd': 2, 'Ex': 3}
Encoded values for:  BsmtCond
{'Othr': 0, 'TA': 1}
Encoded values for:  BsmtExposure
{'Othr': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}
Encoded values for:  BsmtFinType1
{'Othr': 0, 'LwQ': 1, 'BLQ': 2, 'Rec': 3, 'ALQ': 4, 'Unf': 5, 'GLQ': 6}
Encoded values for:  BsmtFinType2
{'Othr': 0, 'Unf': 1}
Encoded val

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,4,1,65.0,8450,1,1,0,0,1,0,...,0,0,2,1,0,2,2008,1,2,208500
1,2,1,80.0,9600,1,1,0,0,1,2,...,0,0,2,1,0,5,2007,1,2,181500
2,4,1,68.0,11250,1,1,1,0,1,0,...,0,0,2,1,0,9,2008,1,2,223500
3,1,1,60.0,9550,1,1,1,0,1,1,...,0,0,2,1,0,2,2006,1,0,140000
4,4,1,84.0,14260,1,1,1,0,1,2,...,0,0,2,1,0,12,2008,1,2,250000


In [8]:
housing.columns = housing.columns.str.lower()
# housing.dtypes

In [9]:
housing_features = housing.drop(['saleprice'], axis=1)
# housing_features.neighborhood = [str(x) for x in housing_features.neighborhood]
# housing_features.mssubclass = [str(x) for x in housing_features.mssubclass]
# housing_features = pd.get_dummies(housing_features, prefix=['nbhood', 'msclass'], 
#                                   columns = ['neighborhood', 'mssubclass'], 
#                                   drop_first = True)
feat_labels = housing_features.columns
print(feat_labels)

prices = [np.log(x) for x in housing.saleprice]

Index(['mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street', 'alley',
       'lotshape', 'landcontour', 'utilities', 'lotconfig', 'landslope',
       'neighborhood', 'condition1', 'condition2', 'bldgtype', 'housestyle',
       'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd', 'roofstyle',
       'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype', 'masvnrarea',
       'exterqual', 'extercond', 'foundation', 'bsmtqual', 'bsmtcond',
       'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1', 'bsmtfintype2',
       'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating', 'heatingqc',
       'centralair', 'electrical', 'x1stflrsf', 'x2ndflrsf', 'lowqualfinsf',
       'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath', 'halfbath',
       'bedroomabvgr', 'kitchenabvgr', 'kitchenqual', 'totrmsabvgrd',
       'functional', 'fireplaces', 'fireplacequ', 'garagetype', 'garageyrblt',
       'garagefinish', 'garagecars', 'garagearea', 'garagequal', 'garagecond',
       'paveddrive', '

In [10]:
# Split into training and test set
features_train, features_test, price_train, price_test = train_test_split(housing_features, prices, test_size = 0.33)

In [11]:
clf = RandomForestRegressor(n_estimators=10000, random_state=0, n_jobs=-1, min_samples_split = 10)

sfm = SelectFromModel(clf, threshold = 0.005)

sfm.fit(features_train, price_train)

SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=0.005)

In [12]:
feature_selected = []
for feature_list_index in sfm.get_support(indices=True):
    feature_selected.append(feat_labels[feature_list_index])
    print(feature_selected)

['lotfrontage']
['lotfrontage', 'lotarea']
['lotfrontage', 'lotarea', 'neighborhood']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'overallcond']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'overallcond', 'yearbuilt']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'overallcond', 'yearbuilt', 'bsmtfinsf1']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'overallcond', 'yearbuilt', 'bsmtfinsf1', 'totalbsmtsf']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'overallcond', 'yearbuilt', 'bsmtfinsf1', 'totalbsmtsf', 'x1stflrsf']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'overallcond', 'yearbuilt', 'bsmtfinsf1', 'totalbsmtsf', 'x1stflrsf', 'grlivarea']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'overallcond', 'yearbuilt', 'bsmtfinsf1', 'totalbsmtsf', 'x1stflrsf', 'grlivarea', 'fullbath']
['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'o

In [13]:
proxy = feature_selected

In [14]:
trimmed = ['x1stflrsf', 'x2ndflrsf', 'garagecars', 'overallcond', 'saleprice', 'Unnamed: 0', 'bsmtfinsf1']

# proxy.append("neighborhood")
# proxy.append("mssubclass")

housing_features = housing.drop('saleprice', axis=1)
housing_features = housing_features[proxy]

trimmed_features = [x for x in housing_features.columns if (x not in trimmed)]

print(trimmed_features)

housing_features = housing_features[trimmed_features]

# housing_features.neighborhood = [str(x) for x in housing_features.neighborhood]
# housing_features.mssubclass = [str(x) for x in housing_features.mssubclass]
# housing_features = pd.get_dummies(housing_features, prefix=['nbhood', 'msclass'], 
#                                   columns = ['neighborhood', 'mssubclass'], 
#                                   drop_first = True)

# housing_features['mssubclass_150'] = 0

feat_labels = housing_features.columns
print(feat_labels)

prices = [np.log(x) for x in housing.saleprice]

['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'yearbuilt', 'totalbsmtsf', 'grlivarea', 'fullbath', 'garagetype', 'garagearea']
Index(['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'yearbuilt',
       'totalbsmtsf', 'grlivarea', 'fullbath', 'garagetype', 'garagearea'],
      dtype='object')


In [15]:
features_train, features_test, price_train, price_test = train_test_split(housing_features, prices, test_size = 0.33)

In [16]:
linear = LinearRegression()

print(np.mean(cross_val_score(linear, features_train, price_train, cv=5)))

linear.fit(housing_features, prices)

price_pred = linear.predict(features_test)


print(mean_squared_error(price_test, price_pred))

0.8573443405471034
0.03163316248776302


In [17]:
housing_test = pd.read_csv('../data/clean_test.csv')

In [18]:
needs_numeric2 = housing_test.loc[:, housing_test.dtypes == "object"]
needs_numeric_names2 = needs_numeric2.columns

In [19]:
colname2 = list(needs_numeric_names2)
print(colname2)
dict_dictonary2 = {}

for col in colname2:
    housing_test.col, id_dictonary = to_numeric_test(housing_test, col, dict_dictonary[col])
    dict_dictonary2.update({col : id_dictonary})
# print(dict_dictonary)
# housing_test.head()
print("")

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]



  


In [21]:
housing_test.columns = housing_test.columns.str.lower()

In [22]:
trimmed = ['x1stflrsf', 'x2ndflrsf', 'garagecars', 'overallcond', 'saleprice', 'Unnamed: 0', 'bsmtfinsf1']
trimmed_features = [x for x in feature_selected if (x not in trimmed)]

housing_featuresT = housing_test.drop(['id'], axis=1)
housing_featuresT = housing_featuresT[housing_features.columns]
# housing_features['neighborhood'] = [str(x) for x in housing_test.neighborhood]
# housing_features['mssubclass'] = [str(x) for x in housing_test.mssubclass]
# housing_features = pd.get_dummies(housing_features, prefix=['nbhood', 'msclass'], 
#                                   columns = ['neighborhood', 'mssubclass'], 
#                                   drop_first = True)
feat_labels = housing_features.columns
print(feat_labels)

Index(['lotfrontage', 'lotarea', 'neighborhood', 'overallqual', 'yearbuilt',
       'totalbsmtsf', 'grlivarea', 'fullbath', 'garagetype', 'garagearea'],
      dtype='object')


In [29]:
price_pred = linear.predict(housing_featuresT)

In [30]:
results = np.exp(price_pred)

In [31]:
len(results)

1459

In [32]:
submission = pd.DataFrame(columns = ['Id', 'SalePrice'])
submission['Id'] = housing_test.id

In [33]:
len(submission['Id'])

1459

In [34]:
submission['SalePrice'] = results

In [35]:
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,133638.113734
1,1462,155425.454683
2,1463,164200.932929
3,1464,176230.557655
4,1465,226650.694028


In [36]:
submission.to_csv('submission.csv', index=False)