# Getting a Model

Interesting Observations About Dataset
- May be imbalanced data - what percent of houses are sold within 60 days?
- Time series data - there are definitely times of the year where the market is hotter than other times

### Imports

In [2]:
%run src/imports.py
from src.helpers import calculate_five_percent, calculate_ten_percent, calculate_twenty_percent

Set image size

In [3]:
# Default figure size (width, height) applied to every plot drawn below.
figure_rc = {'figure.figsize': (16, 4)}
sns.set(rc=figure_rc)

**Read in Data**

In [4]:
# Load the pickled listings DataFrame from the project's data directory.
neighborhoods_cleaned = pd.read_pickle(filepath_or_buffer='data/neighborhoods_cleaned.pkl')

In [5]:
# Renumber rows 0..n-1; the old index is materialised as an 'index' column and then discarded.
neighborhoods_cleaned = neighborhoods_cleaned.reset_index().drop(columns='index')

In [6]:
# Peek at the first five rows to sanity-check the load.
neighborhoods_cleaned.head(n=5)

Unnamed: 0,SOLD DATE,ZIP OR POSTAL CODE,PRICE,LOCATION,SQUARE FEET,LOT SIZE,YEAR BUILT,HOA/MONTH,LATITUDE,LONGITUDE,ROOMS
0,September-25-2020,78257,345000,The Dominion,1989.0,5662.0,2005.0,230.0,29.647801,-98.614186,4.0
1,June-11-2020,78256,337500,Stonewall Ranch,2166.0,9583.0,2004.0,54.0,29.650304,-98.629082,6.0
2,March-11-2021,78256,375500,Stonewall Ranch,2969.0,6605.0,2004.0,54.0,29.651842,-98.63065,6.5
3,October-31-2018,78255,462000,The Ridge @ Sonoma Verde,2716.0,9147.0,2014.0,95.0,29.61765,-98.64343,7.0
4,February-26-2021,78256,624000,Crownridge,3012.0,75794.0,2000.0,25.0,29.620646,-98.622295,5.5


More data cleaning
- change sold date to month and year
- fill missing HOA values with zeros
- drop any lines with incomplete information

In [7]:
# Derive MONTH and YEAR features from the sale date, then discard the raw date column.
sold_date = pd.to_datetime(neighborhoods_cleaned['SOLD DATE'])
neighborhoods_cleaned['MONTH'] = sold_date.dt.month
neighborhoods_cleaned['YEAR'] = sold_date.dt.year
neighborhoods_cleaned = neighborhoods_cleaned.drop(columns=['SOLD DATE'])

# A missing HOA fee is filled with 0. The result is assigned back to the column instead of
# calling fillna(inplace=True) on a column selection: that form is chained assignment, which
# under pandas Copy-on-Write modifies a temporary object and leaves the frame unchanged.
neighborhoods_cleaned['HOA/MONTH'] = neighborhoods_cleaned['HOA/MONTH'].fillna(0)

# Drop every row that still has a missing value in any column.
neighborhoods_cleaned = neighborhoods_cleaned.dropna()

Create dataframe for flask app

In [8]:
# Frame for the Flask app: same rows and columns, minus the LOCATION (neighborhood name) column.
nflask_df = neighborhoods_cleaned.drop(columns='LOCATION')

Create dummy variables for neighborhoods

In [9]:
# Preview the one-hot encoding of LOCATION (first level dropped as the reference category).
location_column = neighborhoods_cleaned['LOCATION']
pd.get_dummies(location_column, drop_first=True)

Unnamed: 0,Alamo Heights,Alamo Ranch,Alta Vista,Amhurst,Apple Creek,Arbor At Sonoma Ranch,Arcadia Ridge,Asher Place,Auburn Hills At Woodcrest,Autumn Run,...,Wildhorse,Wildhorse At Tausch Farms,Willow Grove Sub (sc),Wilshire Terrace,Wilshire Village,Windcrest,Wolf Creek,Woodcrest,Woodlake,Woods Of Shavano
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20007,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Merging dummies with original dataframe

In [10]:
# One-hot encode LOCATION and attach the indicator columns to the frame, aligned on the row index.
location_dummies = pd.get_dummies(neighborhoods_cleaned['LOCATION'], drop_first=True)
neighborhoods_cleaned = pd.merge(
    neighborhoods_cleaned,
    location_dummies,
    how='outer',
    left_index=True,
    right_index=True,
)

In [11]:
# The text LOCATION column is no longer needed once its indicator columns are in place.
neighborhoods_cleaned = neighborhoods_cleaned.drop(columns=['LOCATION'])

---

### Taking price outliers out of dataset

In [12]:
# Two price-capped subsets (PRICE below 500000 and below 290000) used by the lower-cost models.
price = neighborhoods_cleaned['PRICE']
lowcost = neighborhoods_cleaned.loc[price < 500000]
lowercost = neighborhoods_cleaned.loc[price < 290000]

In [29]:
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

---

Running initial train-test-split

In [30]:
# Target is PRICE; every other column of the Flask frame is a predictor.
y = nflask_df.PRICE.values
X = nflask_df.drop('PRICE', axis=1).values

# Hold out 20% of the rows, then split the remainder 67/33 into train and test.
# Both splits are seeded so the partitions (and every score below) are the same on re-run.
X_train_full, X_holdout, y_train_full, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full, y_train_full, test_size=0.33, random_state=42
)

In [31]:
X_test[0]

array([ 7.82560000e+04,  3.01200000e+03,  7.57940000e+04,  2.00000000e+03,
        2.50000000e+01,  2.96206464e+01, -9.86222951e+01,  5.50000000e+00,
        2.00000000e+00,  2.02100000e+03])

## Models

Random Forest Regressor Model

In [32]:
# Fit a default random forest on the training split and predict the test and holdout splits.
# random_state is pinned so the fitted forest and its predictions are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
y_holdout_predict = rfr.predict(X_holdout)

Test score: 0.8807534399383761
Holdout score: 0.8555934605318467
Test MSE: 3.163439e+09
Holdout MSE: 4.234660e+09
Test 5%: 44.924488625501816
Holdout 5%: 44.76406762553621
Test 10%: 67.08086407952591
Holdout 10%: 67.65076961897553
Test 20%: 85.89179889122539
Holdout 10%: 87.13096139288417
0.8807534399383761


In [37]:
# Report test-split metrics for the fitted forest.
test_r2 = rfr.score(X_test, y_test)
test_mse = mean_squared_error(y_test, y_predict)
print("Test R^2 score:", test_r2)
print('Test MSE: {:2e}'.format(test_mse))
# The helpers presumably return the share of predictions within N% of the true price -- TODO confirm.
for label, within_pct in (
    ("Test 5%:", calculate_five_percent),
    ("Test 10%:", calculate_ten_percent),
    ("Test 20%:", calculate_twenty_percent),
):
    print(label, within_pct(y_predict, y_test))

Test R^2 score: 0.8807534399383761
Test MSE: 3.163439e+09
Test 5%: 44.924488625501816
Test 10%: 67.08086407952591
Test 20%: 85.89179889122539


In [36]:
# Report holdout-split metrics for the fitted forest.
holdout_r2 = rfr.score(X_holdout, y_holdout)
holdout_mse = mean_squared_error(y_holdout, y_holdout_predict)
print("Holdout R^2 score:", holdout_r2)
print('Holdout MSE: {:2e}'.format(holdout_mse))
# The helpers presumably return the share of predictions within N% of the true price -- TODO confirm.
for label, within_pct in (
    ("Holdout 5%:", calculate_five_percent),
    ("Holdout 10%:", calculate_ten_percent),
    ("Holdout 20%:", calculate_twenty_percent),
):
    print(label, within_pct(y_holdout_predict, y_holdout))

Holdout R^2 score: 0.8555934605318467
Holdout MSE: 4.234660e+09
Holdout 5%: 44.76406762553621
Holdout 10%: 67.65076961897553
Holdout 20%: 87.13096139288417


In [22]:
# pickle the model
# Persist the fitted forest to disk as a pickle.
model_path = 'data/model_rfr_full.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(rfr, model_file)

Random Forest Regressor Model using standard scaler, pipeline and gridsearchCV

In [38]:
# Rebuild predictors/target from the Flask frame and make a single 80/20 split for the grid search.
# The split is seeded so the same rows land in train/test on every run.
X = nflask_df.drop('PRICE', axis=1).values
y = nflask_df.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Grid-search forest size and depth with K-fold cross-validation (KFold defaults), scored by negative MSE.
kf = KFold()
# The forest is seeded so that candidates differ only by their hyper-parameters, not by sampling noise.
rfr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('regressor', RandomForestRegressor(random_state=42))])
rfr_grid_params = {'regressor__n_estimators': np.arange(50, 250, 50),
                   'regressor__max_depth': np.arange(1, 21, 5)}
rfr_grid = GridSearchCV(rfr_pipe, rfr_grid_params, cv=kf, scoring='neg_mean_squared_error')
rfr_grid.fit(X_train, y_train)
# predict() goes through the estimator refit with the best parameter combination.
y_predict = rfr_grid.predict(X_test)

In [43]:
# rfr_grid.score() returns the *negative* MSE (scoring='neg_mean_squared_error'); multiplying by -1
# yields the ordinary positive MSE, so the label says "MSE" rather than "Negative MSE".
print("rfr_pipe MSE: {:.2e}".format(-1 * rfr_grid.score(X_test, y_test)))
print("Within 5%", calculate_five_percent(y_predict, y_test))
print("Within 10%", calculate_ten_percent(y_predict, y_test))
print("Within 20%", calculate_twenty_percent(y_predict, y_test))
print("R^2 score", r2_score(y_test, y_predict))

rfr_pipe Negative MSE: 2.38e+09
Within 5% 38.70804945748171
Within 10% 63.89099167297502
Within 20% 86.32349230381024
R^2 score 0.9041970039777706


In [44]:
rfr_grid.best_params_

{'regressor__max_depth': 11, 'regressor__n_estimators': 150}

-----

Taking just three features of dataset

In [26]:
# Keep PRICE, SQUARE FEET, YEAR BUILT and the neighborhood indicator columns; drop everything else.
dropped_for_three = [
    'ZIP OR POSTAL CODE', 'LATITUDE', 'LONGITUDE', 'HOA/MONTH',
    'ROOMS', 'LOT SIZE', 'MONTH', 'YEAR',
]
threefeatures = neighborhoods_cleaned.drop(columns=dropped_for_three)

In [27]:
# Display the reduced frame to confirm which columns remain after the drop.
threefeatures

Unnamed: 0,PRICE,SQUARE FEET,YEAR BUILT,Alamo Heights,Alamo Ranch,Alta Vista,Amhurst,Apple Creek,Arbor At Sonoma Ranch,Arcadia Ridge,Asher Place,Auburn Hills At Woodcrest,Autumn Run,Beacon Hill,Bella Vista,Bellaire,Blue Rock Springs,Braun Ridge,Bricewood,Bridgewood,Bulverde Village,Camelot,Camelot Ii,Candlewood Park,Canyons At Amhurst,Champion Springs,Champions Park,Cibolo Canyons,Cimarron,Copper Canyon,Copperfield,Cross Creek,Crownridge,Cupples/zarzamora,Deerfield,Dellcrest,Denver Heights,Dignowity,Donaldson Terrace,Durango/roosevelt,East Terrell Hills,East Village,Eastwood Village,Elm Creek,Encino Park,Escondido North,Escondido/parc At,Foster Meadows,Fox Grove,Fronterra At Westpointe,Fronterra At Westpointe - Bexa,Government Hill,Grandview,Hallies Ranch,Hanover Cove,Harlandale,Heather Place,Heritage Park,Hidden Trails,Highland Farms,Highland Hills,Highland Park,Hill Country Retreat,Hillcrest,Hot Wells,Hunters Chase,Hunters Creek,Huntington Place,I35 So. To E. Houston (sa),Indian Springs,Jefferson Terrace,Johnson Ranch - Comal,Kinder Ranch,King William,Knox Ridge,Lackland Terrace,Ladera,Lakeside,Las Lomas,Laura Heights,Laurel Canyon,Laurel Mountain Ranch,Lavaca,Live Oak Village,Longs Creek,Los Angeles Heights,Macarthur Park,Mahncke Park,Meadow Oaks,Meadow Park,Meadows At Bridgewood,Millers Point,Miramar Unit 1,Mission Creek,Mission Del Lago,Monte Viejo,Monte Vista,Monterrey Village,Monticello Heights,Mustang Valley,N/a,Near Eastside,Northampton,Northcrest Hills,Northeast Crossing,Northwood,Not In Defined Subdivision,Oakland Heights,Oaks At Sonterra,Olmos Park,Olympia,Palm Heights,Park Village,Parkwood,Pasadena Heights,Pecan Valley,Potranco Ranch Medina County,Preserve At Medina,Presidio,Redbird Ranch,Republic Creek,Republic Meadows,Republic Oaks,Rhine Valley,Ridgeview,Riposa Vita,River Bend,Rogers Ranch,Rolling Creek,Rosillo Creek,Royal Ridge,S Durango/probandt,Shaenfield Place,Shavano Park,Shavano Ridge,Sierra Springs,Silver Canyon,Silver Creek,Silver Oaks,Sonoma 
Ranch,Spring Vistas,Springwood,Steubing Farm Ut-7 (enclave) B,Steubing Ranch,Stillwater Ranch,Stonewall Ranch,Stoney Creek,Summerhill,Sunrise,Tanglewood,Terrell Heights,Terrell Hills,Texas Research Park,The Dominion,The Glen,The Park At University Hills,The Preserve At Indian Springs,The Ridge @ Sonoma Verde,The Ridge At Salado Creek,The Villages At Stone Oak,The Vineyard,Tobin Hill,Valley Forge,Valley Ranch - Bexar County,Ventura,Westcreek,Westwinds-summit At Alamo Ranc,Wheatley Heights,Wildhorse,Wildhorse At Tausch Farms,Willow Grove Sub (sc),Wilshire Terrace,Wilshire Village,Windcrest,Wolf Creek,Woodcrest,Woodlake,Woods Of Shavano
0,345000,1989.0,2005.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,337500,2166.0,2004.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,375500,2969.0,2004.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,462000,2716.0,2014.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,624000,3012.0,2000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20007,153000,1577.0,1983.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20008,189900,1703.0,2018.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20009,128000,1102.0,1985.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20010,196499,2324.0,2014.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Running train-test-split on three features

In [28]:
# Predictors/target for the reduced feature set.
X = threefeatures.drop('PRICE', axis=1).values
y = threefeatures.PRICE.values

# Hold out 20% of the rows, then split the remainder 67/33 into train and test.
# Both splits are seeded so the partitions are identical on every run.
X_train_full, X_holdout, y_train_full, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full, y_train_full, test_size=0.33, random_state=42
)

Regressor model with three features

In [20]:
# Fit a default random forest on the reduced feature set and predict the test and holdout splits.
# random_state is pinned so the fitted forest and its predictions are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
y_holdout_predict = rfr.predict(X_holdout)

score: 0.8869095280897749
MSE: 3.171610e+09
43.387845750050474
66.46476882697355
85.66525338178882
0.8869095280897749


In [None]:
# Report test-split metrics for the forest trained on the reduced feature set.
test_r2 = rfr.score(X_test, y_test)
test_mse = mean_squared_error(y_test, y_predict)
print("Test R^2 score:", test_r2)
print('Test MSE: {:2e}'.format(test_mse))
# The helpers presumably return the share of predictions within N% of the true price -- TODO confirm.
for label, within_pct in (
    ("Test 5%:", calculate_five_percent),
    ("Test 10%:", calculate_ten_percent),
    ("Test 20%:", calculate_twenty_percent),
):
    print(label, within_pct(y_predict, y_test))

In [None]:
# Report holdout-split metrics for the forest trained on the reduced feature set.
holdout_r2 = rfr.score(X_holdout, y_holdout)
holdout_mse = mean_squared_error(y_holdout, y_holdout_predict)
print("Holdout R^2 score:", holdout_r2)
print('Holdout MSE: {:2e}'.format(holdout_mse))
# The helpers presumably return the share of predictions within N% of the true price -- TODO confirm.
for label, within_pct in (
    ("Holdout 5%:", calculate_five_percent),
    ("Holdout 10%:", calculate_ten_percent),
    ("Holdout 20%:", calculate_twenty_percent),
):
    print(label, within_pct(y_holdout_predict, y_holdout))

In [None]:
# Single 80/20 split of the reduced feature set for the grid search below.
# Seeded so the same rows land in train/test on every run.
X = threefeatures.drop('PRICE', axis=1).values
y = threefeatures.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# Grid-search forest size and depth on the reduced feature set, scored by negative MSE.
kf = KFold()
# warm_start was dropped: GridSearchCV clones the pipeline before every fit, so there is never an
# earlier forest to extend. random_state is pinned so candidates are compared without sampling noise.
rfr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('regressor', RandomForestRegressor(random_state=42))])
rfr_grid_params = {'regressor__n_estimators': np.arange(50, 250, 10),
                   'regressor__max_depth': np.arange(1, 15, 3)}
rfr_grid = GridSearchCV(rfr_pipe, rfr_grid_params, cv=kf, scoring='neg_mean_squared_error')
rfr_grid.fit(X_train, y_train)
y_predict = rfr_grid.predict(X_test)
# With this scoring, score() is the negative MSE -- not an accuracy -- so it is labelled as such.
print("rfr negative MSE: {:.2e}".format(rfr_grid.score(X_test, y_test)))
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))

rfrAccuracy: -4.27e+09
MSE: 4.268909e+09
23.864324651726225
44.3165758126388
73.3898647284474
0.8099481008471533


----

Pulling only 5 most correlated features

In [45]:
# Keep PRICE, SQUARE FEET, YEAR BUILT, MONTH, YEAR and the neighborhood indicator columns.
dropped_for_five = [
    'ZIP OR POSTAL CODE', 'LATITUDE', 'LONGITUDE',
    'HOA/MONTH', 'ROOMS', 'LOT SIZE',
]
fivefeatures = neighborhoods_cleaned.drop(columns=dropped_for_five)
fivefeatures

Unnamed: 0,PRICE,SQUARE FEET,YEAR BUILT,MONTH,YEAR,Alamo Heights,Alamo Ranch,Alta Vista,Amhurst,Apple Creek,...,Wildhorse,Wildhorse At Tausch Farms,Willow Grove Sub (sc),Wilshire Terrace,Wilshire Village,Windcrest,Wolf Creek,Woodcrest,Woodlake,Woods Of Shavano
0,345000,1989.0,2005.0,9,2020,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,337500,2166.0,2004.0,6,2020,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,375500,2969.0,2004.0,3,2021,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,462000,2716.0,2014.0,10,2018,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,624000,3012.0,2000.0,2,2021,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20007,153000,1577.0,1983.0,9,2019,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20008,189900,1703.0,2018.0,7,2018,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20009,128000,1102.0,1985.0,3,2019,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20010,196499,2324.0,2014.0,3,2019,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Predictors/target for the five-feature frame; default 75/25 split.
# Seeded so the same rows land in train/test on every run.
X = fivefeatures.drop('PRICE', axis=1).values
y = fivefeatures.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [47]:
# Fit a default random forest on the five-feature split and report test-set metrics.
# random_state is pinned so the forest and the numbers printed below are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.8569788933330466
MSE: 3.212512e+09
44.33676559660812
66.48495861094287
85.1201292146174
0.8569788933330466


In [49]:
# Drop the zip code, coordinates and HOA fee; keep the remaining columns plus the neighborhood indicators.
dropped_for_seven = ['ZIP OR POSTAL CODE', 'LATITUDE', 'LONGITUDE', 'HOA/MONTH']
seven = neighborhoods_cleaned.drop(columns=dropped_for_seven)
seven

Unnamed: 0,PRICE,SQUARE FEET,LOT SIZE,YEAR BUILT,ROOMS,MONTH,YEAR,Alamo Heights,Alamo Ranch,Alta Vista,...,Wildhorse,Wildhorse At Tausch Farms,Willow Grove Sub (sc),Wilshire Terrace,Wilshire Village,Windcrest,Wolf Creek,Woodcrest,Woodlake,Woods Of Shavano
0,345000,1989.0,5662.0,2005.0,4.0,9,2020,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,337500,2166.0,9583.0,2004.0,6.0,6,2020,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,375500,2969.0,6605.0,2004.0,6.5,3,2021,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,462000,2716.0,9147.0,2014.0,7.0,10,2018,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,624000,3012.0,75794.0,2000.0,5.5,2,2021,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20007,153000,1577.0,6969.0,1983.0,5.0,9,2019,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20008,189900,1703.0,4791.0,2018.0,6.0,7,2018,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20009,128000,1102.0,5662.0,1985.0,5.0,3,2019,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20010,196499,2324.0,5662.0,2014.0,7.0,3,2019,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Split the `seven` frame 75/25, fit a default random forest, and report test-set metrics.
# Both the split and the forest are seeded so the numbers printed below are reproducible on re-run.
X = seven.drop('PRICE', axis=1).values
y = seven.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.8724925813308787
MSE: 2.884614e+09
45.124167171411266
66.76761558651323
85.76620230163537
0.8724925813308787


___

## Lower Cost Modeling

In [21]:
# Predictors/target for listings priced below 500000; default 75/25 split.
# Seeded so the same rows land in train/test on every run.
X = lowcost.drop('PRICE', axis=1).values
y = lowcost.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
# Fit a default random forest on the sub-500000 split and report test-set metrics.
# random_state is pinned so the forest and the numbers printed below are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.9013599424219896
MSE: 7.767340e+08
51.81567211722234
73.45508600552134
90.18899978764068
0.9013599424219896


In [62]:
# Predictors/target for listings priced below 290000; default 75/25 split.
# Seeded so the same rows land in train/test on every run.
X = lowercost.drop('PRICE', axis=1).values
y = lowercost.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [63]:
# Fit a default random forest on the sub-290000 split and report test-set metrics.
# random_state is pinned so the forest and the numbers printed below are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.8279965794206802
MSE: 4.747814e+08
52.69720791542424
74.38330170777988
89.45513689346707
0.8279965794206802


In [64]:
# Grid-search a Huber-loss gradient boosting regressor on the current train split, scored by negative MSE.
kf = KFold()
# random_state is pinned so candidates are compared without run-to-run sampling noise.
gbr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('regressor', GradientBoostingRegressor(loss='huber', random_state=42))])
gbr_grid_params = {'regressor__alpha': np.arange(0.1, 0.9, 0.2),
                   'regressor__max_depth': np.arange(1, 9, 2),
                   'regressor__learning_rate': np.arange(0.01, 0.2, 0.03)}
gbr_grid = GridSearchCV(gbr_pipe, gbr_grid_params, cv=kf, scoring='neg_mean_squared_error')
gbr_grid.fit(X_train, y_train)
y_predict = gbr_grid.predict(X_test)
# With this scoring, score() is the negative MSE -- not an accuracy -- so it is labelled as such.
print("gbr negative MSE: {:.2e}".format(gbr_grid.score(X_test, y_test)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))

gbr Accuracy: -5.26e+08
51.91108701545134
74.27487123881811
88.58769314177283
0.8095340902641923


---

## Treating Zip, Month, Year as categorical variables

In [66]:
# Work on an independent copy so the categorical experiment leaves neighborhoods_cleaned untouched.
categories = neighborhoods_cleaned.copy(deep=True)

In [67]:
# Re-type zip code, month and year as pandas categoricals.
for column in ('ZIP OR POSTAL CODE', 'MONTH', 'YEAR'):
    categories[column] = categories[column].astype('category')

In [70]:
# Summarise dtypes to confirm the three columns are now categorical.
categories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19812 entries, 0 to 20011
Columns: 186 entries, ZIP OR POSTAL CODE to Woods Of Shavano
dtypes: category(3), float64(7), int64(1), uint8(175)
memory usage: 5.4 MB


In [72]:
# Price-capped subsets of the categorical frame (PRICE below 500000 and below 290000).
category_price = categories['PRICE']
c_lowcost = categories.loc[category_price < 500000]
c_lowercost = categories.loc[category_price < 290000]

In [73]:
# Replace each categorical column with its one-hot indicator columns (first level dropped),
# processing zip code, month and year in that order.
for column in ('ZIP OR POSTAL CODE', 'MONTH', 'YEAR'):
    dummies = pd.get_dummies(c_lowcost[column], drop_first=True)
    c_lowcost = c_lowcost.merge(dummies, how='outer', left_index=True, right_index=True)
    c_lowcost.drop(column, axis=1, inplace=True)

In [74]:
# Predictors/target for the one-hot encoded sub-500000 frame; default 75/25 split.
# Seeded so the same rows land in train/test on every run.
X = c_lowcost.drop('PRICE', axis=1).values
y = c_lowcost.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [75]:
# Fit a default random forest on the categorical sub-500000 split and report test-set metrics.
# random_state is pinned so the forest and the numbers printed below are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.9000324594477321
MSE: 7.874356e+08
48.970057337014225
73.37014228073902
89.38203440220853
0.9000324594477321


In [76]:
# Replace each categorical column with its one-hot indicator columns (first level dropped),
# processing zip code, month and year in that order.
for column in ('ZIP OR POSTAL CODE', 'MONTH', 'YEAR'):
    dummies = pd.get_dummies(c_lowercost[column], drop_first=True)
    c_lowercost = c_lowercost.merge(dummies, how='outer', left_index=True, right_index=True)
    c_lowercost.drop(column, axis=1, inplace=True)

In [77]:
# Predictors/target for the one-hot encoded sub-290000 frame; default 75/25 split.
# Seeded so the same rows land in train/test on every run.
X = c_lowercost.drop('PRICE', axis=1).values
y = c_lowercost.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [78]:
# Fit a default random forest on the categorical sub-290000 split and report test-set metrics.
# random_state is pinned so the forest and the numbers printed below are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.8343211139986686
MSE: 4.569436e+08
52.832746001626454
74.95256166982922
89.69910544863107
0.8343211139986686


---

In [80]:
# Price-capped subsets of the three-feature frame (PRICE below 500000 and below 290000).
three_price = threefeatures['PRICE']
three_lowcost = threefeatures.loc[three_price < 500000]
three_lowercost = threefeatures.loc[three_price < 290000]

In [81]:
# Predictors/target for the three-feature sub-290000 frame; default 75/25 split.
# Seeded so the same rows land in train/test on every run.
X = three_lowercost.drop('PRICE', axis=1).values
y = three_lowercost.PRICE.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [82]:
# Fit a default random forest on the three-feature sub-290000 split and report test-set metrics.
# random_state is pinned so the forest and the numbers printed below are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.7715892730131111
MSE: 6.115510e+08
46.110056925996204
69.07020872865274
88.0726484142044
0.7715892730131111


---

In [90]:
# Load the pickled model-data DataFrame from the project's data directory.
modeldata = pd.read_pickle(filepath_or_buffer='data/modeldata.pkl')

In [92]:
# Restrict the model data to listings with PRICE below 500000.
lowmodel = modeldata.loc[modeldata['PRICE'] < 500000]

In [98]:
# Predictor matrix: every column except the target (PRICE) and the features excluded from this model.
excluded_columns = [
    'ZIP OR POSTAL CODE', 'HOA/MONTH', 'LATITUDE', 'LONGITUDE',
    'ROOMS', 'MONTH', 'YEAR', 'PRICE',
]
predictors = lowmodel.drop(columns=excluded_columns).values

In [99]:
# Target vector: sale price of each remaining listing.
result = lowmodel['PRICE'].values

In [100]:
# Default 75/25 split, seeded so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(predictors, result, random_state=42)

In [101]:
# Fit a default random forest on the model-data split and report test-set metrics.
# random_state is pinned so the forest and the numbers printed below are reproducible on re-run.
rfr = RandomForestRegressor(oob_score=True, random_state=42)
rfr.fit(X_train, y_train)
y_predict = rfr.predict(X_test)
print("score:", rfr.score(X_test, y_test))  # R^2 on the test split
print('MSE: {:2e}'.format(mean_squared_error(y_test, y_predict)))
# The helpers presumably return the share of predictions within 5% / 10% / 20% of the true price -- TODO confirm.
print(calculate_five_percent(y_predict, y_test))
print(calculate_ten_percent(y_predict, y_test))
print(calculate_twenty_percent(y_predict, y_test))
print(r2_score(y_test, y_predict))  # same quantity as rfr.score above

score: 0.7123236420750624
MSE: 2.223442e+09
32.321087279677215
53.00488426417499
77.10766617116161
0.7123236420750624
