In [1]:
import psycopg2
import sys, os
import numpy as np
import pandas as pd
import pandas.io.sql as psql
from sqlalchemy import create_engine
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from IPython.display import display
from sklearn.metrics import mean_squared_error
from math import sqrt


In [2]:
url = 'postgresql://{user}:{passwd}@{host}:{port}/{db}'.format(
         user="cse6242", passwd="cse6242", host="localhost", port=5432, db="cse6242")

engine = create_engine(url, pool_size=50)

engine.connect()
connection = engine.connect() 
query = "SELECT county_id, home_type_id, year_month, index_value from county_timeseries"

df = pd.read_sql(query, con=connection)

df.rename(columns = {'year_month':'year'}, inplace = True)
df[['year','month']] = df['year'].str.split('-',expand=True)

cols = df.columns.tolist() 
new_cols = [x for x in cols if x != cols[-2]] + [cols[-2]]
df = df[new_cols]
df.fillna(method="ffill", inplace=True)
df.fillna(method="bfill", inplace=True)
random_state = 100
x_data = df.loc[:, df.columns != "index_value"]
y_data = df.loc[:, "index_value"]


In [3]:
display(df)

Unnamed: 0,county_id,home_type_id,year,month,index_value
0,10369,3,2004,11,45700.0
1,10369,3,2004,12,45700.0
2,10369,3,2005,01,45700.0
3,10369,3,2005,02,45800.0
4,10369,3,2005,03,45900.0
...,...,...,...,...,...
2327623,10369,3,2004,06,46100.0
2327624,10369,3,2004,07,45900.0
2327625,10369,3,2004,08,45800.0
2327626,10369,3,2004,09,45900.0


In [4]:

# df, x_vals, y_vals

random_state = 100

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, shuffle=True, random_state=random_state)

In [5]:
# crime_data_query = """
# SELECT zipcode_id, violent_crime, property_crime, county_id
# from crime_data as c, zipcode as z
# WHERE c.zipcode_id::text = z.zip_code

# """

crime_data_query = """
select zipcode_id, violent_crime, property_crime, county_id from crime_data
inner join zipcode on crime_data.zipcode_id = zipcode.id;
"""

crime_data_df = pd.read_sql(crime_data_query, con=connection)

display(crime_data_df)

Unnamed: 0,zipcode_id,violent_crime,property_crime,county_id
0,1,39.6,47.8,19
1,2,49.7,48.7,2
2,3,26.9,33.3,19
3,4,49.2,47.8,2
4,5,65.6,68.4,19
...,...,...,...,...
4765,4766,25.0,33.4,518
4766,4767,17.7,37.6,442
4767,4768,23.5,30.2,553
4768,4769,27.2,36.3,442


In [6]:
# school_data_query = """
# SELECT zipcode_id, schooldigger_rating, average_standard_score, county_id
# from school_data as s, zipcode as z
# WHERE s.zipcode_id::text = z.zip_code

# """

school_data_query = """
select zipcode_id, schooldigger_rating, average_standard_score, county_id
from school_data
inner join zipcode on school_data.zipcode_id = zipcode.id;
"""

school_data_df = pd.read_sql(school_data_query, con=connection)

display(school_data_df)

Unnamed: 0,zipcode_id,schooldigger_rating,average_standard_score,county_id
0,1,1.0,10.4,19
1,1,5.0,97.9,19
2,1,3.0,58.1,19
3,1,4.0,78.2,19
4,1,0.0,9.0,19
...,...,...,...,...
32600,4760,1.0,18.5,156
32601,4762,,,553
32602,4763,,,12
32603,4767,,,442


In [7]:
school_ratings_by_county = school_data_df.groupby(['county_id']).mean()
school_ratings_by_county["zip_code"] = school_ratings_by_county.index

In [8]:
crime_rate_by_county = crime_data_df.groupby(['county_id']).mean()
crime_rate_by_county["zip_code"] = crime_rate_by_county.index

In [9]:
# school_ratings_by_county.drop(columns=["zipcode_id", "zip_code"])
del school_ratings_by_county["zipcode_id"]
del school_ratings_by_county["zip_code"]
school_ratings_by_county["county_id"] = school_ratings_by_county.index
display(school_ratings_by_county)

Unnamed: 0_level_0,schooldigger_rating,average_standard_score,county_id
county_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.680952,52.278333,1
2,2.882771,55.630810,2
3,2.803797,55.328070,3
4,2.691810,51.154310,4
5,2.985600,58.282080,5
...,...,...,...
8293,4.666667,88.666667,8293
8306,4.000000,72.666667,8306
8372,,,8372
8416,,,8416


In [10]:
del crime_rate_by_county["zipcode_id"]
del crime_rate_by_county["zip_code"]
crime_rate_by_county["county_id"] = crime_rate_by_county.index

display(crime_rate_by_county)

Unnamed: 0_level_0,violent_crime,property_crime,county_id
county_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,36.543777,43.851073,1
2,34.005319,37.355319,2
3,53.621429,63.846429,3
4,32.101408,44.761972,4
5,29.950000,36.481579,5
...,...,...,...
8306,15.400000,21.900000,8306
8372,20.700000,27.100000,8372
8416,27.950000,34.200000,8416
8417,51.000000,57.600000,8417


In [11]:
# school_ratings_by_zipcode[school_ratings_by_zipcode["zip_code"] == "4762"]

# zipcode_data_query = "SELECT zip_code, county_id from zipcode"

# zipcode_data_df = pd.read_sql(zipcode_data_query, con=connection)

# display(zipcode_data_df)

# # school_data_df.join(zipcode_data_df)
# display(school_ratings_by_zipcode)
# school_ratings_by_zipcode["zip_code"] = school_ratings_by_zipcode["zip_code"].astype(str)

# zipcode_data_df["zip_code"] = zipcode_data_df["zip_code"].astype(str)

# county_school_rating = pd.merge(school_ratings_by_zipcode, zipcode_data_df, how="outer")

# display(county_school_rating)

# county_school_rating[county_school_rating["zip_code"] == "4762"]

# school_data_df

# zipcode_data_df

# pd.merge(school_data_df, zipcode_data_df, how="inner", left_on="zipcode_id", right_on="zip_code")

In [12]:
# np.corrcoef(df["index_value"], school_ratings_by_county["schooldigger_rating"])
# dl school_ratings_by_county['index']
school_ratings_by_county = school_ratings_by_county.rename_axis(None)
final_school_rating_and_home_prices_df = pd.merge(school_ratings_by_county, df, how="inner")

In [13]:
crime_rate_by_county = crime_rate_by_county.rename_axis(None)
final_crime_rate_by_county_and_home_prices_df = pd.merge(crime_rate_by_county, df, how="inner")

In [14]:
display(final_crime_rate_by_county_and_home_prices_df)

Unnamed: 0,violent_crime,property_crime,county_id,home_type_id,year,month,index_value
0,36.543777,43.851073,1,4,1996,04,158900.0
1,36.543777,43.851073,1,4,1996,05,158900.0
2,36.543777,43.851073,1,4,1996,06,158900.0
3,36.543777,43.851073,1,4,1996,07,158900.0
4,36.543777,43.851073,1,4,1996,08,158800.0
...,...,...,...,...,...,...,...
1054957,5.000000,9.700000,8464,3,2019,05,73300.0
1054958,5.000000,9.700000,8464,3,2019,06,74900.0
1054959,5.000000,9.700000,8464,3,2019,07,76400.0
1054960,5.000000,9.700000,8464,3,2019,08,76800.0


In [15]:
display(final_school_rating_and_home_prices_df)

Unnamed: 0,schooldigger_rating,average_standard_score,county_id,home_type_id,year,month,index_value
0,2.680952,52.278333,1,4,1996,04,158900.0
1,2.680952,52.278333,1,4,1996,05,158900.0
2,2.680952,52.278333,1,4,1996,06,158900.0
3,2.680952,52.278333,1,4,1996,07,158900.0
4,2.680952,52.278333,1,4,1996,08,158800.0
...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417,2,2019,05,77700.0
1041422,2.333333,41.600000,8417,2,2019,06,75700.0
1041423,2.333333,41.600000,8417,2,2019,07,75800.0
1041424,2.333333,41.600000,8417,2,2019,08,76100.0


In [16]:
# final_school_rating_and_home_prices_df[final_school_rating_and_home_prices_df]
final_school_rating_and_home_prices_df[final_school_rating_and_home_prices_df.isnull().any(axis=1)]

Unnamed: 0,schooldigger_rating,average_standard_score,county_id,home_type_id,year,month,index_value
554694,,,342,4,1996,04,93400.0
554695,,,342,4,1996,05,93400.0
554696,,,342,4,1996,06,93400.0
554697,,,342,4,1996,07,93300.0
554698,,,342,4,1996,08,93100.0
...,...,...,...,...,...,...,...
1040011,,,8372,7,2019,05,98000.0
1040012,,,8372,7,2019,06,100200.0
1040013,,,8372,7,2019,07,102500.0
1040014,,,8372,7,2019,08,104100.0


In [17]:
final_school_rating_and_home_prices_df = final_school_rating_and_home_prices_df.dropna()

In [18]:
final_crime_rate_by_county_and_home_prices_df = final_crime_rate_by_county_and_home_prices_df.dropna()

In [19]:
# final_school_rating_and_home_prices_df[final_school_rating_and_home_prices_df]
final_school_rating_and_home_prices_df[final_school_rating_and_home_prices_df.isnull().any(axis=1)]

Unnamed: 0,schooldigger_rating,average_standard_score,county_id,home_type_id,year,month,index_value


In [20]:
np.corrcoef(final_school_rating_and_home_prices_df["schooldigger_rating"], final_school_rating_and_home_prices_df["index_value"])

array([[1.        , 0.09977237],
       [0.09977237, 1.        ]])

In [21]:
np.corrcoef(final_school_rating_and_home_prices_df["average_standard_score"], final_school_rating_and_home_prices_df["average_standard_score"])

array([[1., 1.],
       [1., 1.]])

In [22]:
display(final_crime_rate_by_county_and_home_prices_df)

Unnamed: 0,violent_crime,property_crime,county_id,home_type_id,year,month,index_value
0,36.543777,43.851073,1,4,1996,04,158900.0
1,36.543777,43.851073,1,4,1996,05,158900.0
2,36.543777,43.851073,1,4,1996,06,158900.0
3,36.543777,43.851073,1,4,1996,07,158900.0
4,36.543777,43.851073,1,4,1996,08,158800.0
...,...,...,...,...,...,...,...
1054957,5.000000,9.700000,8464,3,2019,05,73300.0
1054958,5.000000,9.700000,8464,3,2019,06,74900.0
1054959,5.000000,9.700000,8464,3,2019,07,76400.0
1054960,5.000000,9.700000,8464,3,2019,08,76800.0


In [23]:
np.corrcoef(final_crime_rate_by_county_and_home_prices_df["violent_crime"], final_crime_rate_by_county_and_home_prices_df["index_value"])


array([[ 1.        , -0.04955657],
       [-0.04955657,  1.        ]])

In [24]:
np.corrcoef(final_crime_rate_by_county_and_home_prices_df["property_crime"], final_crime_rate_by_county_and_home_prices_df["index_value"])

array([[ 1.        , -0.04632985],
       [-0.04632985,  1.        ]])

In [25]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from math import sqrt

regr = RandomForestRegressor(max_depth=None, random_state=0,
                             n_estimators=100)


x_data = final_school_rating_and_home_prices_df.loc[:, final_school_rating_and_home_prices_df.columns != "index_value"]
y_data = final_school_rating_and_home_prices_df.loc[:, "index_value"]

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, shuffle=True, random_state=random_state)

In [27]:
x_data.size, x_train.size, x_test.size

(6197796, 4648344, 1549452)

In [28]:
regr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [29]:
y_pred = regr.predict(x_train)
rmse_random_forest_train = sqrt(mean_squared_error(y_train, y_pred))
print(rmse_random_forest_train)
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_train.index.copy())
df_out_random_forest_train = pd.merge(final_school_rating_and_home_prices_df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_random_forest_train)

2850.159085667408


Unnamed: 0,schooldigger_rating,average_standard_score,county_id,home_type_id,year,month,index_value,y_pred
0,2.680952,52.278333,1,4,1996,04,158900.0,158923.0
1,2.680952,52.278333,1,4,1996,05,158900.0,158923.0
2,2.680952,52.278333,1,4,1996,06,158900.0,158923.0
3,2.680952,52.278333,1,4,1996,07,158900.0,158903.0
4,2.680952,52.278333,1,4,1996,08,158800.0,158850.0
...,...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417,2,2019,05,77700.0,78169.0
1041422,2.333333,41.600000,8417,2,2019,06,75700.0,76564.0
1041423,2.333333,41.600000,8417,2,2019,07,75800.0,76420.0
1041424,2.333333,41.600000,8417,2,2019,08,76100.0,


In [30]:
y_pred = regr.predict(x_test)
rmse_random_forest_test = sqrt(mean_squared_error(y_test, y_pred))
print(rmse_random_forest_test)
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_test.index.copy())
df_out_random_forest_test = pd.merge(final_school_rating_and_home_prices_df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_random_forest_test)

5817.092150208564


Unnamed: 0,schooldigger_rating,average_standard_score,county_id,home_type_id,year,month,index_value,y_pred
0,2.680952,52.278333,1,4,1996,04,158900.0,
1,2.680952,52.278333,1,4,1996,05,158900.0,
2,2.680952,52.278333,1,4,1996,06,158900.0,
3,2.680952,52.278333,1,4,1996,07,158900.0,
4,2.680952,52.278333,1,4,1996,08,158800.0,
...,...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417,2,2019,05,77700.0,
1041422,2.333333,41.600000,8417,2,2019,06,75700.0,
1041423,2.333333,41.600000,8417,2,2019,07,75800.0,
1041424,2.333333,41.600000,8417,2,2019,08,76100.0,78000.0


In [31]:
final_df_out = df_out_random_forest_train['y_pred'].combine_first(df_out_random_forest_test['y_pred'])

display(final_df_out)

0          158923.0
1          158923.0
2          158923.0
3          158903.0
4          158850.0
             ...   
1041421     78169.0
1041422     76564.0
1041423     76420.0
1041424     78000.0
1041425     79289.0
Name: y_pred, Length: 1032966, dtype: float64

In [32]:
y_hats_df = pd.DataFrame(data = final_df_out, columns = ['y_pred'], index = df.index.copy())
display(y_hats_df)
final_df_out = pd.merge(final_school_rating_and_home_prices_df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(final_df_out)

Unnamed: 0,y_pred
0,158923.0
1,158923.0
2,158923.0
3,158903.0
4,158850.0
...,...
2327623,
2327624,
2327625,
2327626,


Unnamed: 0,schooldigger_rating,average_standard_score,county_id,home_type_id,year,month,index_value,y_pred
0,2.680952,52.278333,1,4,1996,04,158900.0,158923.0
1,2.680952,52.278333,1,4,1996,05,158900.0,158923.0
2,2.680952,52.278333,1,4,1996,06,158900.0,158923.0
3,2.680952,52.278333,1,4,1996,07,158900.0,158903.0
4,2.680952,52.278333,1,4,1996,08,158800.0,158850.0
...,...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417,2,2019,05,77700.0,78169.0
1041422,2.333333,41.600000,8417,2,2019,06,75700.0,76564.0
1041423,2.333333,41.600000,8417,2,2019,07,75800.0,76420.0
1041424,2.333333,41.600000,8417,2,2019,08,76100.0,78000.0


In [37]:
regr = RandomForestRegressor(max_depth=None, random_state=0,
                             n_estimators=100)


x_data = final_crime_rate_by_county_and_home_prices_df.loc[:, final_crime_rate_by_county_and_home_prices_df.columns != "index_value"]
y_data = final_crime_rate_by_county_and_home_prices_df.loc[:, "index_value"]

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, shuffle=True, random_state=random_state)

In [39]:
x_data.size, x_train.size, x_test.size

(6329772, 4747326, 1582446)

In [40]:
regr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [41]:
y_pred = regr.predict(x_train)
rmse_random_forest_train = sqrt(mean_squared_error(y_train, y_pred))
print(rmse_random_forest_train)
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_train.index.copy())
df_out_random_forest_train = pd.merge(final_crime_rate_by_county_and_home_prices_df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_random_forest_train)


y_pred = regr.predict(x_test)
rmse_random_forest_test = sqrt(mean_squared_error(y_test, y_pred))
print(rmse_random_forest_test)
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_test.index.copy())
df_out_random_forest_test = pd.merge(final_crime_rate_by_county_and_home_prices_df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_random_forest_test)


final_df_out_2 = df_out_random_forest_train['y_pred'].combine_first(df_out_random_forest_test['y_pred'])

display(final_df_out_2)

y_hats_df = pd.DataFrame(data = final_df_out_2, columns = ['y_pred'], index = df.index.copy())
display(y_hats_df)
final_df_out_2 = pd.merge(final_crime_rate_by_county_and_home_prices_df, y_hats_df, how = 'left', left_index = True, right_index = True)

display(final_df_out_2)

3059.0104280803243


Unnamed: 0,violent_crime,property_crime,county_id,home_type_id,year,month,index_value,y_pred
0,36.543777,43.851073,1,4,1996,04,158900.0,158900.0
1,36.543777,43.851073,1,4,1996,05,158900.0,158900.0
2,36.543777,43.851073,1,4,1996,06,158900.0,158900.0
3,36.543777,43.851073,1,4,1996,07,158900.0,158890.0
4,36.543777,43.851073,1,4,1996,08,158800.0,158833.0
...,...,...,...,...,...,...,...,...
1054957,5.000000,9.700000,8464,3,2019,05,73300.0,73104.0
1054958,5.000000,9.700000,8464,3,2019,06,74900.0,74416.0
1054959,5.000000,9.700000,8464,3,2019,07,76400.0,75891.0
1054960,5.000000,9.700000,8464,3,2019,08,76800.0,76520.0


8034.211022508337


Unnamed: 0,violent_crime,property_crime,county_id,home_type_id,year,month,index_value,y_pred
0,36.543777,43.851073,1,4,1996,04,158900.0,
1,36.543777,43.851073,1,4,1996,05,158900.0,
2,36.543777,43.851073,1,4,1996,06,158900.0,
3,36.543777,43.851073,1,4,1996,07,158900.0,
4,36.543777,43.851073,1,4,1996,08,158800.0,
...,...,...,...,...,...,...,...,...
1054957,5.000000,9.700000,8464,3,2019,05,73300.0,
1054958,5.000000,9.700000,8464,3,2019,06,74900.0,
1054959,5.000000,9.700000,8464,3,2019,07,76400.0,
1054960,5.000000,9.700000,8464,3,2019,08,76800.0,


0          158900.0
1          158900.0
2          158900.0
3          158890.0
4          158833.0
             ...   
1054957     73104.0
1054958     74416.0
1054959     75891.0
1054960     76520.0
1054961     76668.0
Name: y_pred, Length: 1054962, dtype: float64

Unnamed: 0,y_pred
0,158900.0
1,158900.0
2,158900.0
3,158890.0
4,158833.0
...,...
2327623,
2327624,
2327625,
2327626,


Unnamed: 0,violent_crime,property_crime,county_id,home_type_id,year,month,index_value,y_pred
0,36.543777,43.851073,1,4,1996,04,158900.0,158900.0
1,36.543777,43.851073,1,4,1996,05,158900.0,158900.0
2,36.543777,43.851073,1,4,1996,06,158900.0,158900.0
3,36.543777,43.851073,1,4,1996,07,158900.0,158890.0
4,36.543777,43.851073,1,4,1996,08,158800.0,158833.0
...,...,...,...,...,...,...,...,...
1054957,5.000000,9.700000,8464,3,2019,05,73300.0,73104.0
1054958,5.000000,9.700000,8464,3,2019,06,74900.0,74416.0
1054959,5.000000,9.700000,8464,3,2019,07,76400.0,75891.0
1054960,5.000000,9.700000,8464,3,2019,08,76800.0,76520.0


In [59]:
school_and_crime_home_price_prediction = pd.merge(school_ratings_by_county, crime_rate_by_county, how="inner")

In [61]:
school_and_crime_home_price_prediction = pd.merge(school_and_crime_home_price_prediction, df, how="inner")

In [62]:
school_and_crime_home_price_prediction

Unnamed: 0,schooldigger_rating,average_standard_score,county_id,violent_crime,property_crime,home_type_id,year,month,index_value
0,2.680952,52.278333,1,36.543777,43.851073,4,1996,04,158900.0
1,2.680952,52.278333,1,36.543777,43.851073,4,1996,05,158900.0
2,2.680952,52.278333,1,36.543777,43.851073,4,1996,06,158900.0
3,2.680952,52.278333,1,36.543777,43.851073,4,1996,07,158900.0
4,2.680952,52.278333,1,36.543777,43.851073,4,1996,08,158800.0
...,...,...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417,51.000000,57.600000,2,2019,05,77700.0
1041422,2.333333,41.600000,8417,51.000000,57.600000,2,2019,06,75700.0
1041423,2.333333,41.600000,8417,51.000000,57.600000,2,2019,07,75800.0
1041424,2.333333,41.600000,8417,51.000000,57.600000,2,2019,08,76100.0


In [63]:
print(school_and_crime_home_price_prediction.shape)
school_and_crime_home_price_prediction.dropna()
print(school_and_crime_home_price_prediction.shape)

(1041426, 9)
(1041426, 9)


In [72]:
regr = RandomForestRegressor(max_depth=None, random_state=0,
                             n_estimators=100)
def clean_dataset(df):
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)  # convert to float64 to avoid error

school_and_crime_home_price_prediction = clean_dataset(school_and_crime_home_price_prediction)

x_data = school_and_crime_home_price_prediction.loc[:, school_and_crime_home_price_prediction.columns != "index_value"]
y_data = school_and_crime_home_price_prediction.loc[:, "index_value"]

In [73]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.25, shuffle=True, random_state=random_state)
print(x_data.size, x_train.size, x_test.size)

8263728 6197792 2065936


In [74]:
regr.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [76]:
y_pred = regr.predict(x_train)
rmse_random_forest_train = sqrt(mean_squared_error(y_train, y_pred))
print(rmse_random_forest_train)
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_train.index.copy())
df_out_random_forest_train = pd.merge(school_and_crime_home_price_prediction, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_random_forest_train)


y_pred = regr.predict(x_test)
rmse_random_forest_test = sqrt(mean_squared_error(y_test, y_pred))
print(rmse_random_forest_test)
y_hats_df = pd.DataFrame(data = y_pred, columns = ['y_pred'], index = x_test.index.copy())
df_out_random_forest_test = pd.merge(school_and_crime_home_price_prediction, y_hats_df, how = 'left', left_index = True, right_index = True)

display(df_out_random_forest_test)


final_df_out_3 = df_out_random_forest_train['y_pred'].combine_first(df_out_random_forest_test['y_pred'])

display(final_df_out_3)

y_hats_df = pd.DataFrame(data = final_df_out_3, columns = ['y_pred'], index = df.index.copy())
display(y_hats_df)
final_df_out_3 = pd.merge(school_and_crime_home_price_prediction, y_hats_df, how = 'left', left_index = True, right_index = True)

display(final_df_out_3)

2815.508733050922


Unnamed: 0,schooldigger_rating,average_standard_score,county_id,violent_crime,property_crime,home_type_id,year,month,index_value,y_pred
0,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,4.0,158900.0,158898.0
1,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,5.0,158900.0,158898.0
2,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,6.0,158900.0,158898.0
3,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,7.0,158900.0,158889.0
4,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,8.0,158800.0,158836.0
...,...,...,...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,5.0,77700.0,78063.0
1041422,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,6.0,75700.0,76584.0
1041423,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,7.0,75800.0,76327.0
1041424,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,8.0,76100.0,


5745.444611772696


Unnamed: 0,schooldigger_rating,average_standard_score,county_id,violent_crime,property_crime,home_type_id,year,month,index_value,y_pred
0,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,4.0,158900.0,
1,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,5.0,158900.0,
2,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,6.0,158900.0,
3,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,7.0,158900.0,
4,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,8.0,158800.0,
...,...,...,...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,5.0,77700.0,
1041422,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,6.0,75700.0,
1041423,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,7.0,75800.0,
1041424,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,8.0,76100.0,77647.0


0          158898.0
1          158898.0
2          158898.0
3          158889.0
4          158836.0
             ...   
1041421     78063.0
1041422     76584.0
1041423     76327.0
1041424     77647.0
1041425     79303.0
Name: y_pred, Length: 1032966, dtype: float64

Unnamed: 0,y_pred
0,158898.0
1,158898.0
2,158898.0
3,158889.0
4,158836.0
...,...
2327623,
2327624,
2327625,
2327626,


Unnamed: 0,schooldigger_rating,average_standard_score,county_id,violent_crime,property_crime,home_type_id,year,month,index_value,y_pred
0,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,4.0,158900.0,158898.0
1,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,5.0,158900.0,158898.0
2,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,6.0,158900.0,158898.0
3,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,7.0,158900.0,158889.0
4,2.680952,52.278333,1.0,36.543777,43.851073,4.0,1996.0,8.0,158800.0,158836.0
...,...,...,...,...,...,...,...,...,...,...
1041421,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,5.0,77700.0,78063.0
1041422,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,6.0,75700.0,76584.0
1041423,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,7.0,75800.0,76327.0
1041424,2.333333,41.600000,8417.0,51.000000,57.600000,2.0,2019.0,8.0,76100.0,77647.0
