In [46]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [56]:
# Columns excluded from every year's frame.
id_columns = ['census_tract', 'full_name']
# The 2014 and 2016 frames additionally lose their own 'score' column;
# only the 2018 frame keeps it (it is read later as the target `y`).
feature_exclusions = id_columns + ['score']

# Per-year score tables, read from paths relative to the working directory.
data2014 = pd.read_csv("finalized/2014-scores.csv")
data2016 = pd.read_csv("finalized/2016-scores.csv")
data2018 = pd.read_csv("finalized/2018-scores.csv")

# Keep every column whose label is not in the exclusion list.
df2018 = data2018.loc[:, data2018.columns.difference(id_columns)]
df2016 = data2016.loc[:, data2016.columns.difference(feature_exclusions)]
df2014 = data2014.loc[:, data2014.columns.difference(feature_exclusions)]

In [57]:
# Append "2018" to each feature column of the 2018 frame.
# 'score' is not listed, so it keeps its original name.
features_2018 = [
    "associates", "bachelors", "car",
    "diff_countyHouse", "diff_countyRent", "diff_stateHouse", "diff_stateRent",
    "health_ins", "house_val", "museums", "pharmacies",
    "population", "poverty", "self_income",
]
df2018 = df2018.rename(columns={name: name + "2018" for name in features_2018})

In [58]:
def rename(year, dataframe):
    """Build a column-rename mapping that appends ``year`` to every column label.

    Parameters
    ----------
    year : int or str
        Suffix to append; converted with ``str``.
    dataframe : pandas.DataFrame
        Frame (or any object with an iterable ``columns`` attribute) whose
        column labels should be suffixed.

    Returns
    -------
    dict
        Maps each original column label to ``str(label) + str(year)``,
        ready to pass as ``DataFrame.rename(columns=...)``.
    """
    suffix = str(year)
    # str(label) lets non-string labels (e.g. integer column names) be suffixed
    # instead of raising TypeError on `label + suffix`.
    return {label: str(label) + suffix for label in dataframe.columns}

In [59]:
# Suffix every column of the 2016 and 2014 frames with its year so the two
# feature sets carry distinct labels.
df2016, df2014 = (
    frame.rename(columns=rename(year, frame))
    for year, frame in ((2016, df2016), (2014, df2014))
)

In [60]:
# Preview the first rows of the 2016 frame after its columns were year-suffixed.
df2016.head()

Unnamed: 0,associates2016,bachelors2016,car2016,diff_countyHouse2016,diff_countyRent2016,diff_stateHouse2016,diff_stateRent2016,health_ins2016,house_val2016,museums2016,pharmacies2016,population2016,poverty2016,self_income2016
0,-1.124448,-1.18447,-1.138599,-0.61978,-0.56473,-0.55835,0.067856,-1.112696,-0.763971,0.312683,-0.308789,-1.138599,0.84499,-0.941716
1,0.070396,-0.31831,1.072695,1.396378,-0.718012,-0.691798,-0.408449,1.101662,-0.52914,-0.125641,1.764868,1.072695,1.804083,-0.486864
2,1.393259,0.447909,1.146762,0.01387,0.753498,-0.398213,1.169311,1.175742,-0.182118,-0.710074,0.579921,1.146762,0.481123,0.278114
3,-1.31221,-0.942945,-0.654476,-0.696586,-0.334806,-0.691798,-0.155412,-0.625806,-1.297428,1.33544,-0.308789,-0.654476,0.680934,-1.262179
4,2.016284,1.118351,1.993709,-0.696586,5.275326,-0.344834,0.425084,2.022835,-0.024831,-0.417858,0.283684,1.993709,0.622042,0.495202


In [61]:
# Remove the rows labelled 2006 and 2007 from the 2014 frame.
# NOTE(review): presumably these are surplus rows dropped so the 2014 and 2016
# frames have the same number of rows before being joined side by side --
# confirm the remaining rows refer to the same census tracts in the same order.
# errors="ignore" keeps the cell safe to re-run: without it a second execution
# raises KeyError because the labels are already gone.
df2014 = df2014.drop(index=[2006, 2007], errors="ignore")

In [62]:
# Sanity check: the 2016 column labels, then the 2014 frame's (rows, columns) shape.
for summary in (df2016.columns, df2014.shape):
    print(summary)

Index(['associates2016', 'bachelors2016', 'car2016', 'diff_countyHouse2016',
       'diff_countyRent2016', 'diff_stateHouse2016', 'diff_stateRent2016',
       'health_ins2016', 'house_val2016', 'museums2016', 'pharmacies2016',
       'population2016', 'poverty2016', 'self_income2016'],
      dtype='object')
(2006, 14)


In [63]:
# Join the 2016 and 2014 feature arrays column-wise: same rows, 2016 columns
# first, 2014 columns after.
features_by_year = [frame.to_numpy() for frame in (df2016, df2014)]
final = np.hstack(features_by_year)
final.shape

(2006, 28)

In [64]:
# Regression target: the 2018 'score' column.
y = df2018.loc[:, 'score']

In [65]:
# Bundle the feature matrix and the target into an XGBoost DMatrix
# (this is the `dtrain` input of the xgb.cv call later in the notebook).
data_dmatrix = xgb.DMatrix(
    label=y,
    data=final,
)

In [66]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(final, y, random_state=170, test_size=0.1)
X_train, X_test, y_train, y_test = split_parts

# TODO: replace the random split with an explicit assignment by year:
#   X_train = 2012 and 2014 data
#   X_test  = 2014 and 2016 data
#   y_train = 2016 scores
#   y_test  = 2018 scores

In [67]:
# Gradient-boosted tree regressor trained with squared-error loss.
# 'reg:squarederror' is the current name of the objective formerly spelled
# 'reg:linear' (deprecated); reg_alpha is the scikit-learn wrapper's name for
# XGBoost's L1 regularisation term `alpha`.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,
    n_estimators=10,
)

In [76]:
# Fit on the training split, then predict for every row of `final`
# (training rows and held-out rows alike).
# Author's note: these predictions are for 2018; to predict 2020, feed the
# 2016 and 2018 data instead.
preds = xg_reg.fit(X_train, y_train).predict(final)



In [78]:
# Show the predicted scores (NumPy abbreviates long arrays with "...").
print(preds)

[10.345287  25.277203  26.673428  ...  9.46321    0.5408283  0.5408283]


In [69]:
# Score the model on the held-out rows only. `preds` (computed earlier in the
# notebook) covers every row of `final`, so its length differs from `y_test`
# and mean_squared_error would reject the pair on a fresh top-to-bottom run.
# Predict on X_test here so predictions and targets line up one-to-one.
test_preds = xg_reg.predict(X_test)
# y_test holds 2018 scores, so this is the RMSE against the 2018 ground truth.
rmse = np.sqrt(mean_squared_error(y_test, test_preds))
print("RMSE: %f" % (rmse))

RMSE: 10.642585


In [72]:
# Booster parameters for cross-validation. 'reg:squarederror' replaces the
# deprecated 'reg:linear' name for the same squared-error objective.
params = {
    "objective": "reg:squarederror",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}

# 3-fold cross-validation over the full DMatrix: up to 100 boosting rounds,
# stopping early once the test RMSE has not improved for 10 rounds.
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=100, early_stopping_rounds=10, metrics="rmse", seed=170)



In [73]:
# Print the per-round cross-validation table (mean and std of train/test RMSE).
print(cv_results)
# TODO: show RMSE both within the training years and for future years.

    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0         27.125599        0.152796       27.141649       0.345074
1         24.483765        0.139472       24.509596       0.312929
2         22.102930        0.128729       22.133384       0.286921
3         19.967129        0.119751       20.004251       0.261701
4         18.042458        0.108152       18.091891       0.244328
..              ...             ...             ...            ...
95         1.204510        0.007932        2.162026       0.259684
96         1.200571        0.007273        2.160695       0.259135
97         1.196606        0.006896        2.160219       0.259538
98         1.191295        0.007231        2.159124       0.258978
99         1.186768        0.005943        2.157475       0.258611

[100 rows x 4 columns]
