In [None]:
# Random Forest: Out-of-Sample

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
city = 'la'
nb = 'zipcode'
file = '../../Data/data_'+city+'.csv'
data = pd.read_csv(file)

In [3]:
print(len(data))
data = data[data['disadvantaged']==True]
print(len(data))
data = data[data['count_listings']>=5]
print(len(data))

130
68
58


In [4]:
data.head()

Unnamed: 0,zipcode,bedrooms,bedrooms_log,beds,beds_log,person_capacity,person_capacity_log,price,price_log,star_rating,...,t2_index,t1_index_perc,t2_index_perc,age_change,income_change,house_change,edu_change,index_change,disadvantaged,gentrifying
8,90021.0,1.019417,0.019231,1.623188,0.484392,4.719807,1.551768,189.835749,5.246159,4.612069,...,6.578947,3.508772,2.807018,-29.122807,0.701754,0.350877,-9.473684,-0.701754,True,False
10,90019.0,1.262758,0.233298,1.941368,0.663393,3.388949,1.22052,106.390033,4.667112,4.70155,...,49.824561,36.491228,48.421053,0.0,9.824561,6.666667,10.526316,11.929825,True,True
14,90026.0,1.239836,0.214979,1.60236,0.471478,3.049864,1.115097,119.93835,4.786978,4.78098,...,58.421053,31.22807,64.210526,9.122807,23.859649,17.894737,23.508772,32.982456,True,True
15,90028.0,1.039799,0.039028,1.833959,0.606477,3.533396,1.262259,128.209114,4.853663,4.620887,...,55.0,41.052632,60.701754,0.350877,7.719298,12.631579,15.087719,19.649123,True,True
16,90042.0,1.283582,0.249655,1.635514,0.491957,3.097015,1.130439,100.479478,4.609954,4.835385,...,48.070175,32.280702,44.736842,10.175439,5.964912,6.315789,7.368421,12.45614,True,True


In [5]:
sf = [
'bedrooms','price','star_rating','review_rating_location',
'count_listings',
'count_reviews',
]

In [6]:
uf_all = [
'review_len',
'location_words_perc','sent_comp','sent_comp_location',
'dtv_1', 'dtv_2', 'dtv_3', 'dtv_4', 'dtv_5', 'dtv_6', 'dtv_7', 'dtv_8', 'dtv_9', 'dtv_10', 'dtv_11', 'dtv_12', 'dtv_13', 'dtv_14', 'dtv_15', 'dtv_16', 'dtv_17', 'dtv_18', 'dtv_19', 'dtv_20', 'dtv_21', 'dtv_22', 'dtv_23', 'dtv_24', 'dtv_25',
'lda_1', 'lda_2', 'lda_3', 'lda_4', 'lda_5',
]

In [7]:
uf_base = [
'review_len',
'location_words_perc','sent_comp','sent_comp_location',
]

In [8]:
def adjusted_r2(r2, shape):
    n = shape[0]
    p = shape[1]
    
    ar2 = 1-((1-r2)*((n-1)/(n-p-1)))
    return ar2

In [9]:
def calculate_r2(pred, real):
    pred = np.array(pred)
    real = np.array(real)
    
    y_bar = np.mean(real)
    ss_total = np.sum(np.square(real-y_bar))
    ss_res = np.sum(np.square(real-pred))
    
    return 1 - (ss_res/ss_total)

# Model

In [10]:
def model(key, X, y, parameters,chosen):
    base = RandomForestRegressor(max_features='sqrt')
    grid = GridSearchCV(base, parameters, cv=3)
    grid.fit(X, y)
    
    best = grid.best_params_
    for k,v in best.items():
        chosen[key][k].append(v)

    reg = RandomForestRegressor(n_estimators=best['n_estimators'], max_depth=best['max_depth'], max_features='sqrt')
    reg.fit(X, y)
    return reg

In [11]:
def update(key, reg, X, y, mae, mse, r2, importance):
    test_r2[key].append(reg.score(X, y))
    test_mae[key].append(mean_absolute_error(reg.predict(X), y))
    test_mse[key].append(mean_squared_error(reg.predict(X), y))
    importance[key][:,i] = reg.feature_importances_

In [12]:
def top_dtv(X,y):
    scores = {}
    for i in range(1,26):
        f = 'dtv_'+str(i)
        scores[f] = abs(stats.pearsonr(X[f], y)[0])
    scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    return list(scores.keys())  

In [13]:
def top_lda(X,y):
    scores = {}
    for i in range(1,6):
        f = 'lda_'+str(i)
        scores[f] = abs(stats.pearsonr(X[f], y)[0])
    scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}
    return list(scores.keys())  

In [14]:
def VIF_scores(X):
    scores = {}
    for c in X.columns:
        X_temp = X.drop(columns=c)
        reg = LinearRegression().fit(X_temp, X[c])
        r2 = reg.score(X_temp, X[c])
        if r2 != 1:
            scores[c] = 1/(1-r2)
        else:
            scores[c] = np.inf
    return scores

In [15]:
def VIF_high(X):
    for c in X.columns:
        X_temp = X.drop(columns=c)
        reg = LinearRegression().fit(X_temp, X[c])
        r2 = reg.score(X_temp, X[c])
        if r2 != 1:
            score = 1/(1-r2)
            if score>=5:
                return True
        else:
            return True
    return False

In [17]:
pred = 'index_change'
trials = 100

dtv_keep = 5
lda_keep = 5

test_mae = {'base':[], 'sf':[], 'uf': [], 'all':[]}
test_mse = {'base':[], 'sf':[], 'uf': [], 'all':[]}
test_r2 = {'base':[], 'sf':[], 'uf': [], 'all':[]}


importance = {'sf':np.zeros((len(sf),trials)), 'uf':np.zeros((len(uf_base)+dtv_keep+lda_keep,trials)), 'all':np.zeros((len(sf)+len(uf_base)+dtv_keep+lda_keep,trials))}

parameters = {'n_estimators':[100],'max_depth': [None]}
chosen = {'sf':{},'uf':{},'all':{}}

for p in parameters.keys():
    for i in chosen.keys():
        chosen[i][p] = []

for i in tqdm(range(trials)):
    X_train, X_test, y_train, y_test = train_test_split(data[sf+uf_all], data[pred], test_size=1/2, shuffle=True)
    
    # Choose Top LDA and D2V
    dtv_ranked = top_dtv(X_train,y_train)
    dtv = []
    for f in dtv_ranked:
        dtv.append(f)
        if len(dtv)>=dtv_keep:
            break
    
    lda_ranked = top_lda(X_train,y_train)
    lda = []
    for f in lda_ranked:
        lda.append(f)
        if len(lda)>=lda_keep:
            break

    uf = uf_base + dtv + lda
    
    X_train_sf = X_train[sf]
    X_train_uf = X_train[uf]
    X_test_sf = X_test[sf]
    X_test_uf = X_test[uf]
    X_train = X_train[sf+uf]
    X_test = X_test[sf+uf]
    
    # Baseline
    base = [0]*len(y_test)
    test_r2['base'].append(calculate_r2(base, y_test))
    test_mae['base'].append(mean_absolute_error(base, y_test))
    test_mse['base'].append(mean_squared_error(base, y_test))
    
    # Structured Features
    reg = model('sf', X_train_sf, y_train, parameters, chosen)
    update('sf', reg, X_test_sf, y_test, test_mae, test_mse, test_r2, importance)
    
    # Unstructured Features
    reg = model('uf', X_train_uf, y_train, parameters, chosen)
    update('uf', reg, X_test_uf, y_test, test_mae, test_mse, test_r2, importance)

    # All Features
    reg = model('all', X_train, y_train, parameters, chosen)
    update('all', reg, X_test, y_test, test_mae, test_mse, test_r2, importance)

100%|██████████| 100/100 [02:08<00:00,  1.28s/it]


In [18]:
for k in ['base', 'sf', 'uf', 'all']:
    print(k)
    if k != 'base':
        print(np.mean(test_r2[k]), np.std(test_r2[k]), adjusted_r2(np.mean(test_r2[k]), (len(data)/2, importance[k].shape[0])))
    else:
        print(np.mean(test_r2[k]), np.std(test_r2[k]))       
    print(np.mean(test_mae[k]), np.std(test_mae[k]))
    print(np.mean(np.sqrt(test_mse[k])),np.std(np.sqrt(test_mse[k])))
    print()

base
-0.1882210400321788 0.07536644091541186
9.385783424077434 1.6602562382312522
14.98540776830452 4.1624636069856935

sf
-0.04947393208739757 0.3303327877750238 -0.33569409538396044
8.26008166969147 1.2919231472286308
13.848551434338829 3.871533941087701

uf
0.004307130857802456 0.30841183208290834 -0.9913857382843951
9.096280096793711 1.320525555053482
13.244032135769764 2.9055311907395556

all
0.05313570104966775 0.23357615088210862 -2.3140250463261625
8.45865638233515 1.2798567536282877
13.174715883380367 3.475637980642623



In [18]:
for k in ['sf', 'uf', 'all']:
    if k == 'sf':
        features = sf
    elif k == 'uf':
        features = uf_base.copy()
        for i in range(dtv_keep):
            features.append('dtv_'+str(i))
        for i in range(lda_keep):
            features.append('lda_'+str(i))
    elif k == 'all':
        features = sf + uf_base.copy()
        for i in range(dtv_keep):
            features.append('dtv_'+str(i))
        for i in range(lda_keep):
            features.append('lda_'+str(i))
        
    df = {'feature':[], 'importance':[], 'std':[]}
    for i,f in enumerate(features):
        df['feature'].append(f)
        df['importance'].append(np.mean(importance[k][i, :]))
        df['std'].append(np.std(importance[k][i, :]))
    df = pd.DataFrame.from_dict(df)
    df = df.sort_values(by=['importance'], ascending=False)
    print(k)
    print(df.head(10))

sf
                  feature  importance       std
4          count_listings    0.250006  0.036324
5           count_reviews    0.217429  0.035269
3  review_rating_location    0.165038  0.051945
0                bedrooms    0.157711  0.062293
1                   price    0.111544  0.022603
2             star_rating    0.098271  0.022127
uf
                feature  importance       std
4                 dtv_0    0.098327  0.023616
9                 lda_0    0.092296  0.029898
5                 dtv_1    0.090497  0.024179
6                 dtv_2    0.085103  0.021580
8                 dtv_4    0.084231  0.026422
1   location_words_perc    0.081644  0.025609
3    sent_comp_location    0.080965  0.025843
7                 dtv_3    0.079869  0.025354
10                lda_1    0.079156  0.024612
11                lda_2    0.052389  0.015801
all
                   feature  importance       std
4           count_listings    0.114896  0.028238
5            count_reviews    0.100335  0.027929
3