In [1]:
# File: correlation.ipynb -- Correlation: Features <=> Prediction Data
# Author: Shomik Jain
# Date: 2/02/2020

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import math

In [3]:
file = '../data/regression_data.csv'
data = pd.read_csv(file)

In [4]:
# Between All Features

features = [
'listings_count_log',
'agg_count_log',
'price_log',
'review_rating_location',
'square_feet_log',
'person_capacity',
'review_length_log',
'crime_words_perc_old',
'sentiment_neg_avg',
'lda_pca1',
'lda_pca2',
'd2v_pca1',
'd2v_pca2', 
]

for f in features:
    for f2 in features:
        if f == f2:
            continue
        print(f, f2, stats.pearsonr(data[f], data[f2]))

listings_count_log agg_count_log (0.905672842276378, 3.1810324867512335e-302)
listings_count_log price_log (0.42596726501111043, 6.580180418502108e-37)
listings_count_log review_rating_location (0.22818676773472635, 5.4189204475089946e-11)
listings_count_log square_feet_log (-0.07263429863971645, 0.03912178410065565)
listings_count_log person_capacity (-0.010935270246944365, 0.7564285428404702)
listings_count_log review_length_log (0.15153326487788774, 1.538983725914427e-05)
listings_count_log crime_words_perc_old (0.7794847878478284, 1.0819781743014118e-165)
listings_count_log sentiment_neg_avg (0.054120592332816415, 0.12449255825867232)
listings_count_log lda_pca1 (-0.37665507956442, 1.3337590588658306e-28)
listings_count_log lda_pca2 (-0.3271548175836733, 1.3853518525685724e-21)
listings_count_log d2v_pca1 (0.4116455340670486, 2.377288210607371e-34)
listings_count_log d2v_pca2 (0.028212715259475073, 0.4234926061269234)
agg_count_log listings_count_log (0.905672842276378, 3.181032486

In [5]:
data.columns

Index(['year', 'zipcode', 'agg_count_log', 'price_log', 'person_capacity',
       'review_rating_location', 'review_length_log', 'crime_words_perc_old',
       'sentiment_neg_avg', 'lda2', 'lda3', 'lda4', 'lda5', 'lda_pca1',
       'lda_pca2', 'd2v_pca1', 'd2v_pca2', 'crime_score', 'crime_score_log',
       'race_index', 'gini_index', 'edu_bachelors', 'age_25_34',
       'square_feet_log', 'year_2012', 'year_2013', 'year_2014', 'year_2015',
       'year_2016', 'year_2017', 'listings_count_log', 'gentrifying',
       'non_gentrifying', 'higher_income', 'crime_score_norm',
       'gentrification_score'],
      dtype='object')

In [6]:
pred = ['gentrification_score']

In [7]:
num_df = pd.DataFrame(columns=['feature']+pred)
r_df = pd.DataFrame(columns=['feature']+pred)
pr_df = pd.DataFrame(columns=['feature']+pred)
s_df = pd.DataFrame(columns=['feature']+pred)
ps_df = pd.DataFrame(columns=['feature']+pred)

In [8]:
for f in features:
    new_num = {'feature':f}
    new_r = {'feature':f}
    new_pr = {'feature':f}
    new_s = {'feature':f}
    new_ps = {'feature':f}
    
    print(f)
    for p in pred:
        curr = data.loc[data[f].notna() & data[p].notna()]
        x = curr[f].values
        y = curr[p].values
        n = len(curr)
        
        r, pr = stats.pearsonr(x, y)

        s, ps = stats.spearmanr(x, y)
        
        new_num[p] = n
        new_r[p] = r
        new_pr[p] = pr
        new_s[p] = s
        new_ps[p] = ps
        
    num_df = num_df.append(new_num, ignore_index=True)
    r_df = r_df.append(new_r, ignore_index=True)
    pr_df = pr_df.append(new_pr, ignore_index=True)
    s_df = s_df.append(new_s, ignore_index=True)
    ps_df = ps_df.append(new_ps, ignore_index=True)

listings_count_log
agg_count_log
price_log
review_rating_location
square_feet_log
person_capacity
review_length_log
crime_words_perc_old
sentiment_neg_avg
lda_pca1
lda_pca2
d2v_pca1
d2v_pca2


In [9]:
# Pearson's Linear Correlation Coefficient

r_df

Unnamed: 0,feature,gentrification_score
0,listings_count_log,0.201684
1,agg_count_log,0.250985
2,price_log,0.723018
3,review_rating_location,0.719863
4,square_feet_log,0.101439
5,person_capacity,0.03889
6,review_length_log,0.073817
7,crime_words_perc_old,0.291775
8,sentiment_neg_avg,-0.198509
9,lda_pca1,-0.551582


In [10]:
# Pearson's Linear Correlation Coefficient: P-Values

pr_df

Unnamed: 0,feature,gentrification_score
0,listings_count_log,7.469268e-09
1,agg_count_log,4.655609e-13
2,price_log,1.917887e-131
3,review_rating_location,8.791321e-130
4,square_feet_log,0.003918609
5,person_capacity,0.2698155
6,review_length_log,0.03602908
7,crime_words_perc_old,2.6598790000000002e-17
8,sentiment_neg_avg,1.291895e-08
9,lda_pca1,1.974539e-65


In [11]:
# Spearman's Correlation Coefficient

s_df

Unnamed: 0,feature,gentrification_score
0,listings_count_log,0.225571
1,agg_count_log,0.251679
2,price_log,0.654921
3,review_rating_location,0.745172
4,square_feet_log,-0.002148
5,person_capacity,0.099319
6,review_length_log,0.060764
7,crime_words_perc_old,0.285778
8,sentiment_neg_avg,-0.215846
9,lda_pca1,-0.536806


In [12]:
# Spearman's Correlation Coefficient: P-Values

ps_df

Unnamed: 0,feature,gentrification_score
0,listings_count_log,9.066153e-11
1,agg_count_log,3.997119e-13
2,price_log,5.036532e-100
3,review_rating_location,8.523596e-144
4,square_feet_log,0.9514079
5,person_capacity,0.00474238
6,review_length_log,0.0845116
7,crime_words_perc_old,1.242196e-16
8,sentiment_neg_avg,5.81475e-10
9,lda_pca1,2.001852e-61
