In [None]:
# File: dtv.ipynb -- Doc2Vec
# Author: Shomik Jain
# Date: 2/02/2020

In [None]:
import pandas as pd
import multiprocessing
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import gensim

In [None]:
# Read the reviews TSV: tab-separated, double-quoted fields, backslash as escape character.
file = '../../../../Google Drive File Stream/My Drive/Airbnb/New_York_Data/nyc_reviews.tsv'
actual_data = (
    pd.read_csv(file, sep='\t', quotechar='"', escapechar='\\')
    # Optional single-year filter, currently disabled:
    #   .loc[lambda frame: frame['year'] == 2017]
    .dropna(subset=['reviews_clean'])   # keep only rows that have cleaned review text
    .reset_index(drop=True)             # renumber the surviving rows 0..n-1
)

# Number of reviews kept.
print(actual_data.shape[0])

In [None]:
# Read the zipcode TSV: tab-separated, double-quoted fields, backslash as escape character.
file = '../../../../Google Drive File Stream/My Drive/Airbnb/New_York_Data/nyc_zipcode_all.tsv'
actual_zipcode = pd.read_csv(
    file,
    sep='\t',
    quotechar='"',
    escapechar='\\',
)

# Optional single-year filter, currently disabled:
#actual_zipcode = actual_zipcode[actual_zipcode['year']==2017]
#actual_zipcode = actual_zipcode.reset_index(drop=True)

# Number of rows loaded.
print(actual_zipcode.shape[0])

In [None]:
# Set up reviews to map

# Guarantee that every row has review text and that the index runs 0..n-1.
actual_data = actual_data.dropna(subset=['reviews_clean']).reset_index(drop=True)

process = actual_data['reviews_clean']

# Split each review on whitespace into a list of tokens.
reviews = [text.split() for text in process]

# Tag every token list with its position in `reviews`.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(reviews)
]

In [None]:
# Set up multithreading

# Worker count handed to Doc2Vec: one per CPU reported by the OS.
cores = multiprocessing.cpu_count()

# Abort unless gensim reports a non-negative FAST_VERSION; per the assertion
# message, training would otherwise be painfully slow.
assert gensim.models.doc2vec.FAST_VERSION >= 0, "This will be painfully slow otherwise"

In [None]:
# Train Model and Record Mappings

# Work on copies of the loaded frames.
data = actual_data.copy()
zipcode = actual_zipcode.copy()

# Size of each document vector (also the number of dtv_* columns).
s=25

print('starting training')
model = Doc2Vec(
    documents,
    vector_size=s,
    window=3,
    min_count=100,
    workers=cores,
    epochs=10,
)
print('ending training')

# One vector per row of `data`, looked up by integer tag.
vectors = [list(model.docvecs[row]) for row in range(len(data))]

# Column names dtv_0 .. dtv_{s-1}, one per vector dimension.
dtv_cols = ['dtv_' + str(dim) for dim in range(s)]

In [None]:
# Average Over Zipcodes
#
# Adds the document vectors to `data` as dtv_* columns, averages them for every
# (zipcode, year) pair taken from `zipcode`, and left-merges the averages back
# onto `zipcode`. Pairs with no matching reviews get NaN in every dtv_* column.

all_cols = ['zipcode', 'year'] + dtv_cols

# Attach each review's document vector to its row, one column per dimension.
for i in dtv_cols:
    data[i] = np.nan
data.loc[:, dtv_cols] = vectors

unique_years = zipcode['year'].unique()
unique_zipcodes = zipcode['zipcode'].unique()

# Collect one record per (zipcode, year) pair and build the frame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it copied
# the whole frame on every call.
records = []
for z in unique_zipcodes:
    # Filter by zipcode once per zipcode instead of once per (zipcode, year) pair.
    in_zipcode = data.loc[data['zipcode'] == z]

    for y in unique_years:
        curr = in_zipcode.loc[in_zipcode['year'] == y]

        new = {}
        new['year'] = y
        new['zipcode'] = z

        for c in dtv_cols:
            # No reviews for this pair: record NaN directly rather than letting
            # np.nanmean warn about an empty slice.
            new[c] = np.nanmean(curr[c]) if len(curr) else np.nan

        records.append(new)

answer = pd.DataFrame(records, columns=all_cols)

# Bring the merge keys to plain integer dtypes on both sides before joining.
answer['zipcode'] = pd.to_numeric(answer['zipcode'], downcast='integer')
zipcode['zipcode'] = pd.to_numeric(zipcode['zipcode'], downcast='integer')
answer['year'] = pd.to_numeric(answer['year'], downcast='integer')
zipcode['year'] = pd.to_numeric(zipcode['year'], downcast='integer')

# Drop dtv_* columns left over from a previous run of this cell; otherwise the
# merge would produce dtv_*_x / dtv_*_y columns instead of dtv_*.
zipcode = zipcode.drop(columns=dtv_cols, errors='ignore')
zipcode = pd.merge(zipcode, answer, how='left', on=['zipcode', 'year'])

In [None]:
# Compress to 2 Dimensions
#
# Projects the averaged dtv_* vectors in `zipcode` onto their first two
# principal components and stores them as the 'pca1' and 'pca2' columns.

to_compress = zipcode[dtv_cols]
to_compress = np.asarray(to_compress.values, dtype=float)

# Rows whose (zipcode, year) pair had no reviews hold NaN vectors after the left
# merge, and PCA rejects NaN input. Fit on the complete rows only; the other rows
# keep NaN components. With no NaN rows this is the same as fitting on everything.
has_vector = ~np.isnan(to_compress).any(axis=1)

pca = PCA(n_components=2)
compress = np.full((len(to_compress), 2), np.nan)
compress[has_vector] = pca.fit_transform(to_compress[has_vector])

print('pca variance', pca.explained_variance_ratio_)

# Keep the per-row component lists, aligned with the rows of `zipcode`.
pca1 = [i[0] for i in compress]
pca2 = [i[1] for i in compress]

zipcode.loc[:, 'pca1'] = pca1
zipcode.loc[:, 'pca2'] = pca2