In [1]:
# unstructured_features.py -- Calculate unstructured text features per review: LDA topic scores, Doc2Vec (D2V) embeddings, and dictionary word counts (crime, location, business, social)

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import multiprocessing
import gensim
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import Counter

In [3]:
# Choose the city to process and read its cleaned reviews from disk.
city = 'london'
reviews_file = f'../../../Data/reviews/reviews_clean/reviews_clean_{city}.csv'
data = pd.read_csv(reviews_file)

In [4]:
# Preview the first rows of the cleaned reviews to sanity-check the load.
# NOTE(review): the rendered output includes reviewer names -- clear it before sharing.
data.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,year,language,reviews_clean
0,13913,4847959,2013-05-28,6405442,Vera,2013,en,happi alina guest great time london enjoy stay...
1,13913,8142329,2013-10-17,9195551,Honi,2013,en,stay alina flat london week wonder warm feel a...
2,13913,11876590,2014-04-17,5194009,Alessandro,2014,en,alina perfect guest flat absolut wonder high r...
3,13913,46669566,2015-09-12,42970248,Oleh,2015,en,alina flat except atmospher place flat plenti ...
4,13913,64559033,2016-03-05,45337884,Mo,2016,en,hous piec art beauti portrait close tube stati...


In [5]:
# Read the per-review feature table that this notebook extends and writes back.
out_file = f'../../../Data/reviews/reviews_data/reviews_data_{city}.csv'
output = pd.read_csv(out_file)

In [6]:
# Preview the feature table as loaded from disk.
output.head()

Unnamed: 0,id,listing_id,year,month,day,sent_comp,sent_pos,sent_neg,sent_neu,lda1,lda2,lda3,lda4,lda5
0,4847959,13913,2013,5,28,0.9954,0.302,0.0,0.698,0.280112,0.0,0.069759,0.428841,0.21818
1,8142329,13913,2013,10,17,0.9623,0.272,0.0,0.728,0.0,0.0,0.086864,0.883986,0.0
2,11876590,13913,2014,4,17,0.8764,0.501,0.0,0.499,0.022306,0.022497,0.022692,0.910053,0.022453
3,46669566,13913,2015,9,12,0.9826,0.32,0.0,0.68,0.0,0.0,0.616192,0.243901,0.127574
4,64559033,13913,2016,3,5,0.9127,0.25,0.0,0.75,0.308462,0.011283,0.162193,0.506747,0.011315


In [7]:
# Row counts of both frames before filtering; the next cell applies a mask
# built from `data` to `output`, so the two are expected to match.
print(len(data), len(output))

447979 447979


In [8]:
# Keep only the rows whose cleaned review text is present. The mask is built
# once from `data` and applied to both frames, then both get a fresh 0..n-1 index.
# NOTE(review): assumes `output` rows line up with `data` rows -- confirm.
has_text = data['reviews_clean'].notna()

output = output[has_text].reset_index(drop=True)
data = data[has_text].reset_index(drop=True)

In [9]:
# Row counts of both frames after dropping reviews with missing text.
print(len(data), len(output))

447979 447979


In [10]:
# Write the filtered frames back to the same paths they were read from, so
# both CSV files on disk are overwritten in place. index=False keeps the row
# index out of the files.
data.to_csv(reviews_file, index=False)
output.to_csv(out_file, index=False)

# LDA (topic scores per review)

In [11]:
# Number of latent topics the LDA model is fit with; one score per topic is
# stored for every review further down.
num_topics = 5

In [12]:
# Split every cleaned review on whitespace, giving one token list per review.
reviews = data['reviews_clean']
bow = [text.split() for text in tqdm(reviews)]

100%|██████████| 447979/447979 [00:03<00:00, 118219.50it/s]


In [13]:
# Build the token vocabulary from the tokenised reviews.
dictionary = gensim.corpora.Dictionary(bow)

# Remove words appearing in fewer than 100 reviews.
# NOTE(review): filter_extremes also applies its own defaults for `no_above`
# and `keep_n`, so very frequent tokens may be pruned as well -- confirm
# against the gensim documentation.
dictionary.filter_extremes(no_below=100)

In [14]:
# Convert each tokenised review to its bag-of-words form using the filtered
# dictionary; this is the corpus the LDA model is trained and scored on.
bow_corpus = [dictionary.doc2bow(doc) for doc in bow]

In [15]:
# Run LDA
# Fit the topic model on the bag-of-words corpus. random_state pins the
# initialisation so that topic numbering (and therefore the meaning of the
# per-topic score columns) does not change arbitrarily between runs.
# NOTE(review): with workers > 1 the result may still vary slightly from run
# to run -- confirm against the gensim documentation if exact reproducibility
# is required.
print('starting LDA')

lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=5,
    workers=2,
    random_state=42,
)

print('lda done')

starting LDA
lda done


In [16]:
# Print the 20 highest-weighted words of every fitted topic.
for topic_id, description in lda_model.print_topics(num_topics=-1, num_words=20):
    print(f"Topic: {topic_id} \nWords: {description}")
    print("\n")

print()

Topic: 0 
Words: 0.057*"station" + 0.053*"walk" + 0.039*"minut" + 0.029*"london" + 0.026*"tube" + 0.025*"bus" + 0.021*"place" + 0.017*"close" + 0.017*"min" + 0.015*"5" + 0.015*"10" + 0.015*"train" + 0.015*"hous" + 0.014*"stay" + 0.012*"underground" + 0.012*"nice" + 0.010*"locat" + 0.010*"2" + 0.009*"clean" + 0.008*"citi"


Topic: 1 
Words: 0.058*"great" + 0.048*"locat" + 0.047*"nice" + 0.044*"good" + 0.044*"clean" + 0.038*"place" + 0.033*"stay" + 0.027*"room" + 0.025*"apart" + 0.023*"host" + 0.020*"easi" + 0.018*"flat" + 0.017*"communic" + 0.017*"close" + 0.015*"check" + 0.015*"comfort" + 0.015*"recommend" + 0.012*"help" + 0.012*"london" + 0.011*"transport"


Topic: 2 
Words: 0.029*"great" + 0.027*"flat" + 0.023*"locat" + 0.021*"restaur" + 0.021*"love" + 0.020*"stay" + 0.019*"london" + 0.018*"apart" + 0.015*"shop" + 0.014*"close" + 0.014*"area" + 0.013*"walk" + 0.012*"park" + 0.011*"perfect" + 0.010*"recommend" + 0.010*"street" + 0.010*"quiet" + 0.010*"place" + 0.009*"high" + 0.009*"co

In [None]:
print('getting scores per review')
all_scores = []

# Iterate over the corpus directly (no enumerate) so tqdm can read its length
# and show a total and ETA; the row number was never used.
for review in tqdm(bow_corpus):
    # Start from zero for every topic; topics the model does not return for
    # this review keep a score of 0.0.
    scores = [0.0] * num_topics
    # lda_model[review] yields (topic_id, probability) pairs.
    # NOTE(review): gensim presumably omits topics below its minimum
    # probability threshold here -- confirm if exact zeros matter downstream.
    for topic_id, prob in lda_model[review]:
        scores[topic_id] = prob
    all_scores.append(scores)

162it [00:00, 1613.76it/s]

getting scores per review


417637it [03:20, 2489.88it/s]

In [None]:
# One column per topic, named lda1..ldaN. The names are derived from
# num_topics so that changing the number of topics cannot leave the column
# list out of step with the width of all_scores.
lda_cols = ['lda' + str(k + 1) for k in range(num_topics)]

# Assign column by column; this works whether or not the lda* columns already
# exist in `output`, and pandas raises if all_scores and output differ in length.
for k, col in enumerate(lda_cols):
    output[col] = [row[k] for row in all_scores]

In [None]:
# Inspect the feature table after the LDA score columns were filled in.
output.head()

In [None]:
# Checkpoint: overwrite the feature CSV so the LDA scores are saved before the next stage.
output.to_csv(out_file, index=False)

# D2V (Doc2Vec review embeddings)

In [8]:
# Set up reviews to map
# Each cleaned review is split on whitespace and wrapped in a TaggedDocument
# whose single tag is the review's position in `data`.

process = data['reviews_clean']

reviews = [text.split() for text in tqdm(process)]

documents = [TaggedDocument(tokens, [tag]) for tag, tokens in enumerate(reviews)]

100%|██████████| 447979/447979 [00:02<00:00, 182107.88it/s]


In [10]:
# Set up multithreading
# The CPU count is passed to Doc2Vec as its number of worker threads below.

cores = multiprocessing.cpu_count()
# Disabled guard: presumably it aborted when gensim's fast (compiled) training
# routines were unavailable -- confirm before re-enabling.
#assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

In [11]:
# Train Model
# Fit Doc2Vec on the tagged reviews so every review gets an s-dimensional vector.
# NOTE(review): training runs with workers=cores; multi-threaded training may
# not be bit-for-bit reproducible -- confirm if stable vectors are required.

s=25  # vector size; also the number of dtv_* columns created further down

print('starting training')
model = Doc2Vec(documents, vector_size=s, window=3, min_count=100, workers=cores, epochs=10)
print('ending training')

starting training
ending training


In [12]:
# Record Mappings
# Look up each review's trained vector by its integer tag (its row position
# in `data`) and store it as a plain list.

vectors = [list(model.docvecs[row]) for row in tqdm(range(len(data)))]

100%|██████████| 447979/447979 [00:02<00:00, 155089.34it/s]


In [13]:
# Column names dtv_1 .. dtv_s, one per embedding dimension.
dtv_cols = [f'dtv_{dim}' for dim in range(1, s + 1)]

# Create the columns first so the block assignment below has targets to fill.
for col in dtv_cols:
    output[col] = np.nan

output.loc[:, dtv_cols] = vectors

In [14]:
# Inspect the feature table after the dtv_* embedding columns were added.
output.head()

Unnamed: 0,id,listing_id,year,month,day,sent_comp,sent_pos,sent_neg,sent_neu,lda1,...,dtv_16,dtv_17,dtv_18,dtv_19,dtv_20,dtv_21,dtv_22,dtv_23,dtv_24,dtv_25
0,4847959,13913,2013,5,28,0.9954,0.302,0.0,0.698,0.280112,...,0.111499,-0.094669,0.220386,0.036887,-0.018792,0.1812,-0.015888,0.01792,-0.203862,0.132382
1,8142329,13913,2013,10,17,0.9623,0.272,0.0,0.728,0.0,...,0.064924,-0.051245,0.132313,0.067979,-0.129214,-0.045868,-0.067976,0.026742,-0.046301,0.108684
2,11876590,13913,2014,4,17,0.8764,0.501,0.0,0.499,0.022306,...,0.059626,-0.186169,0.043521,0.008656,-0.130722,-0.013105,0.074104,0.000356,0.016121,-0.009348
3,46669566,13913,2015,9,12,0.9826,0.32,0.0,0.68,0.0,...,-0.199562,-0.305227,0.053984,0.061121,0.051538,0.175705,0.12529,-0.15789,-0.132678,-0.249155
4,64559033,13913,2016,3,5,0.9127,0.25,0.0,0.75,0.308462,...,0.06533,-0.137538,-0.037017,-0.028817,-0.065899,0.225425,0.132233,0.04278,0.006165,-0.029153


In [15]:
# Checkpoint: overwrite the feature CSV so the Doc2Vec columns are saved.
output.to_csv(out_file, index=False)

# Words (dictionary word-count features)

In [16]:
# Hand-curated lexicon of crime-related words. Entries are lower-cased and
# stemmed later in the notebook before being matched against review tokens,
# so letter case here is irrelevant and each word must appear only once:
# a repeated entry would be counted twice for every occurrence in a review.
# (The former lowercase 'assault' entry duplicated 'Assault' and was removed.)
crime_words = [
    'Abuse',
    'Accomplice',
    'Accuse',
    'Activists',
    'Against',
    'Aggravated',
    'Alarm',
    'Alert',
    'Allegation',
    'Ammunition',
    'APB',
    'Armed',
    'Arraignment',
    'Arrest',
    'Arsenal',
    'Arson',
    'Assailant',
    'Assault',
    'Attack',
    'Autopsy',
    'Bail',
    'Battery',
    'Beat',
    'Blackmail',
    'Blood',
    'Bomb',
    'Brawl',
    'Breach',
    'Break',
    'Bribe',
    'Brutal',
    'Bully',
    'Burglary',
    'Bystander',
    'Capture',
    'Caution',
    'Coercion',
    'Collusion',
    'Combat',
    'Complain',
    'Conspiracy',
    'Convict',
    'Cops',
    'Coroner',
    'Corrupt',
    'Counterfeit',
    'CIA',
    'Crime',
    'Criminal',
    'Criminology',
    'Cuffs',
    'Custody',
    'Damage',
    'Danger',
    'Dangerous',
    'Dead',
    'Death',
    'Defense',
    'Deputy',
    'Detain',
    'Detective',
    'Disorderly',
    'Dispatch',
    'DNA',
    'Drugs',
    'Emergency',
    'Evasive',
    'Eviction',
    'Evil',
    'Explosives',
    'Extradition',
    'Fatality',
    'FBI',
    'Felony',
    'Fight',
    'Fingerprint',
    'Firebombing',
    'Flee',
    'Forensics',
    'Forgery',
    'Fraud',
    'Gory',
    'Guard',
    'Gun',
    'Handcuffs',
    'Harassment',
    'Homeless',
    'Harm',
    'Heinous',
    'Hijack',
    'Holster',
    'Homicide',
    'Hostage',
    'Illegal',
    'Immoral',
    'Immunity',
    'Impeach',
    'Imprison',
    'Incarceration',
    'Incriminating',
    'Indictment',
    'Injury',
    'Inmate',
    'Intruder',
    'Invasive',
    'Investigation',
    'Jail',
    'Juvenile',
    'Kidnapping',
    'Kill',
    'Killer',
    'Larceny',
    'Legal',
    'Lynch',
    'Mace',
    'Malice',
    'Malpractice',
    'Manacled',
    'Manslaughter',
    'Misdemeanor',
    'Murder',
    'Murderer',
    '911',
    'Offender',
    'Offense',
    'Officer',
    'Patrol',
    'Perjury',
    'Perpetrator',
    'Plea',
    'Police',
    'Prison',
    'Probation',
    'Prosecute',
    'Prosecutor',
    'Prostitution',
    'Radar',
    'Rape',
    'Riot',
    'Robbery',
    'Rogue',
    'Safe',
    'Sanction',
    'Sergeant',
    'Shackles',
    'Sheriff',
    'Shooting',
    'Smuggling',
    'Spying',
    'Subpoena',
    'Summons',
    'Surveillance',
    'Suspect',
    'Suspicious',
    'Terrorism',
    'Theft',
    'Threatening',
    'Torture',
    'Trauma',
    'Unauthorized',
    'Unlawful',
    'Vagrancy',
    'Vandalism',
    'Victim',
    'Violation',
    'Violence',
    'Warning',
    'Weapon',
]

In [17]:
# Mapping from feature name to its word list, seeded with the crime lexicon.
words_dict = {'crime_words': crime_words}

In [18]:
# Load the word lexicon; the cells below use its `word`, `cat_lev1` and
# `cat_lev2` columns. The path is relative to the working directory.
words = pd.read_csv('airbnb_words.csv')

In [19]:
# Preview the lexicon to see its word and category columns.
words.head()

Unnamed: 0,word,cat_lev1,cat_lev2,cat_lev3,ndf
0,information,business,professional_conduct_host,advice,0.008904
1,recommendations,business,professional_conduct_host,advice,0.008375
2,tips,business,professional_conduct_host,advice,0.008399
3,advice,business,professional_conduct_host,advice,0.007568
4,suggestions,business,professional_conduct_host,advice,0.00758


In [20]:
# Each feature's word list is the lexicon rows whose category column equals
# the given label: feature name -> (category column, label).
category_filters = {
    'location_words': ('cat_lev2', 'location'),
    'business_words': ('cat_lev1', 'business'),
    'social_words': ('cat_lev1', 'social'),
}
for list_name, (level_col, label) in category_filters.items():
    words_dict[list_name] = list(words[words[level_col]==label]['word'].values)

In [21]:
# English Snowball stemmer, shared by stem() below to normalise lexicon words.
from nltk.stem import SnowballStemmer
snowball=SnowballStemmer("english")

In [22]:
def stem(w: str) -> str:
    """Return the English Snowball stem of ``w``.

    Thin wrapper around the module-level ``snowball`` stemmer; the word is
    passed through unchanged, so callers lower-case it first if needed.
    """
    return snowball.stem(w)

In [23]:
# Lower-case and stem every lexicon word so it can be matched against the
# already-stemmed review tokens.
# The loop variable is deliberately not called `words`: that name holds the
# lexicon DataFrame, and overwriting it breaks a re-run of the cell that
# builds the category lists.
for cat, raw_words in list(words_dict.items()):
    stemmed = [stem(w.lower()) for w in raw_words]
    # Different entries can collapse to the same stem (e.g. 'assault' and
    # 'Assault'); keep each stem once, in first-seen order, so an occurrence
    # in a review is not counted several times.
    words_dict[cat] = list(dict.fromkeys(stemmed))

In [24]:
# Add Words Features
# For every review, record its token count and, per word category, how many
# of its tokens belong to that category's (stemmed) word list.

review_len = []
feature = {cat: [] for cat in words_dict}

# Iterate over the text column directly: it avoids the per-row overhead of
# iterrows() and lets tqdm show a total. Local names are chosen so the
# globals `bow` (LDA token lists) and `words` (lexicon frame) are not overwritten.
for text in tqdm(data['reviews_clean']):
    tokens = text.lower().split()
    review_len.append(len(tokens))

    # Counter returns 0 for tokens that never occur, so no membership test is needed.
    token_counts = Counter(tokens)

    for cat, lexicon in words_dict.items():
        feature[cat].append(sum(token_counts[w] for w in lexicon))

447979it [02:37, 2845.13it/s]


In [25]:
# Confirm that a count list was built for every word category.
feature.keys()

dict_keys(['crime_words', 'location_words', 'business_words', 'social_words'])

In [26]:
# Store the token count per review, then for each category its raw match
# count and that count as a share of the review's tokens.
output['review_len'] = review_len
for cat, cat_counts in feature.items():
    perc_col = '{}_perc'.format(cat)
    output[cat] = cat_counts
    output[perc_col] = output[cat] / output['review_len']

In [27]:
# Inspect the feature table after the word-count and *_perc columns were added.
output.head()

Unnamed: 0,id,listing_id,year,month,day,sent_comp,sent_pos,sent_neg,sent_neu,lda1,...,dtv_25,review_len,crime_words,crime_words_perc,location_words,location_words_perc,business_words,business_words_perc,social_words,social_words_perc
0,4847959,13913,2013,5,28,0.9954,0.302,0.0,0.698,0.280112,...,0.132382,67,0,0.0,14,0.208955,31,0.462687,0,0.0
1,8142329,13913,2013,10,17,0.9623,0.272,0.0,0.728,0.0,...,0.108684,20,0,0.0,1,0.05,5,0.25,8,0.4
2,11876590,13913,2014,4,17,0.8764,0.501,0.0,0.499,0.022306,...,-0.009348,8,0,0.0,0,0.0,2,0.25,0,0.0
3,46669566,13913,2015,9,12,0.9826,0.32,0.0,0.68,0.0,...,-0.249155,32,0,0.0,1,0.03125,13,0.40625,0,0.0
4,64559033,13913,2016,3,5,0.9127,0.25,0.0,0.75,0.308462,...,-0.029153,19,0,0.0,4,0.210526,9,0.473684,4,0.210526


In [28]:
# Final write: overwrite the feature CSV with all columns added in this notebook.
output.to_csv(out_file, index=False)