In [1]:
%%time
import textacy
from django.db.models import F, Expression
from api.filters import random_sample, get_filter_query
from itertools import chain
import pickle

stop_words = pickle.load(open('../pickles/stopwords.p', 'rb'))

CPU times: user 1.24 s, sys: 358 ms, total: 1.6 s
Wall time: 2.21 s


In [None]:
filters = {
#    'max_sample_size': 5000,
    'numerical_range': [
        {
            'attribute_name': 'price',
            'min': None,
            'max': None
        },
        {
            'attribute_name': 'accommodates',
            'min': None,
            'max': None
        },
        {
            'attribute_name': 'estimated_revenue_per_month',
            'min': 1,
            'max': None
        }
    ],
    'region': {
        'region_type': 'neighborhood',
        'id': 106
    }
}
query = get_filter_query(filters)

In [None]:
%%time
# Build queryset and inverse queryset
queryset = Listing.objects.only('description').filter(query)
excluded = Listing.objects.only('description').exclude(id__in=(e['id'] for e in queryset.values('id')))

# Take a random sample of each
queryset = random_sample(queryset, 250)
excluded = random_sample(excluded, 100)

# Build input features
all_docs = list()
bools = list()
for l in chain(queryset, excluded):
    # Generate a list of lists of tokens, excluding stop words
    all_docs.append(filter(
        lambda w: w not in stop_words,
        textacy.preprocess_text(
            l.description, 
            lowercase=True, 
            no_punct=True, 
            no_numbers=True).split()
        )
    )
    bools.append(l in queryset)

    
    
    
###
###
### MUST DO SOME ERROR PREVENTION (EMPTY QUERYSETS, ETC.)

# Train model and output results
key_terms = textacy.keyterms.most_discriminating_terms(
    terms_lists=all_docs, bool_array_grp1=bools, top_n_terms=20)

In [None]:
key_terms

In [None]:
# Quick experiment: do weighted replication of words
# for online word cloud generator
duplicated_words = [
    (word + ' ') * (30-i)
    for i, word
    in enumerate(key_terms)
]
print(duplicated_words)

In [4]:
"""
Generating the doc_term_matrix takes a while. 
A pickled matrix from the full listing set is in ../pickles/
"""

#sample_listings = random_sample(Listing.objects.all(), 5000)
listing_df = pickle.load(open('../pickles/listings_dataframe.p', 'rb'))
english_listing_ids = english_listing_ids = list(listing_df[listing_df.is_english].id)
english_listings = Listing.objects.filter(id__in=english_listing_ids).order_by('id')

doc_stream = (
    textacy.Doc(textacy.preprocess_text(
        l.description, 
        lowercase=True, 
        no_punct=True, 
        no_numbers=True,
        no_contractions=True
    ))
    for l in english_listings
)

corpus = textacy.Corpus('en', docs=doc_stream)

doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
        for doc in corpus),
    weighting='tfidf', normalize=True, smooth_idf=True, min_df=2, max_df=0.95)

In [9]:
model = textacy.tm.TopicModel('nmf', n_topics=50) # non-negative matrix factorization method
model.fit(doc_term_matrix)



In [16]:
"""
Pickle everything if the topics are coherent!
"""
pickle_obj = {
    'model': model,
    'id2term': id2term,
    'doc_term_matrix': doc_term_matrix,
    'doc_topic_matrix': model.get_doc_topic_matrix(doc_term_matrix)
}

pickle.dump(pickle_obj, open('../pickles/topic_model_with_extras.p', 'wb'))

  return doc_topic_matrix / np.sum(doc_topic_matrix, axis=1, keepdims=True)


In [10]:
# Print the topics
for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=10):
    print('topic', topic_idx, ':', '   '.join(top_terms),'\n')

topic 0 : bedroom   master   bathroom   large   spacious   balcony   living   bath   2nd   closet 

topic 1 : place   will   love   close   ambiance   outdoors   coziness   comfy   people   adventurer 

topic 2 : santa   monica   3rd   promenade   pier   ucla   beach   brentwood   westwood   street 

topic 3 : venice   kinney   abbot   beach   boardwalk   canal   block   bike   abbott   famous 

topic 4 : number   bath   sleep   bedroom   ft   sq   foot   block   people   car 

topic 5 : apartment   building   locate   entire   bedroom   spacious   heart   complex   one   balcony 

topic 6 : view   ocean   mountain   deck   amazing   malibu   balcony   hill   canyon   spectacular 

topic 7 : hollywood   fame   chinese   walk   theatre   sign   west   theater   heart   attraction 

topic 8 : beverly   hill   rodeo   drive   west   ucla   century   westwood   dr   hollywood 

topic 9 : parking   free   street   wifi   available   spot   car   garage   laundry   plenty 

topic 10 : room  

In [59]:
# Build dataframe with listing id-topic info
columns = ['id']
for i in range(0, model.n_topics):
    columns.append('topic_%d' % i)
data = list()
for i, topic_array in enumerate(doc_topic_matrix):
    row = [english_listing_ids[i],] + list(topic_array)
    data.append(row)
listing_topic_df = pd.DataFrame(data=data, columns=columns)
listing_topic_df.fillna(0) # Some docs don't hit any topics; let's just fill with 0

pickle.dump(listing_topic_df, open('../pickles/listing_topic_df.p', 'wb'))

Unnamed: 0,id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_40,topic_41,topic_42,topic_43,topic_44,topic_45,topic_46,topic_47,topic_48,topic_49
0,2949716,0.000000,0.000000,0.000000,0.000000,0.010969,0.066518,0.000000,0.000000,0.033488,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.021859
1,1314036,0.000000,0.000000,0.000000,0.106270,0.107532,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.112288,0.000000,0.000000,0.000000,0.000000,0.012813,0.000000
2,5115184,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,12739885,0.000000,0.000000,0.091055,0.004322,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.005290,0.134957,0.000000,0.096886,0.000000,0.000000,0.000000,0.061470
4,9954642,0.000000,0.036089,0.000000,0.000000,0.046919,0.000000,0.000000,0.019651,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.104585,0.081142,0.066066
5,6779990,0.000000,0.000000,0.000000,0.000000,0.014438,0.000000,0.006018,0.156261,0.118923,...,0.000000,0.000000,0.025567,0.000000,0.085798,0.000000,0.000000,0.000000,0.021371,0.046629
6,13247489,0.017993,0.000000,0.000000,0.000000,0.076444,0.000000,0.000100,0.034057,0.023838,...,0.000000,0.000000,0.000000,0.000000,0.088253,0.000000,0.083100,0.000000,0.000000,0.000000
7,7619364,0.000000,0.000000,0.000000,0.000000,0.013434,0.000256,0.000000,0.018245,0.026946,...,0.038504,0.000000,0.018151,0.000000,0.000000,0.170479,0.027927,0.000000,0.000677,0.000000
8,8559784,0.025415,0.010797,0.076836,0.033126,0.130678,0.020336,0.000000,0.000000,0.000000,...,0.084603,0.000000,0.017333,0.004730,0.016521,0.083573,0.000000,0.011599,0.010154,0.039756
9,13138428,0.000000,0.041294,0.000000,0.000000,0.000000,0.000000,0.000000,0.016339,0.001428,...,0.019614,0.000000,0.000000,0.000000,0.000000,0.000000,0.026137,0.000000,0.029246,0.000000
