In [57]:
%%time
import textacy
from django.db.models import F, Expression
from api.filters import random_sample, get_filter_query
from itertools import chain
import pickle

# Load the pickled stop-word collection used for token filtering below.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code -- only load this
# file if it comes from a trusted source.
with open('../pickles/stopwords.p', 'rb') as stopwords_file:
    stop_words = pickle.load(stopwords_file)

CPU times: user 491 µs, sys: 724 µs, total: 1.21 ms
Wall time: 22.2 ms


In [49]:
# Filter specification handed to get_filter_query:
#   * 'numerical_range' -- one entry per attribute, with optional min/max
#     bounds (None leaves that side unset here).
#   * 'region'          -- a single region identified by type and id.
# NOTE(review): the exact semantics of each key are defined by
# api.filters.get_filter_query -- confirm there before changing values.
filters = {
#    'max_sample_size': 5000,
    'numerical_range': [
        {'attribute_name': 'price', 'min': None, 'max': None},
        {'attribute_name': 'accommodates', 'min': None, 'max': None},
        {'attribute_name': 'estimated_revenue_per_month', 'min': 1, 'max': None},
    ],
    'region': {'region_type': 'neighborhood', 'id': 106},
}
query = get_filter_query(filters)

In [50]:
%%time
# Build the target queryset and its complement (every listing the filter
# does not match). Only the description column is needed downstream.
queryset = Listing.objects.only('description').filter(query)
# Passing the values() queryset lets the database evaluate the exclusion as
# a subquery instead of first pulling every matching id into Python.
excluded = Listing.objects.only('description').exclude(id__in=queryset.values('id'))

# Take a random sample of each
queryset = random_sample(queryset, 250)
excluded = random_sample(excluded, 100)

# Build input features: one token list per listing, plus a parallel list of
# booleans marking whether that listing came from the filtered group.
all_docs = list()
bools = list()
for in_target_group, listings in ((True, queryset), (False, excluded)):
    for listing in listings:
        # A missing (None) description is treated as empty text rather than
        # crashing the preprocessor.
        cleaned_text = textacy.preprocess_text(
            listing.description or '',
            lowercase=True,
            no_punct=True,
            no_numbers=True)
        # Materialise the tokens (minus stop words) as a real list; a lazy
        # filter object could only be iterated once.
        all_docs.append([w for w in cleaned_text.split() if w not in stop_words])
        # The label is known from the group being iterated, so no membership
        # scan over the sample is required.
        bools.append(in_target_group)

# Error prevention: the comparison needs at least one document in each
# group. Fail early with a readable message instead of an obscure error
# from inside the term-scoring code.
if not any(bools) or all(bools):
    raise ValueError(
        'Need at least one listing in both the filtered and the excluded '
        'sample to compute discriminating terms.')

# Train model and output results. Index [0] keeps the first element of the
# returned pair.
# NOTE(review): presumably that is the term list for the group flagged True
# in bool_array_grp1 -- confirm against the textacy docs.
key_terms = textacy.keyterms.most_discriminating_terms(
    terms_lists=all_docs, bool_array_grp1=bools, top_n_terms=20)[0]

CPU times: user 91.7 ms, sys: 1.21 ms, total: 92.9 ms
Wall time: 111 ms


In [51]:
# Show the string representation of the discriminating terms.
repr(key_terms)

"['lax', 'redondo', 'torrance', 'long', 'miles', 'harbor', 'freeway', 'properties', 'medical', 'gardena', 'hermosa', 'beach', 'south', 'bunk', 'another', 'mattresses', 'student', 'chilling', 'indoors', 'form']"

In [None]:
# Quick experiment for an online word cloud generator: repeat each key term
# (followed by a space) so that earlier-ranked terms appear more often.
# The term at position `rank` is repeated (30 - rank) times.
duplicated_words = [
    (30 - rank) * (term + ' ')
    for rank, term in enumerate(key_terms)
]
print(duplicated_words)

In [82]:
# Generating the doc_term_matrix takes a while.
# A pickled matrix from the full listing set is in ../pickles/

#sample_listings = random_sample(Listing.objects.all(), 5000)
sample_listings = Listing.objects.all()

# Lazily yield one preprocessed textacy Doc per English-language listing.
doc_stream = (
    textacy.Doc(textacy.preprocess_text(
        listing.description,
        lowercase=True,
        no_punct=True,
        no_numbers=True,
        no_contractions=True
    ))
    for listing in sample_listings
    # Skip listings without a description first, so neither language
    # detection nor preprocessing ever receives None.
    if listing.description
    # English only; we don't have other analyzers available
    and textacy.text_utils.detect_language(listing.description) == 'en'
)

corpus = textacy.Corpus('en', docs=doc_stream)

# Build the document-term matrix from each doc's unigrams and named
# entities, with tf-idf weighting. min_df / max_df bound the document
# frequency of the terms that are kept.
# NOTE(review): exact thresholding semantics are defined by
# textacy.vsm.doc_term_matrix -- confirm for the installed version.
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
        for doc in corpus),
    weighting='tfidf', normalize=True, smooth_idf=True, min_df=2, max_df=0.95)



In [96]:
# Fit a 30-topic model on the document-term matrix using non-negative
# matrix factorization ('nmf').
model = textacy.tm.TopicModel('nmf', n_topics=30)
model.fit(doc_term_matrix)

# Print the 12 top terms of every topic, one topic per paragraph.
topic_terms = model.top_topic_terms(id2term, top_n=12)
for topic_idx, top_terms in topic_terms:
    joined_terms = '   '.join(top_terms)
    print('topic', topic_idx, ':', joined_terms, '\n')

topic 0 : guest   stay   space   available   need   day   access   use   time   provide   like   feel 

topic 1 : place   adventurer   solo   traveler   business   couple   will   good   love   close   kid   family 

topic 2 : minute   number   drive   away   lax   walk   downtown   ride   freeway   five minute   la   numbernumber 

topic 3 : venice   kinney   beach   abbot   boardwalk   block   canal   bike   marina   abbott   del   famous 

topic 4 : number   bedroom   bath   block   sleep   sq   ft   foot   car   unit   people   numbernumber 

topic 5 : apartment   bedroom   one   building   locate   spacious   entire   balcony   heart   complex   two   living 

topic 6 : view   ocean   mountain   deck   amazing   malibu   hill   sunset   balcony   canyon   city   spectacular 

topic 7 : hollywood   fame   walk   blvd   chinese   west   sign   theatre   heart   theater   sunset   runyon 

topic 8 : beverly   hill   grove   west   rodeo   drive   city   century   center   ucla   loca

In [97]:
# Pickle the model if the topics are coherent!
# The context manager guarantees the file is flushed and closed once the
# dump completes, rather than whenever the handle is garbage collected.
with open('../pickles/model.p', 'wb') as model_file:
    pickle.dump(model, model_file)