In [2]:
%%time
import textacy
from django.db.models import F, Expression
from api.filters import random_sample, get_filter_query
from itertools import chain
import pickle

# Load the pre-computed stop-word collection used to filter tokens below.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left for the garbage collector.
# NOTE(review): unpickling executes arbitrary code from the file -- only load
# pickles that were produced by this project.
with open('../pickles/stopwords.p', 'rb') as stopwords_file:
    stop_words = pickle.load(stopwords_file)

CPU times: user 766 ms, sys: 224 ms, total: 990 ms
Wall time: 2.72 s


In [49]:
# Filter specification handed to get_filter_query().
# Each numeric range is (attribute name, lower bound, upper bound); None is
# passed through unchanged as the 'min' / 'max' value.
filters = {
#    'max_sample_size': 5000,
    'numerical_range': [
        {'attribute_name': attribute, 'min': lower, 'max': upper}
        for attribute, lower, upper in (
            ('price', None, None),
            ('accommodates', None, None),
            ('estimated_revenue_per_month', 1, None),
        )
    ],
    'region': {'region_type': 'neighborhood', 'id': 106},
}
query = get_filter_query(filters)

In [104]:
%%time
# Build queryset and inverse queryset.
# The complement is expressed as a subquery (id NOT IN <filtered ids>) so the
# matching ids are not first pulled into Python one by one.
queryset = Listing.objects.only('description').filter(query)
excluded = Listing.objects.only('description').exclude(id__in=queryset.values('id'))

# Take a random sample of each
queryset = random_sample(queryset, 250)
excluded = random_sample(excluded, 100)

# Build input features:
#   all_docs -- one list of tokens per listing, stop words removed
#   bools    -- True when the listing came from the filtered sample
# The group label is taken from the sample being iterated rather than from an
# `in queryset` membership test per listing, which rescanned the whole sample
# for every document.
all_docs = []
bools = []
for in_filtered_group, listings in ((True, queryset), (False, excluded)):
    for listing in listings:
        tokens = textacy.preprocess_text(
            listing.description or '',  # a missing description counts as empty text
            lowercase=True,
            no_punct=True,
            no_numbers=True).split()
        # Materialise the tokens as a list so the document can be iterated
        # more than once (a bare filter() object is a one-shot iterator).
        all_docs.append([w for w in tokens if w not in stop_words])
        bools.append(in_filtered_group)

# Error prevention: the comparison is meaningless unless both groups contain
# at least one listing, so fail with a clear message instead of deep inside
# the model code.
if not any(bools):
    raise ValueError('No listings matched the filters; nothing to compare.')
if all(bools):
    raise ValueError('No listings fall outside the filters; nothing to compare against.')

# Train model and output results
key_terms = textacy.keyterms.most_discriminating_terms(
    terms_lists=all_docs, bool_array_grp1=bools, top_n_terms=20)

CPU times: user 84.3 ms, sys: 6.97 ms, total: 91.3 ms
Wall time: 111 ms


In [105]:
# Show the discriminating terms. As the output below shows, this is a pair of
# term lists (20 terms each, matching top_n_terms=20).
# NOTE(review): presumably the first list belongs to the filtered group
# (bools == True) and the second to the excluded group -- confirm against the
# textacy documentation.
key_terms

(['lax',
  'redondo',
  'torrance',
  'long',
  'miles',
  'harbor',
  'freeway',
  'properties',
  'medical',
  'gardena',
  'hermosa',
  'beach',
  'south',
  'bunk',
  'another',
  'mattresses',
  'student',
  'chilling',
  'indoors',
  'form'],
 ['hills',
  'private',
  'beverly',
  'location',
  'west',
  'guest',
  'unit',
  'heart',
  'grove',
  'king',
  'cable',
  'sunset',
  'shower',
  'fully',
  'building',
  'view',
  'beautiful',
  'brand',
  'throughout',
  'located'])

In [None]:
# Quick experiment: do weighted replication of words
# for online word cloud generator.
#
# key_terms is a pair of term lists (one per group), so the ranking has to be
# applied to the terms inside each list; enumerating key_terms itself yields
# the two lists, and list + ' ' raises TypeError.
# Result: one list of strings per group, where the term at rank i is repeated
# (30 - i) times, and never fewer than once if a list has more than 30 terms.
duplicated_words = [
    [
        (word + ' ') * max(1, 30 - i)
        for i, word
        in enumerate(group_terms)
    ]
    for group_terms in key_terms
]
print(duplicated_words)

In [82]:
"""
Generating the doc_term_matrix takes a while. 
A pickled matrix from the full listing set is in ../pickles/
"""

#sample_listings = random_sample(Listing.objects.all(), 5000)
# Only `description` is read below, so defer every other column (same pattern
# as the key-terms cell) instead of loading full Listing rows.
sample_listings = Listing.objects.only('description')

# Lazily turn each English description into a textacy.Doc of normalised text.
doc_stream = (
    textacy.Doc(textacy.preprocess_text(
        l.description,
        lowercase=True,
        no_punct=True,
        no_numbers=True,
        no_contractions=True
    ))
    for l in sample_listings
    # Skip missing/empty descriptions before language detection, then keep
    # English only; we don't have other analyzers available.
    if l.description
    and textacy.text_utils.detect_language(l.description) == 'en'
)

corpus = textacy.Corpus('en', docs=doc_stream)

# TF-IDF weighted document-term matrix over unigrams and named entities.
# min_df=2 and max_df=0.95 are document-frequency cut-offs for terms.
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
        for doc in corpus),
    weighting='tfidf', normalize=True, smooth_idf=True, min_df=2, max_df=0.95)



In [None]:
# Fit a 30-topic model on the TF-IDF document-term matrix built above.
model = textacy.tm.TopicModel(
    'nmf',  # non-negative matrix factorization method
    n_topics=30,
)
model.fit(doc_term_matrix)



In [97]:
"""
Pickle the model if the topics are coherent!
"""

# Persist the fitted topic model. The `with` block guarantees the file is
# flushed and closed once the dump finishes, so the pickle on disk is complete.
with open('../pickles/model.p', 'wb') as model_file:
    pickle.dump(model, model_file)

In [98]:
# Print the ten highest-ranked terms of every topic, one topic per paragraph.
for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=10):
    pieces = ('topic', topic_idx, ':', '   '.join(top_terms), '\n')
    # print() joins the pieces with single spaces; the trailing '\n' piece
    # plus print's own newline leaves a blank line between topics.
    print(*pieces)

topic 0 : guest   stay   space   available   need   day   access   use   time   provide 

topic 1 : place   adventurer   solo   traveler   business   couple   will   good   love   close 

topic 2 : minute   number   drive   away   lax   walk   downtown   ride   freeway   five minute 

topic 3 : venice   kinney   beach   abbot   boardwalk   block   canal   bike   marina   abbott 

topic 4 : number   bedroom   bath   block   sleep   sq   ft   foot   car   unit 

topic 5 : apartment   bedroom   one   building   locate   spacious   entire   balcony   heart   complex 

topic 6 : view   ocean   mountain   deck   amazing   malibu   hill   sunset   balcony   canyon 

topic 7 : hollywood   fame   walk   blvd   chinese   west   sign   theatre   heart   theater 

topic 8 : beverly   hill   grove   west   rodeo   drive   city   century   center   ucla 

topic 9 : tv   wifi   free   cable   parking   dryer   include   washer   internet   kitchen 

topic 10 : room   share   bathroom   living   kitch

In [101]:
# Listings whose description contains the substring "park". `contains` is a
# plain substring match, so words such as "parking" match as well.
# NOTE(review): `contains` is case-sensitive on most database backends; use
# `icontains` if "Park" should match too.
q=Listing.objects.filter(description__contains="park")

In [103]:
# Show only the ids of the matching listings; flat=True yields bare id values
# rather than 1-tuples. The QuerySet repr truncates long results, as seen in
# the output below.
q.values_list('id', flat=True)

<QuerySet [8652973, 9250229, 10907712, 741734, 6171307, 5616987, 7132466, 8940686, 13310795, 33449, 420329, 3811175, 759229, 3522136, 1526602, 13403554, 8408103, 985865, 4519165, 502172, '...(remaining elements truncated)...']>