In [72]:
from __future__ import print_function
import numpy as np
import csv
import warnings
import nltk
import re
from bs4 import UnicodeDammit
warnings.filterwarnings("ignore") #just for some sklearn stuff.

In [83]:
words = set(nltk.corpus.words.words())


In [85]:
with open('/Users/talhabaig/cs4300sp2017-urban-sentiment/jsons/nyc_combination.csv', 'rb') as f:
    reader = csv.reader(f)
    ids = []
    docs = []
    neighborhoods = []
    for row in reader:
        ids.append(row[1])
        neighborhoods.append(row[2])
        doc = " ".join(w for w in nltk.wordpunct_tokenize(row[3]) if w.lower() in words or not w.isalpha())
        re.sub('\\.', '', doc)
        doc = doc + " the "; #smoothing
        dammit = UnicodeDammit(doc)
        docs.append(dammit.unicode_markup)



In [54]:
class Listing:

    def __init__(self, listing_id, neighborhood):
        self.id = listing_id
        self.neighborhood = neighborhood
        self.review = ""
    
    def addReview(self, review):
        self.review = self.review + " " + review
        

In [55]:
id_to_reviews = {}
for i in range(len(ids)):
    if ids[i] in id_to_reviews:
        id_to_reviews[ids[i]].addReview(docs[i])
    else:
        id_to_reviews[ids[i]] = Listing(ids[i], neighborhoods[i])

In [56]:
print(docs[:3])
print(neighborhoods[:10])

[u' the ', u'This was my first experience . I know I will do this again . I . It was exactly what I . And what a great neighborhood experience . Not only did Cottage have everything I , I being able to walk out in front of the home and see city and ! City Island is a real gem . I being able to walk up to and the market . Vegetarian , gluten free , no problem ! I mean everything I was right there . Nice liquor store also -- small but adequate . was more than patient as I an approval for reimbursement , holding the place for me even when she had another interested party . I even into the city via public transportation . ... always a colorful experience . really made this place welcoming and functional . the ', u'We had absolutely LOVELY stay at the Cozy Cottage . It was my first time in New York and I could not have chosen better place . Very bright , comfortable , cozy , clean and homey in a wonderful and beautiful location . Our host was super helpful and responsive . I would love to s

In [57]:
#ignore 'docs' and 'neighborhoods
docs = docs[1:]
neighborhoods= neighborhoods[1:]

In [58]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
cv = CountVectorizer(stop_words = 'english', max_df=.7, min_df=50,
                     max_features=5000)
counts = cv.fit_transform(docs)

In [61]:
print(counts.shape)

(632087, 5000)


In [60]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [62]:
n_topic = 10
model = LDA(n_topics=n_topic, max_iter=10, n_jobs=1, verbose=1)
res = model.fit_transform(counts)

In [63]:
feature_names = cv.get_feature_names()
print_top_words(model, feature_names, 20)

Topic #0:
apartment walking street good building distance bit night little times bed need square location space easy floor kitchen quiet local
Topic #1:
flexible loft village needs east thing exploring incredibly totally extremely pictured base thoughtful waiting half accommodating affordable meeting public gracious
Topic #2:
la el en es para lo se las bien al di estã si excelente barrio ha non era sin ben
Topic #3:
die ist wir war care man das life es hat sofa fact fã¼r super den nã eric um relaxed ich
Topic #4:
great stay apartment place recommend host really location new clean perfect york definitely comfortable time highly wonderful amazing helpful experience
Topic #5:
home time like felt stay feel day make thank check sure late experience did welcome family went night meet way
Topic #6:
room really walk nice subway just bathroom place away people bed bedroom kitchen house station train stayed living lot 10
Topic #7:
great place stay location apartment clean nice good host easy sub

In [73]:
print(model.components_.shape)

(10, 5000)


In [74]:
query = "young hip"
vec = cv.transform([query])
model.transform(vec)

array([[ 0.03334309,  0.03333333,  0.03333333,  0.03333333,  0.03333333,
         0.03334009,  0.36665013,  0.03333333,  0.36666669,  0.03333333]])

In [75]:
topics = [np.argmax(model.transform(doc)) for doc in counts]

In [82]:
neighborhoods

['neighbourhood']

In [76]:
from collections import defaultdict

In [87]:
topic_to_neighborhood = defaultdict(set)
for i in range(len(topics)):
    topic_to_neighborhood[topics[i]].add(neighborhoods[i])

In [88]:
print(topic_to_neighborhood[6])

set(['Ozone Park', 'Greenpoint', 'Financial District', 'Kensington', 'Mount Eden', 'Ridgewood', 'Cambria Heights', 'Concourse', 'South Ozone Park', 'Glen Oaks', 'Port Richmond', 'Cobble Hill', 'Inwood', 'Bay Terrace', 'Allerton', 'West Farms', 'Rego Park', 'Queens Village', 'Prospect-Lefferts Gardens', 'Rockaway Beach', 'Gramercy', 'Chelsea', 'Stapleton', 'North Riverdale', 'Astoria', 'Todt Hill', 'Clinton Hill', 'Gravesend', 'University Heights', 'SoHo', 'Port Morris', 'Great Kills', 'Westchester Square', 'Chinatown', 'Wakefield', 'Corona', 'West Village', 'East Elmhurst', 'Downtown Brooklyn', 'Dyker Heights', 'Upper West Side', 'Harlem', 'Bay Ridge', 'Two Bridges', 'Jamaica', 'Howard Beach', 'Highbridge', 'Brooklyn Heights', 'Belmont', 'Long Island City', 'Edenwald', 'East Village', 'Hunts Point', 'South Slope', 'Schuylerville', 'Lower East Side', 'NoHo', 'Bayside', 'Baychester', 'Pelham Bay', 'Upper East Side', 'Mariners Harbor', 'Fort Greene', 'Woodlawn', 'East New York', 'Melrose'

In [None]:
[print(len(topic_to_neighborhood[i])) for i in topic_to_neighborhood]

In [None]:
#topic_matrix
#word_to_top_index
#topic_to_neighborhood DONE

In [None]:
#topic matrix is one line of code
topic_matrix = model.components_

In [93]:
#all_words = set([w for w in doc.split() for doc in counts])
i = 0
all_words = set()
for doc in docs:
    for w in doc.split(" "):
        all_words.add(w)
    if i==1000:
        i = 0;
        print("1000 done")

In [95]:
print(all_words)



In [102]:
word_to_top_index = {} #{word:np.argmax(model.transform(word)) for word in all_words}
x = list(all_words)[:10]
model.transform(x[3])
for word in all_words:
    vec = model.transform(word)
    ind = np.argmax(word);
    word_to_top_index[word] = ind;

AttributeError: 'bool' object has no attribute 'any'

In [None]:
with open('neighborhood_to_listing_ids.json', 'w') as fp:
        json.dump(neighborhood_to_listing_ids, fp)

In [None]:
with open('neighborhood_to_listing_ids.json', 'w') as fp:
        json.dump(neighborhood_to_listing_ids, fp)

In [None]:
json.dump(tfidf_compressed, open('airbnb_tfidf_compressed.json', 'w'), cls=NumpyEncoder)