In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import geopandas as gpd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
%matplotlib inline 


In [48]:
reviews = pd.read_csv('data/yelp.csv', index_col=0)



In [49]:
#this column is empty 
reviews.drop('business_neighborhoods', axis=1, inplace=True)

In [50]:
#i want to only deal with restaurants 
reviews = reviews[reviews['business_categories'].str.contains('Restaurant') == True]

In [51]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158430 entries, 0 to 229905
Data columns (total 30 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   business_blank          158430 non-null  bool   
 1   business_categories     158430 non-null  object 
 2   business_city           158430 non-null  object 
 3   business_full_address   158430 non-null  object 
 4   business_id             158430 non-null  object 
 5   business_latitude       158430 non-null  float64
 6   business_longitude      158430 non-null  float64
 7   business_name           158430 non-null  object 
 8   business_open           158430 non-null  bool   
 9   business_review_count   158430 non-null  int64  
 10  business_stars          158430 non-null  float64
 11  business_state          158430 non-null  object 
 12  business_type           158430 non-null  object 
 13  cool                    158430 non-null  int64  
 14  date                

In [52]:
#there are six entries with no text review, going to drop 
no_rev = reviews[pd.isna(res['text']) == True].index 
reviews.drop(labels=no_rev, axis=0, inplace=True)

In [53]:
reviews = reviews.reset_index().drop('index', axis=1)

In [54]:
reviews.head()

Unnamed: 0,business_blank,business_categories,business_city,business_full_address,business_id,business_latitude,business_longitude,business_name,business_open,business_review_count,...,reviewer_funny,reviewer_name,reviewer_review_count,reviewer_type,reviewer_useful,stars,text,type,useful,user_id
0,False,Breakfast & Brunch; Restaurants,Phoenix,"6106 S 32nd St\nPhoenix, AZ 85042",9yKzy9PApeiPPOUJEtnvkg,33.390792,-112.012504,Morning Glory Cafe,True,116,...,331,Jason,376,user,1034,5,My wife took me here on my birthday for breakf...,review,5,rLtl8ZkDX5vH5nAx9C3q5Q
1,False,Italian; Pizza; Restaurants,Phoenix,"4848 E Chandler Blvd\nPhoenix, AZ 85044",ZRJwVLyzEJq1VAihDhYiow,33.305607,-111.978758,Spinato's Pizzeria,True,102,...,2,Paul,2,user,0,5,I have no idea why some people give bad review...,review,0,0a2KyEL0d3Yb1V6aivbIuQ
2,False,Middle Eastern; Restaurants,Tempe,"1513 E Apache Blvd\nTempe, AZ 85281",6oRAC4uyJCsJl1X0WZpVSA,33.414345,-111.913031,Haji-Baba,True,265,...,0,Nicole,3,user,3,4,love the gyro plate. Rice is so good and I als...,review,1,0hT2KtfLiobPvh6cDC8JQg
3,False,Wine Bars; Bars; American (New); Nightlife; Re...,Phoenix,"6106 S 32nd St\nPhoenix, AZ 85042",-yxfBYGB6SEqszmxJxd97A,33.390792,-112.012504,Quiessence Restaurant,True,109,...,743,Deborah,654,user,1584,4,"Quiessence is, simply put, beautiful. Full wi...",review,3,sqYN3lNgvPbPCTRsMFu27g
4,False,Mexican; Restaurants,Phoenix,"1919 N 16th St\nPhoenix, AZ 85006",zp713qNhx8d9KCJJnrw1xA,33.469132,-112.047512,La Condesa Gourmet Taco Shop,True,307,...,1187,Monique,295,user,1376,5,Drop what you're doing and drive here. After I...,review,7,wFweIWhv2fREZV_dYkz_1g


In [55]:
vectorizer = TfidfVectorizer(stop_words='english')
vectorized = vectorizer.fit_transform(reviews.text)
        

In [56]:
len(vectorized.toarray()[0])

89334

In [23]:
len(vectorized.toarray())

158424

In [57]:
vectorizer.get_feature_names()
#doesn't look right...i may need to clean up and lemmatize text manually before vectorizing

['00',
 '000',
 '00000001',
 '0005',
 '000mg',
 '000s',
 '000th',
 '000x',
 '0010to',
 '001cc4c03286',
 '003',
 '00400000019838',
 '007',
 '00a',
 '00am',
 '00ish',
 '00p',
 '00pm',
 '00s',
 '01',
 '01pm',
 '02',
 '0228tr',
 '02pm',
 '03',
 '03342',
 '0399',
 '03am',
 '04',
 '040',
 '0468',
 '04may2012',
 '04pm',
 '05',
 '050d_xior1npcuwkbivaq',
 '05766',
 '05am',
 '05p',
 '05pm',
 '06',
 '0600',
 '0613phoavina',
 '0630',
 '07',
 '0730',
 '0745',
 '08',
 '0816',
 '08pm',
 '09',
 '0911mexdiningcheap',
 '0920',
 '095468391',
 '09a',
 '09pm',
 '09rhdgad99fo13b3n7l_ea',
 '0_0',
 '0_o',
 '0a',
 '0buxoc0crqjpvkezo3bqog',
 '0cphorlomtosqw1erxb6ta',
 '0evnrlfd3apsqzt0tnaabq',
 '0fncsakdjopoaywsfc8x5w',
 '0gxylvpnwz0wt8wxqvps0g',
 '0hm9bfms7oiuo2zp98tl1w',
 '0l6q',
 '0lcjnbhwgxs6zmtp_137ea',
 '0mzuic2rg7fhoswuzwsqsg',
 '0o4t4yi1gd10so4wnhcera',
 '0tfejh3dynwqgckjidbtuw',
 '0tond6okdew3p7u6osq8ew',
 '0tzg',
 '0x',
 '0yg5wa2eyyqsw',
 '0yymd5auwms',
 '0z',
 '10',
 '100',
 '1000',
 '10000',
 '10000

In [58]:
lem = WordNetLemmatizer()
stopwords = stopwords.words('english')

AttributeError: 'list' object has no attribute 'words'

In [64]:
texts = []
for rev in reviews['text'].values:
    review = [lem.lemmatize(word).lower() for word in rev.split(' ') \
              if word not in stopwords and word.isalpha()==True]
    texts.append(" ".join(review))

'my wife took birthday breakfast the weather perfect made sitting outside overlooking ground absolute our waitress excellent food arrived quickly saturday it looked like place fill pretty quickly earlier get favor get bloody it phenomenal simply best ever pretty sure use ingredient garden blend fresh order it everything menu look i white truffle scrambled egg vegetable skillet tasty it came piece griddled bread amazing absolutely made meal it best ever i wait go'