# YELP

In [2]:
#Importamos las librerías necesarias
import pandas as pd
import json
import re
import folium
from folium.plugins import HeatMap

## Businesses

In [3]:
#Leemos el archivo .pkl 
df_business=pd.read_pickle('Data/Yelp/business.pkl')

In [4]:
#Observamos el contenido del dataframe
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,...,,,,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,...,,,,,,,,,,
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,...,,,,,,,,,,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,...,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,...,,,,,,,,,,


In [5]:
#Observamos la información del dataframe
df_business.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 28 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   business_id   150346 non-null  object
 1   name          150346 non-null  object
 2   address       150346 non-null  object
 3   city          150346 non-null  object
 4   state         150343 non-null  object
 5   postal_code   150346 non-null  object
 6   latitude      150346 non-null  object
 7   longitude     150346 non-null  object
 8   stars         150346 non-null  object
 9   review_count  150346 non-null  object
 10  is_open       150346 non-null  object
 11  attributes    136602 non-null  object
 12  categories    150243 non-null  object
 13  hours         127123 non-null  object
 14  business_id   5 non-null       object
 15  name          5 non-null       object
 16  address       5 non-null       object
 17  city          5 non-null       object
 18  state         5 non-null     

In [6]:
#Revisaremos los nulos
df_business.isnull().sum()

business_id          0
name                 0
address              0
city                 0
state                3
postal_code          0
latitude             0
longitude            0
stars                0
review_count         0
is_open              0
attributes       13744
categories         103
hours            23223
business_id     150341
name            150341
address         150341
city            150341
state           150341
postal_code     150341
latitude        150341
longitude       150341
stars           150341
review_count    150341
is_open         150341
attributes      150341
categories      150341
hours           150341
dtype: int64

In [7]:
#Eliminamos las columnas que se encuentran vacías y duplicadas, quedandome con la primera (que es la que tiene más datos)
df_business = df_business.loc[:, ~df_business.columns.duplicated(keep='first')]

In [8]:
#Verificamos que se hallan eliminado correctamente
df_business.isnull().sum()

business_id         0
name                0
address             0
city                0
state               3
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      13744
categories        103
hours           23223
dtype: int64

In [9]:
#Identificamos los estados de la columna state
df_business['state'].unique()

array([nan, 'CA', 'MO', 'AZ', 'PA', 'TN', 'FL', 'IN', 'LA', 'AB', 'NV',
       'ID', 'DE', 'IL', 'NJ', 'NC', 'CO', 'WA', 'HI', 'UT', 'TX', 'MT',
       'MI', 'SD', 'XMS', 'MA', 'VI', 'VT'], dtype=object)

In [10]:
#Identificamos los estados faltantes a través de las ciudades, verificamos si pertenecen al estado de Florida
df_business[df_business['state'].isnull()]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."


Ahora seleccionamos la información del estado de Florida. <br>

En una primera instancia se decide filtrar por state=FL, pero se verifica que hay ciudades que no pertenecen al estado de Florida (por error en la columna estado). Por lo tanto, se decide buscar los códigos postales del Estado de Florida, a partir de esta fuente: https://xn--cdigos-postales-vrb.cybo.com/estados-unidos/florida/?p=3 y observamos que los valores van entre 32822 y 34997. Procedemos a filtrar el código postal.

In [11]:
#Primero, verificamos el tipo de dato de la columna postal_code
df_business['postal_code'].info()

<class 'pandas.core.series.Series'>
Index: 150346 entries, 0 to 150345
Series name: postal_code
Non-Null Count   Dtype 
--------------   ----- 
150346 non-null  object
dtypes: object(1)
memory usage: 2.3+ MB


In [12]:
#Cambiamos el tipo de dato de la columna postal_code a string (se prueba pasar a entero pero hay str en la columna para otros estados)
df_business = df_business.copy()
df_business['postal_code']=df_business['postal_code'].astype(str)

In [13]:
#Filtramos el dataframe con el rango de códigos postales para quedarme con los del Estado de Florida.
df_business_florida = df_business[(df_business['postal_code'] >= '32822') & (df_business['postal_code'] <= '34997')]
df_business_florida

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
7,qkRM_2X51Yqxk3btlwAQIg,Temple Beth-El,400 Pasadena Ave S,St. Petersburg,PA,33707,27.76659,-82.732983,3.5,5,1,,"Synagogues, Religious Organizations","{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,1,"{'RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","{'Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,MO,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,33755,27.966235,-82.787412,5.0,10,1,{'ByAppointmentOnly': 'True'},"General Dentistry, Dentists, Health & Medical,...","{'Monday': '7:30-15:30', 'Tuesday': '7:30-15:3..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150280,37G3SzO7RS1qSthACCG5SQ,Tampa Bay Club Sport,380 105th Terrace Ne,Saint Petersburg,TN,33716,27.867754,-82.632602,2.5,7,0,{'GoodForKids': 'True'},"Active Life, Sports Clubs",
150289,Fck8i0fNQCa22ERz5Fa21w,Thoughtful Moving,5004 E Fowler Ave,Tampa,PA,33617,28.054934,-82.400832,2.0,27,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Packing Services, Home Services, Movers, Local...","{'Monday': '22:0-22:30', 'Tuesday': '8:0-19:0'..."
150292,esBGrrmuZzSiECyRBoKvvA,Colony Grill - St. Petersburg,670 Central Ave,St. Petersburg,FL,33701,27.770872,-82.643069,4.5,38,1,"{'RestaurantsPriceRange2': '2', 'RestaurantsAt...","Bars, Beer Bar, Nightlife, Wine Bars, Pizza, R...","{'Monday': '11:30-23:0', 'Tuesday': '11:30-23:..."
150317,Q7JYAMNzI1IpUd2edflmTA,21 Barber,10937 56th St N,Temple Terrace,NJ,33617,28.047632,-82.393519,4.5,18,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Men's Hair Salons, Hair Salons, Barbers, Beaut...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [14]:
#Revisamos las dimensiones del dataframe
df_business_florida.shape

(26322, 14)

Vamos a verificar a partir de los datos de longitud y latitud que todos los registros seleccionamos se encuentran en Florida. 

In [15]:
# Creamos un mapa
m = folium.Map(location=[df_business_florida['latitude'].mean(), df_business_florida['longitude'].mean()], zoom_start=10)

# Creamos la lista de coordenadas
locations = list(zip(df_business_florida['latitude'], df_business_florida['longitude']))

# lo agregamos al mapa de calor
HeatMap(locations).add_to(m)

# Mostrar el mapa
m.save('heatmap_florida.html')  # Guardamos el mapa como archivo HTML
m

Verificamos por el mapa que hay algunos datos que si bien el código postal pertenece a Florida, hay errores en los valores de longitud y latitude, por lo que se procede a eliminar esos 3 valores.
Vemos que la longitud toma los valores: -86.651376442, -86.3266351 y -75.480487.
Por lo tanto elijo quedarme con el rango de valores de Florida que es: -82.5939456 y -82.851044

In [16]:
df_business_florida = df_business_florida[df_business_florida['longitude'] !=-86.651376442]

In [17]:
df_business_florida = df_business_florida[df_business_florida['longitude'] !=-86.3266351]

In [18]:
df_business_florida = df_business_florida[df_business_florida['longitude'] !=-75.480487]

In [19]:
df_business_florida 

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
7,qkRM_2X51Yqxk3btlwAQIg,Temple Beth-El,400 Pasadena Ave S,St. Petersburg,PA,33707,27.76659,-82.732983,3.5,5,1,,"Synagogues, Religious Organizations","{'Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,1,"{'RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","{'Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,MO,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,33755,27.966235,-82.787412,5.0,10,1,{'ByAppointmentOnly': 'True'},"General Dentistry, Dentists, Health & Medical,...","{'Monday': '7:30-15:30', 'Tuesday': '7:30-15:3..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150280,37G3SzO7RS1qSthACCG5SQ,Tampa Bay Club Sport,380 105th Terrace Ne,Saint Petersburg,TN,33716,27.867754,-82.632602,2.5,7,0,{'GoodForKids': 'True'},"Active Life, Sports Clubs",
150289,Fck8i0fNQCa22ERz5Fa21w,Thoughtful Moving,5004 E Fowler Ave,Tampa,PA,33617,28.054934,-82.400832,2.0,27,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Packing Services, Home Services, Movers, Local...","{'Monday': '22:0-22:30', 'Tuesday': '8:0-19:0'..."
150292,esBGrrmuZzSiECyRBoKvvA,Colony Grill - St. Petersburg,670 Central Ave,St. Petersburg,FL,33701,27.770872,-82.643069,4.5,38,1,"{'RestaurantsPriceRange2': '2', 'RestaurantsAt...","Bars, Beer Bar, Nightlife, Wine Bars, Pizza, R...","{'Monday': '11:30-23:0', 'Tuesday': '11:30-23:..."
150317,Q7JYAMNzI1IpUd2edflmTA,21 Barber,10937 56th St N,Temple Terrace,NJ,33617,28.047632,-82.393519,4.5,18,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Men's Hair Salons, Hair Salons, Barbers, Beaut...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


Volvemos a graficar

In [20]:
# Creamos un mapa
m = folium.Map(location=[df_business_florida['latitude'].mean(), df_business_florida['longitude'].mean()], zoom_start=10)

# Creamos la lista de coordenadas
locations = list(zip(df_business_florida['latitude'], df_business_florida['longitude']))

# lo agregamos al mapa de calor
HeatMap(locations).add_to(m)

# Mostrar el mapa
m.save('heatmap_florida.html')  # Guardamos el mapa como archivo HTML
m

In [21]:
#Voy a setear todos los valores de la columna state como FL.
df_business_florida['state'] = 'FL'

In [22]:
#Volvemos a revisar los nulos
df_business_florida.isnull().sum()

business_id        0
name               0
address            0
city               0
state              0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
attributes      2592
categories        21
hours           3445
dtype: int64

In [23]:
#Me interesa conocer la categoría del negocio (restaurantes) por lo que eliminaré los valores donde no tengo información de la categoría
df_business_florida[df_business_florida['categories'].isnull()]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3324,_obl2-rphXvtzP3y_ekV1Q,Certegy Payment Services,11601 Roosevelt Blvd N,Saint Petersburg,FL,33716,27.877463,-82.653546,1.0,7,1,,,
13023,cs7i8-NtrT2P4dMYa2fX-g,Direct USA,"6201 Johns Rd, Ste 12",Tampa,FL,33634,28.007857,-82.549116,1.0,5,1,,,
15060,NpQowTAUYGeylRCAsJx9UA,Skaters Choice,5121 N Armenia Ave,Tampa,FL,33603,27.993196,-82.483378,4.5,8,1,,,
17615,7cEbbI3wjuGSsJUIGdrnzg,Qmar,424 N Pinellas Ave,Tarpon Springs,FL,34689,28.150341,-82.756441,5.0,7,1,,,
38638,hzHCWiKeIGFJfqyeCqyYdg,Sunstar Ambulance,12490 Ulmerton Rd,Largo,FL,33774,27.886142,-82.810522,1.0,5,1,,,
39187,0_uuEqZbJcUl-qsLh_VOiw,East Richey Lawn Mower & Small Engine Repair,6721 Massachusetts Ave,New Port Richey,FL,34653,28.259008,-82.703231,2.0,8,1,,,
42434,cer9BbiI0dySmonxtleEJA,Babcock Healthcare Education Services,2628 5th Ave N,Saint Petersburg,FL,33713,27.777173,-82.66924,2.5,5,1,,,
56288,sTrYR2vw0sF818AljClf5w,Foundational Health Center,7241 Bryan Dairy Rd,Seminole,FL,33777,27.872999,-82.738997,3.5,5,1,,,
60542,w2qqUDq35WmLU00QNVfnkg,King Logistics,6002 Benjamin Rd,Tampa,FL,33634,28.002098,-82.542488,2.0,5,1,,,
63297,GVohCyKOOYZ0L_x9sU_Guw,Greek Unique,"5025 E Fowler Ave, Ste 18",Tampa,FL,33617,28.053639,-82.40048,3.5,5,1,,,


In [24]:
#Eliminamos los nulos de la columna categories
df_business_florida= df_business_florida.dropna(subset=['categories'])

In [25]:
#Revisamos que los valores se hallan eliminado correctamente
df_business_florida.isnull().sum()

business_id        0
name               0
address            0
city               0
state              0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
attributes      2571
categories         0
hours           3424
dtype: int64

In [26]:
#Eliminamos las columnas que no utilizaré para mi análisis
df_business_florida=df_business_florida.drop(columns=['postal_code','is_open','attributes','hours'])

In [27]:
#Verifico la eliminación de las columnas
df_business_florida.head()

Unnamed: 0,business_id,name,address,city,state,latitude,longitude,stars,review_count,categories
7,qkRM_2X51Yqxk3btlwAQIg,Temple Beth-El,400 Pasadena Ave S,St. Petersburg,FL,27.76659,-82.732983,3.5,5,"Synagogues, Religious Organizations"
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,28.190459,-82.45738,3.5,6,"Department Stores, Shopping, Fashion"
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,27.955269,-82.45632,4.0,10,"Vietnamese, Food, Restaurants, Food Trucks"
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,27.966235,-82.787412,5.0,10,"General Dentistry, Dentists, Health & Medical,..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,27.916116,-82.760461,4.5,100,"Food, Delis, Italian, Bakeries, Restaurants"


In [28]:
# Buscar 'restaurant' o 'restaurante' en la columna 'categories'
filtro = df_business_florida['categories'].str.contains('restaurant|restaurante', case=False, regex=True)

In [29]:
# Filtrar el DataFrame para obtener las filas que contienen 'restaurant' o 'restaurante'
df_business_florida = df_business_florida[filtro]

In [30]:
#Verificamos que se haya filtrado correctamente
df_business_florida

Unnamed: 0,business_id,name,address,city,state,latitude,longitude,stars,review_count,categories
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,27.955269,-82.45632,4.0,10,"Vietnamese, Food, Restaurants, Food Trucks"
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,27.916116,-82.760461,4.5,100,"Food, Delis, Italian, Bakeries, Restaurants"
58,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,10440 N Dale Mabry Hwy,Tampa,FL,28.046203,-82.505053,4.0,23,"Restaurants, American (New), Italian"
59,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,2038 N Dale Mabry Hwy,Tampa,FL,27.960514,-82.506127,4.0,35,"Restaurants, Pizza"
79,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,3173 Cypress Ridge Blvd,Wesley Chapel,FL,28.196252,-82.380615,4.5,95,"Burgers, Sports Bars, Bars, Lounges, Restauran..."
...,...,...,...,...,...,...,...,...,...,...
150232,Scd-rcsQCn60t1sHHFv-og,First Watch,"4045 N Tyrone Blvd, Ste 204",St. Petersburg,FL,27.808314,-82.75211,3.5,183,"Cafes, Restaurants, Breakfast & Brunch, Americ..."
150249,8MzF1Tlgz0pOkxmhP5dYzA,El Cap Restaurant,3500 4th St N,St. Petersburg,FL,27.80414,-82.638855,3.5,414,"American (Traditional), Burgers, Restaurants"
150262,-bZQH8yjm7ntTyGeLQwh8Q,Farmer's Kitchen Restaurant,3500 E Bay Dr,Largo,FL,27.916787,-82.750395,4.0,6,"Sandwiches, Restaurants, Diners"
150271,BIyT7Kr7tMJqlfp4oOOYQg,Copper Bell Cafe,11228 Boyette Rd,Riverview,FL,27.853745,-82.316887,3.5,49,"Breakfast & Brunch, Cafes, Restaurants"


In [31]:
#Exportamos a csv
df_business_florida.to_csv('yelp_business_florida.csv', index=False)

## Reviews

In [32]:
#Importamos en un dataframe la información sobre las review
with open("Data/Yelp/review.json",'r', encoding='utf-8') as f:
    review=[json.loads(line) for line in f]
df_review=pd.DataFrame(review)

In [33]:
#Observamos la información de nuestro dataframe
df_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [34]:
#Verificamos las dimensiones de nuestro dataframe
df_review.shape

(6990280, 9)

In [35]:
#Revisamos si tenemos valores nulos
df_review.isnull().sum()

review_id      0
user_id        0
business_id    0
stars          0
useful         0
funny          0
cool           0
text           0
date           0
dtype: int64

In [36]:
#Haremos un merge entre el dataframe de df_business_florida y df_reviews para filtrar las reviews que son del estado de Florida
df_reviews_florida=df_review.merge(df_business_florida, on='business_id', how='inner')

In [37]:
#Chequeamos que se haya realizado correctamente el merge
df_reviews_florida.head()


Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,name,address,city,state,latitude,longitude,stars_y,review_count,categories
0,OAhBYw8IQ6wlfw1owXWRWw,1C2lxzUo1Hyye4RFIXly3g,BVndHaLihEYbr76Z0CMEGw,5.0,0,0,0,"Great place for breakfast! I had the waffle, w...",2014-10-11 16:22:06,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
1,R3TNDNoRUiVfRgvvczy0mg,-Dt5o6GpQcXQfVeWpHNtDg,BVndHaLihEYbr76Z0CMEGw,5.0,1,1,1,We came here based on the recommendation we re...,2018-04-15 19:51:54,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
2,OZpHUjMx5vyK0Hn2Uim_AQ,kiTsCsc_vtGXnzVz738w2g,BVndHaLihEYbr76Z0CMEGw,5.0,0,0,0,We found this place by searching yelp and it d...,2014-11-15 16:49:09,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
3,dmjtUSlyc-3EA1Tv26AWJw,NKFBcrL56W7eHxPXxyPTxA,BVndHaLihEYbr76Z0CMEGw,2.0,0,0,0,"Had supper about 5:00pm, 7/10/17, and was disa...",2017-07-10 21:48:06,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
4,kcV2upXjWLWuJPAt9QICbw,ZuF1R91KH924zJwPTmFi4g,BVndHaLihEYbr76Z0CMEGw,4.0,0,0,0,Good little mom and pop breakfast place. The f...,2017-09-23 13:20:51,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."


In [38]:
#Eliminamos las columnas que no utilizaremos
df_reviews_florida.drop(columns=['useful', 'funny', 'cool', 'address', 'state', 'stars_y'])

Unnamed: 0,review_id,user_id,business_id,stars_x,text,date,name,city,latitude,longitude,review_count,categories
0,OAhBYw8IQ6wlfw1owXWRWw,1C2lxzUo1Hyye4RFIXly3g,BVndHaLihEYbr76Z0CMEGw,5.0,"Great place for breakfast! I had the waffle, w...",2014-10-11 16:22:06,Mamas Kitchen,Tampa,27.884852,-82.506004,162,"Sandwiches, Restaurants, American (New), Ameri..."
1,R3TNDNoRUiVfRgvvczy0mg,-Dt5o6GpQcXQfVeWpHNtDg,BVndHaLihEYbr76Z0CMEGw,5.0,We came here based on the recommendation we re...,2018-04-15 19:51:54,Mamas Kitchen,Tampa,27.884852,-82.506004,162,"Sandwiches, Restaurants, American (New), Ameri..."
2,OZpHUjMx5vyK0Hn2Uim_AQ,kiTsCsc_vtGXnzVz738w2g,BVndHaLihEYbr76Z0CMEGw,5.0,We found this place by searching yelp and it d...,2014-11-15 16:49:09,Mamas Kitchen,Tampa,27.884852,-82.506004,162,"Sandwiches, Restaurants, American (New), Ameri..."
3,dmjtUSlyc-3EA1Tv26AWJw,NKFBcrL56W7eHxPXxyPTxA,BVndHaLihEYbr76Z0CMEGw,2.0,"Had supper about 5:00pm, 7/10/17, and was disa...",2017-07-10 21:48:06,Mamas Kitchen,Tampa,27.884852,-82.506004,162,"Sandwiches, Restaurants, American (New), Ameri..."
4,kcV2upXjWLWuJPAt9QICbw,ZuF1R91KH924zJwPTmFi4g,BVndHaLihEYbr76Z0CMEGw,4.0,Good little mom and pop breakfast place. The f...,2017-09-23 13:20:51,Mamas Kitchen,Tampa,27.884852,-82.506004,162,"Sandwiches, Restaurants, American (New), Ameri..."
...,...,...,...,...,...,...,...,...,...,...,...,...
792112,qDlqRGqY2psedI7VTixwaQ,4eUV_L1SV4UXlKXL4M7cnA,dms2DI0DgFicvwLPlELlKA,5.0,This is a great European Market with fresh veg...,2020-01-25 22:46:30,Kalina's Coffee & European Food,Saint Petersburg,27.830603,-82.645942,9,"Restaurants, Delis, Modern European, Coffee & ..."
792113,WmmYAopcUHQCjKSIkPWD5Q,w-Dy7B6aZbBImMTVqyYJCA,dms2DI0DgFicvwLPlELlKA,5.0,Having lived in St Pete for a few years I like...,2021-12-15 01:59:18,Kalina's Coffee & European Food,Saint Petersburg,27.830603,-82.645942,9,"Restaurants, Delis, Modern European, Coffee & ..."
792114,_H-RZA5e-mLAU3jY-KkpIg,ZNPC_Ul-q_dGrAErZEwVTg,dms2DI0DgFicvwLPlELlKA,4.0,"To set the record straight, this is not a rest...",2018-08-04 16:12:18,Kalina's Coffee & European Food,Saint Petersburg,27.830603,-82.645942,9,"Restaurants, Delis, Modern European, Coffee & ..."
792115,MYEUStZw8Dz0BAWLETgBfg,QUic4Ja79PQ7wU5fHlBDOQ,dms2DI0DgFicvwLPlELlKA,5.0,What a hidden gem! There are all sorts of Euro...,2020-09-20 23:37:02,Kalina's Coffee & European Food,Saint Petersburg,27.830603,-82.645942,9,"Restaurants, Delis, Modern European, Coffee & ..."


In [39]:
df_reviews_florida = df_reviews_florida.rename(columns={'stars_x': 'stars'})

In [54]:
df_reviews_florida['date'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 792117 entries, 0 to 792116
Series name: date
Non-Null Count   Dtype 
--------------   ----- 
792117 non-null  object
dtypes: object(1)
memory usage: 6.0+ MB


In [55]:
df_reviews_florida['date'] = pd.to_datetime(df_reviews_florida['date'])

In [62]:
df_reviews_florida['date'] = df_reviews_florida['date'].dt.date

In [63]:
df_reviews_florida['date'].min()

datetime.date(2005, 7, 14)

In [64]:
df_reviews_florida['date'].max()

datetime.date(2022, 1, 19)

In [65]:
date_min = pd.to_datetime('2017-01-01').date()
date_max = pd.to_datetime('2022-01-19').date()

In [66]:
df_reviews_florida_date = df_reviews_florida[(df_reviews_florida['date']>= date_min) & (df_reviews_florida['date']<= date_max)]

In [77]:
df_reviews_florida_date

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,name,address,city,state,latitude,longitude,stars_y,review_count,categories
1,R3TNDNoRUiVfRgvvczy0mg,-Dt5o6GpQcXQfVeWpHNtDg,BVndHaLihEYbr76Z0CMEGw,5.0,1,1,1,We came here based on the recommendation we re...,2018-04-15,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
3,dmjtUSlyc-3EA1Tv26AWJw,NKFBcrL56W7eHxPXxyPTxA,BVndHaLihEYbr76Z0CMEGw,2.0,0,0,0,"Had supper about 5:00pm, 7/10/17, and was disa...",2017-07-10,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
4,kcV2upXjWLWuJPAt9QICbw,ZuF1R91KH924zJwPTmFi4g,BVndHaLihEYbr76Z0CMEGw,4.0,0,0,0,Good little mom and pop breakfast place. The f...,2017-09-23,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
8,7VA-MRcyxibjlHexm_hTyw,gvkHURdz5M-bK7XdIbKDgQ,BVndHaLihEYbr76Z0CMEGw,5.0,0,0,0,Great food and comfortable atmosphere and supe...,2018-09-15,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
15,x41uAy2kh0xWyVGQ4PrEew,pk-iV4rlRNvZBQuk_79_og,BVndHaLihEYbr76Z0CMEGw,5.0,0,0,0,"Mamas Kitchen is just like what the name says,...",2018-06-12,Mamas Kitchen,5524 S Dale Mabry Hwy,Tampa,FL,27.884852,-82.506004,4.5,162,"Sandwiches, Restaurants, American (New), Ameri..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792112,qDlqRGqY2psedI7VTixwaQ,4eUV_L1SV4UXlKXL4M7cnA,dms2DI0DgFicvwLPlELlKA,5.0,2,1,2,This is a great European Market with fresh veg...,2020-01-25,Kalina's Coffee & European Food,6393 Dr Martin Luther King Jr St N,Saint Petersburg,FL,27.830603,-82.645942,4.5,9,"Restaurants, Delis, Modern European, Coffee & ..."
792113,WmmYAopcUHQCjKSIkPWD5Q,w-Dy7B6aZbBImMTVqyYJCA,dms2DI0DgFicvwLPlELlKA,5.0,0,0,0,Having lived in St Pete for a few years I like...,2021-12-15,Kalina's Coffee & European Food,6393 Dr Martin Luther King Jr St N,Saint Petersburg,FL,27.830603,-82.645942,4.5,9,"Restaurants, Delis, Modern European, Coffee & ..."
792114,_H-RZA5e-mLAU3jY-KkpIg,ZNPC_Ul-q_dGrAErZEwVTg,dms2DI0DgFicvwLPlELlKA,4.0,5,1,1,"To set the record straight, this is not a rest...",2018-08-04,Kalina's Coffee & European Food,6393 Dr Martin Luther King Jr St N,Saint Petersburg,FL,27.830603,-82.645942,4.5,9,"Restaurants, Delis, Modern European, Coffee & ..."
792115,MYEUStZw8Dz0BAWLETgBfg,QUic4Ja79PQ7wU5fHlBDOQ,dms2DI0DgFicvwLPlELlKA,5.0,6,3,4,What a hidden gem! There are all sorts of Euro...,2020-09-20,Kalina's Coffee & European Food,6393 Dr Martin Luther King Jr St N,Saint Petersburg,FL,27.830603,-82.645942,4.5,9,"Restaurants, Delis, Modern European, Coffee & ..."


In [68]:
#Exportamos a csv
df_reviews_florida_date.to_csv('yelp_reviews_florida.csv', index=False)

## Users

In [69]:
#Leemos la información de los usuarios del .parquet
df_user=pd.read_parquet('Data/Yelp/user.parquet')

In [70]:
#Observamos los datos que tenemos en el dataframe
df_user.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


In [71]:
#Verificamos las dimensiones del dataframe
df_user.shape

(2105597, 22)

In [72]:
#Revisamos si hay valores nulos
df_user.isnull().sum()

user_id               0
name                  0
review_count          0
yelping_since         0
useful                0
funny                 0
cool                  0
elite                 0
friends               0
fans                  0
average_stars         0
compliment_hot        0
compliment_more       0
compliment_profile    0
compliment_cute       0
compliment_list       0
compliment_note       0
compliment_plain      0
compliment_cool       0
compliment_funny      0
compliment_writer     0
compliment_photos     0
dtype: int64

In [73]:
#Selecciono las columnas de interés
df_user=df_user[['user_id', 'name', 'review_count', 'yelping_since', 'average_stars']]

In [74]:
#Revisamos la correcta seleccion de las columnas
df_user.head()

Unnamed: 0,user_id,name,review_count,yelping_since,average_stars
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,3.91
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,3.74
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,3.32
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,4.27
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,3.54


In [75]:
#Filtro el df de usuarios, con las reviews que son de Florida, para quedarme con un dataframe con solo los usuarios de Florida
df_users_florida = df_user[df_user['user_id'].isin(df_reviews_florida_date['user_id'])]

In [76]:
#Verificamos
df_users_florida

Unnamed: 0,user_id,name,review_count,yelping_since,average_stars
36,rppTTi-kfF8-qyiArNemag,Helen,460,2006-01-24 14:33:32,3.33
69,q7iWal_rXNSHkHeCMMvZxQ,Stephanie,500,2011-06-01 21:02:59,3.95
83,K7thO1n-vZ9PFYiC7nTR2w,Yelper,1554,2007-12-26 23:05:41,3.68
85,asAdx4Q3cAMykgPgtQt6cg,Sylvester,123,2010-08-31 13:25:09,4.20
142,2Od6rQYNvPUXQC2Go7vIqg,Catherine,986,2007-08-22 17:13:14,3.63
...,...,...,...,...,...
2105563,Fdojx8V99xKPZrZ4C_9X6Q,Mel,11,2013-04-18 17:18:02,4.45
2105575,2InIg5itoYPYkWIe4s5hRw,Mercy's,29,2015-07-19 22:13:56,3.31
2105580,DH-aSZOpX0jn3eVpCVokng,Ronny,65,2016-07-26 03:20:05,3.35
2105581,EcT7gTi5S3dNoWMshMXS2A,Kristin,22,2015-07-31 13:41:36,4.46


In [78]:
#Exportamos a csv
df_users_florida.to_csv('yelp_users_florida.csv', index=False)