### ETL realizado sobre el archivo `new_business.csv`, que contiene la información del dataset original `business.pkl` de Yelp

In [1]:
import pandas as pd

In [2]:
csv_file_path = "c:/Users/PC/Documents/RawDataPF_Henry/Yelp/new_business.csv"

# Cargar el objeto desde el archivo .csv
df_business_Yelp = pd.read_csv(csv_file_path, sep=";")

In [3]:
df_business_Yelp.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,['ByAppointmentOnly': 'True'],"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,['BusinessAcceptsCreditCards': 'True'],"Shipping Centers, Local Services, Notaries, Ma...","['Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"['BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","['Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"['RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","['Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"['BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","['Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


Analizando el registro se observa:
- la columna `state` contiene información errónea, que no se condice con el resto de la información de ese registro
- se procederá a generar una función que vuelva a poblar ese campo a partir de la información del campo `postal_code`. Para ello se empleará la librería `uszipcode`

In [4]:
from uszipcode import SearchEngine

# Crea una instancia del motor de búsqueda
search = SearchEngine()

# Define la función para buscar el estado basado en un código postal
def get_state_from_postal_code(postal_code):
    try:
        result = search.by_zipcode(postal_code)
        return result.state
    except:
        return None

In [5]:
# Aplica la función a cada fila del DataFrame y crea la nueva columna
df_business_Yelp['state_new'] = df_business_Yelp['postal_code'].apply(get_state_from_postal_code)

In [6]:
# Extrae la columna 'state_new' del DataFrame
state_new_column = df_business_Yelp.pop('state_new')

# Encuentra el índice de la columna 'state'
state_index = df_business_Yelp.columns.get_loc('state')

# Inserta la columna 'state_new' justo después de la columna 'state'
df_business_Yelp.insert(state_index + 1, 'state_new', state_new_column)

In [7]:
df_business_Yelp.head()

Unnamed: 0,business_id,name,address,city,state,state_new,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,CA,93101,34.426679,-119.711197,5.0,7,0,['ByAppointmentOnly': 'True'],"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,MO,63123,38.551126,-90.335695,3.0,15,1,['BusinessAcceptsCreditCards': 'True'],"Shipping Centers, Local Services, Notaries, Ma...","['Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,AZ,85711,32.223236,-110.880452,3.5,22,0,"['BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","['Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,PA,19107,39.955505,-75.155564,4.0,80,1,"['RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","['Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,PA,18054,40.338183,-75.471659,4.5,13,1,"['BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","['Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


Filtro para tener sólo los registros de los estados a analizar: California (CA), Florida (FL) y Nevada (NV)

In [8]:
# Lista de valores permitidos en la columna 'state_new'
allowed_states = ["CA", "FL", "NV"]

# Filtra el DataFrame manteniendo solo las filas con estados permitidos
filtered_df = df_business_Yelp[df_business_Yelp['state_new'].isin(allowed_states)]

In [9]:
# Cuenta la cantidad de entradas para cada estado y crea un diccionario con los resultados
state_counts = filtered_df['state_new'].value_counts().to_dict()

# Imprime el diccionario con la cantidad de entradas por estado
for state, count in state_counts.items():
    print(f"Estado: {state}, Cantidad de Entradas: {count}")
# Imprimir la cantidad de registros en el DataFrame
cantidad_registros = len(filtered_df)
print(f'La cantidad de registros en el DataFrame es: {cantidad_registros}')

Estado: FL, Cantidad de Entradas: 26314
Estado: NV, Cantidad de Entradas: 7707
Estado: CA, Cantidad de Entradas: 5200
La cantidad de registros en el DataFrame es: 39221


Elimino la columna original `state` y renombro la columna que se corrigió (pasa de _sate_new_ a _state_)<br>
Cambio el formato de archivo del campo `postal_code` para que sean valores `int64`

In [10]:
filtered_df.head()

Unnamed: 0,business_id,name,address,city,state,state_new,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,CA,93101,34.426679,-119.711197,5.0,7,0,['ByAppointmentOnly': 'True'],"Doctors, Traditional Chinese Medicine, Naturop...",
7,qkRM_2X51Yqxk3btlwAQIg,Temple Beth-El,400 Pasadena Ave S,St. Petersburg,PA,FL,33707,27.76659,-82.732983,3.5,5,1,,"Synagogues, Religious Organizations","['Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,FL,34639,28.190459,-82.45738,3.5,6,1,"['RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","['Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,MO,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,FL,33755,27.966235,-82.787412,5.0,10,1,['ByAppointmentOnly': 'True'],"General Dentistry, Dentists, Health & Medical,...","['Monday': '7:30-15:30', 'Tuesday': '7:30-15:3..."


In [11]:
# Elimina la columna 'state'
filtered_df.drop(columns=['state'], inplace=True)
filtered_df.rename(columns={'state_new': 'state'}, inplace=True)
filtered_df.loc[:,'postal_code'] = filtered_df['postal_code'].astype('int64')
filtered_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['state'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.rename(columns={'state_new': 'state'}, inplace=True)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,['ByAppointmentOnly': 'True'],"Doctors, Traditional Chinese Medicine, Naturop...",
7,qkRM_2X51Yqxk3btlwAQIg,Temple Beth-El,400 Pasadena Ave S,St. Petersburg,FL,33707,27.76659,-82.732983,3.5,5,1,,"Synagogues, Religious Organizations","['Monday': '9:0-17:0', 'Tuesday': '9:0-17:0', ..."
10,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,1,"['RestaurantsPriceRange2': '2', 'BikeParking':...","Department Stores, Shopping, Fashion","['Monday': '9:30-21:30', 'Tuesday': '9:30-21:3..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
13,jaxMSoInw8Poo3XeMJt8lQ,Adams Dental,15 N Missouri Ave,Clearwater,FL,33755,27.966235,-82.787412,5.0,10,1,['ByAppointmentOnly': 'True'],"General Dentistry, Dentists, Health & Medical,...","['Monday': '7:30-15:30', 'Tuesday': '7:30-15:3..."


Reduzco la cantidad de registros, limitando las entradas a los negocios cuyos campos `categories` se encuentren enumerados en la variable _categorias_filtrado_

In [12]:
categorias_filtrado = ['Grill', 'Pub', 'Deli', 'American', 'Mexican', 'Italian', 'Vietnamese','Ice Cream', 'Brewery', 'Sushi', 'Restaurants', 'Restaurant', 'Hotels', 'Hotel', 'Restaurante', 'Bar', 'Cafe', 'Coffe', 'Pizza', 'Bakery', 'Bakeries', 'Food', 'Diner', 'Bistro']

# Elimino primero las filas con valores NaN en la columna 'categories'
df_business_cleaned = filtered_df.dropna(subset=['categories'])

# Filtro el DataFrame manteniendo sólo las filas que contengan al menos una palabra de categorias_filtrado
df_business_cleaned = df_business_cleaned[df_business_cleaned['categories'].str.contains('|'.join(categorias_filtrado), case=False)]

In [13]:
df_business_cleaned.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
23,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,339,1,"['RestaurantsGoodForGroups': 'True', 'Restaura...","Restaurants, Italian","['Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
54,0qNpTGTcqPwOLi2hADx4Xw,Charlie's Market,2815 E Sligh Ave,Tampa,FL,33610,28.01036,-82.430042,3.0,9,1,"['BusinessParking': ['garage': False, 'street'...","Food, Grocery, Convenience Stores",
58,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,10440 N Dale Mabry Hwy,Tampa,FL,33618,28.046203,-82.505053,4.0,23,0,"['BusinessParking': ['garage': False, 'street'...","Restaurants, American (New), Italian","['Monday': '11:30-21:0', 'Tuesday': '11:30-21:..."


In [14]:
# Cuenta la cantidad de entradas para cada estado y crea un diccionario con los resultados
state_counts = df_business_cleaned['state'].value_counts().to_dict()

# Imprime el diccionario con la cantidad de entradas por estado
for state, count in state_counts.items():
    print(f"Estado: {state}, Cantidad de Entradas: {count}")
# Imprimir la cantidad de registros en el DataFrame
cantidad_registros = len(df_business_cleaned)
print(f'La cantidad de registros en el df_business_cleaned es: {cantidad_registros}')

Estado: FL, Cantidad de Entradas: 12794
Estado: NV, Cantidad de Entradas: 2752
Estado: CA, Cantidad de Entradas: 2024
La cantidad de registros en el df_business_cleaned es: 17570


Este Dataframe lo guardo como `business_final`.

In [15]:
df_business_cleaned.to_csv('c:/Users/PC/Documents/RawDataPF_Henry/business_final.csv')

#### Preparación de los archivos Yelp_shops, Ubicacion_Yelp_shops, Yelp_cateria_negocio, Yelp_categories y Yelp_dia_horario para el diagrama de Entidad Relación
- 1º Genero un nuevo DataFrame Yelp Shops de acuerdo al diagrama de Entidad Relación<br>
Trabajo con los datos recopilados desde Yelp en el archivo `Yelp_shops.csv`<br>
Este DataFrame se llamará df_Yelp_shops y se unirá posteriormente con la misma información proveniente de Google para generar una única tabla en el modelo.

In [16]:
# Esta funcion genera un ID uniendo con un '_' los valores de latitud y longitud, redondeados a 6 dígitos. Luego chequea que no haya duplicados.
# En caso de haber duplicados, asigna un nuevo valor para evitar la duplicación.

def generate_unique_id(df):
    # Genera el campo id_shops combinando latitude y longitude redondeados a 6 dígitos
    df['id_shop'] = df.apply(lambda row: f"{row['latitude']:.6f}_{row['longitude']:.6f}", axis=1)
    
    # Encuentra registros duplicados basados en id_shops
    duplicates = df[df.duplicated(subset=['id_shop'], keep=False)]
    
    # Itera sobre los registros duplicados y asigna nuevos valores de id_shop
    for idx, duplicate_row in duplicates.iterrows():
        current_id = f"{duplicate_row['latitude']:.6f}_{duplicate_row['longitude']:.6f}"
        new_id = current_id
        counter = 1
        while new_id in df['id_shop'].values:
            counter += 1
            new_id = f"{current_id}_{counter}"
        
        df.loc[idx, 'id_shop'] = new_id
    
    return df

In [17]:
# Llamada a la función generate_unique_id para crear el campo id_shops
df_business_cleaned = generate_unique_id(df_business_cleaned)
# Imprimir la cantidad de registros en el DataFrame
cantidad_registros = len(df_business_cleaned)
print(f'La cantidad de registros en df_business_cleaned es: {cantidad_registros}')

La cantidad de registros en df_business_cleaned es: 17570


In [18]:
# Crear el DataFrame df_Yelp_shops con los campos requeridos
df_Yelp_shops = pd.DataFrame()

# Aplico la función generate_unique_id(df) para generar los id_shops

df_Yelp_shops['id_shop'] = df_business_cleaned['id_shop']

# Obtener el nombre (name) directamente del DataFrame df_business
df_Yelp_shops['name'] = df_business_cleaned['name']

# Obtener las estrellas (stars) directamente del DataFrame df_business
df_Yelp_shops['stars'] = df_business_cleaned['stars']

# Genero un nuevo campo 'source' que identifica de dónde proviene la información: 1: Yelp, 2: Google
df_Yelp_shops['source'] = 1

# Obtener el id_shop a partir del business_id
df_Yelp_shops['id_source'] = df_business_cleaned['business_id']

# Mostrar el DataFrame df_Yelp_shops resultante
df_Yelp_shops.head()


Unnamed: 0,id_shop,name,stars,source,id_source
11,27.955269_-82.456320_2,Vietnamese Food Truck,4.0,1,eEOYSgkmpB90uNA7lDOMRA
14,27.916116_-82.760461_2,Zio's Italian Market,4.5,1,0bPLkL0QhhPO5kt1_EXmNQ
23,39.476117_-119.789339,Romano's Macaroni Grill,2.5,1,9OG5YkX1g2GReZM0AskizA
54,28.010360_-82.430042,Charlie's Market,3.0,1,0qNpTGTcqPwOLi2hADx4Xw
58,28.046203_-82.505053,Roman Forum,4.0,1,uI9XODGY_2_ieTE6xJ0myw


In [19]:
df_Yelp_shops.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17570 entries, 11 to 150317
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id_shop    17570 non-null  object 
 1   name       17570 non-null  object 
 2   stars      17570 non-null  float64
 3   source     17570 non-null  int64  
 4   id_source  17570 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 1.3+ MB


In [20]:
# Verifico que no existan valores duplicados en id_shop
duplicates = df_Yelp_shops['id_shop'].duplicated()
total_duplicates = duplicates.sum()

if total_duplicates > 0:
    print(f"El DataFrame tiene {total_duplicates} valores duplicados en la columna 'id_shop'.")
else:
    print("El DataFrame no tiene valores duplicados en la columna 'id_shop'.")


El DataFrame no tiene valores duplicados en la columna 'id_shop'.


In [21]:
# Guardo el nuevo dataframe como '.csv'
df_Yelp_shops.to_csv('c:/Users/PC/Dropbox/HENRY/DATA-FT-12/Lab 03 - PF/DiagramaER/Yelp_shops.csv', index = False, sep=';')

- 2º Genero el df_Ubicacion_Yelp_shops

In [22]:
# Crear el DataFrame df_Ubicacion_Yelp_shops con los campos requeridos
df_Ubicacion_Yelp_shops = pd.DataFrame()

# # Calcular el id_location como un hash code a partir de latitude y longitude
# def hash_location(row):
#     location_str = f"{row['latitude']},{row['longitude']}"
#     return hashlib.sha256(location_str.encode()).hexdigest()

df_Ubicacion_Yelp_shops['id_shop'] = df_business_cleaned['id_shop']

# Obtener el state directamente del DataFrame df_business
df_Ubicacion_Yelp_shops['state'] = df_business_cleaned['state']

# Obtener latitude, longitude, postal_code, city, y address directamente del DataFrame df_business
df_Ubicacion_Yelp_shops['latitude'] = df_business_cleaned['latitude']
df_Ubicacion_Yelp_shops['longitude'] = df_business_cleaned['longitude']
df_Ubicacion_Yelp_shops['postal_code'] = df_business_cleaned['postal_code']
df_Ubicacion_Yelp_shops['city'] = df_business_cleaned['city']
df_Ubicacion_Yelp_shops['address'] = df_business_cleaned['address']

# Mostrar el DataFrame df_Ubicacion_Yelp_shops resultante
df_Ubicacion_Yelp_shops.head()


Unnamed: 0,id_shop,state,latitude,longitude,postal_code,city,address
11,27.955269_-82.456320_2,FL,27.955269,-82.45632,33602,Tampa Bay,
14,27.916116_-82.760461_2,FL,27.916116,-82.760461,33771,Largo,2575 E Bay Dr
23,39.476117_-119.789339,NV,39.476117,-119.789339,89502,Reno,5505 S Virginia St
54,28.010360_-82.430042,FL,28.01036,-82.430042,33610,Tampa,2815 E Sligh Ave
58,28.046203_-82.505053,FL,28.046203,-82.505053,33618,Tampa,10440 N Dale Mabry Hwy


In [23]:
df_Ubicacion_Yelp_shops.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17570 entries, 11 to 150317
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_shop      17570 non-null  object 
 1   state        17570 non-null  object 
 2   latitude     17570 non-null  float64
 3   longitude    17570 non-null  float64
 4   postal_code  17570 non-null  object 
 5   city         17570 non-null  object 
 6   address      17211 non-null  object 
dtypes: float64(2), object(5)
memory usage: 1.6+ MB


In [24]:
# Guardo el nuevo dataframe como '.csv'
df_Ubicacion_Yelp_shops.to_csv('c:/Users/PC/Dropbox/HENRY/DATA-FT-12/Lab 03 - PF/DiagramaER/Ubicacion_Yelp_shops.csv', index = False)

- 3º Genero el df_Yelp_categories

In [25]:
# Divide las filas en el DataFrame según los valores de la columna 'categories'
df_category_business = df_business_cleaned.copy()
df_category_business['categories'] = df_category_business['categories'].str.split(', ')

# Utiliza explode para crear una fila separada para cada valor en la lista de categorías
df_category_business = df_category_business.explode('categories')
df_category_business.head(10)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,id_shop
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Vietnamese,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Restaurants,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food Trucks,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Food,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Delis,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Italian,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Bakeries,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Restaurants,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
23,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,339,1,"['RestaurantsGoodForGroups': 'True', 'Restaura...",Restaurants,"['Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",39.476117_-119.789339


In [26]:
df_category_business.drop(columns=['is_open'], inplace=True)

In [27]:
df_category_business.head(10)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,id_shop
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Vietnamese,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Restaurants,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food Trucks,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Food,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Delis,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Italian,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Bakeries,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Restaurants,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
23,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,339,"['RestaurantsGoodForGroups': 'True', 'Restaura...",Restaurants,"['Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",39.476117_-119.789339


In [28]:
df_category_business.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82557 entries, 11 to 150317
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   82557 non-null  object 
 1   name          82557 non-null  object 
 2   address       80638 non-null  object 
 3   city          82557 non-null  object 
 4   state         82557 non-null  object 
 5   postal_code   82557 non-null  object 
 6   latitude      82557 non-null  float64
 7   longitude     82557 non-null  float64
 8   stars         82557 non-null  float64
 9   review_count  82557 non-null  int64  
 10  attributes    79922 non-null  object 
 11  categories    82557 non-null  object 
 12  hours         74225 non-null  object 
 13  id_shop       82557 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 9.4+ MB


In [29]:
# categories_unique = df_category_business['categories'].str.split(', ', expand=True).stack().unique()
# categories_unique_sorted = sorted(categories_unique)
categories_unique_sorted = sorted(categorias_filtrado)
# Crear un nuevo DataFrame con las categorías únicas ordenadas alfabéticamente
df_categorias = pd.DataFrame({'category': categories_unique_sorted})
# Agregar una columna 'id' como el índice incremental
df_categorias['id_category'] = df_categorias.index + 1
# Mover la columna 'id' al principio
df_categorias = df_categorias[['id_category', 'category']]
# df_unique_categories.info()
df_categorias.head(25)


Unnamed: 0,id_category,category
0,1,American
1,2,Bakeries
2,3,Bakery
3,4,Bar
4,5,Bistro
5,6,Brewery
6,7,Cafe
7,8,Coffe
8,9,Deli
9,10,Diner


In [30]:
df_categorias.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id_category  24 non-null     int64 
 1   category     24 non-null     object
dtypes: int64(1), object(1)
memory usage: 512.0+ bytes


In [30]:
df_categorias.to_csv('c:/Users/PC/Dropbox/HENRY/DATA-FT-12/Lab 03 - PF/DiagramaER/categorias.csv', index = False, sep=';')

In [31]:
# Filtrar el DataFrame df_category_business
df_categoria_negocio = df_category_business[df_category_business['categories'].str.contains('|'.join(categorias_filtrado), case=False)]

# Reiniciar el índice si es necesario
df_categoria_negocio.reset_index(drop=True, inplace=True)
df_categoria_negocio.head(20)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,id_shop
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Vietnamese,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
1,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
2,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Restaurants,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
3,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food Trucks,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
4,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Food,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
5,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Delis,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
6,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Italian,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
7,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Bakeries,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
8,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Restaurants,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
9,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,339,"['RestaurantsGoodForGroups': 'True', 'Restaura...",Restaurants,"['Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",39.476117_-119.789339


In [32]:
df_categoria_negocio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45132 entries, 0 to 45131
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   45132 non-null  object 
 1   name          45132 non-null  object 
 2   address       44254 non-null  object 
 3   city          45132 non-null  object 
 4   state         45132 non-null  object 
 5   postal_code   45132 non-null  object 
 6   latitude      45132 non-null  float64
 7   longitude     45132 non-null  float64
 8   stars         45132 non-null  float64
 9   review_count  45132 non-null  int64  
 10  attributes    43953 non-null  object 
 11  categories    45132 non-null  object 
 12  hours         40087 non-null  object 
 13  id_shop       45132 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 4.8+ MB


In [33]:
# Crear un diccionario que mapea business_id a id_shops en df_business_cleaned
business_id_to_id_shop = df_business_cleaned.set_index('business_id')['id_shop'].to_dict()

# Agregar el campo id_shop a df_categoria_negocio basado en business_id
df_categoria_negocio['id_shop'] = df_categoria_negocio['business_id'].map(business_id_to_id_shop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categoria_negocio['id_shop'] = df_categoria_negocio['business_id'].map(business_id_to_id_shop)


In [34]:
df_categoria_negocio.head(20)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,id_shop
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Vietnamese,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
1,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
2,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Restaurants,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
3,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food Trucks,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
4,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Food,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
5,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Delis,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
6,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Italian,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
7,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Bakeries,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
8,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Restaurants,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
9,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,339,"['RestaurantsGoodForGroups': 'True', 'Restaura...",Restaurants,"['Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",39.476117_-119.789339


In [35]:
# Realiza una fusión (merge) en función de la columna 'category'
df_categoria_negocio = df_categoria_negocio.merge(df_categorias, left_on='categories', right_on='category', how='left')

# Elimina la columna 'category' que ya no es necesaria
df_categoria_negocio.drop(columns=['category'], inplace=True)

In [36]:
df_categoria_negocio.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,id_shop,id_category
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Vietnamese,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2,24.0
1,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2,11.0
2,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Restaurants,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2,22.0
3,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...",Food Trucks,"['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2,
4,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...",Food,"['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2,11.0


In [37]:
columns_to_drop = ['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'attributes', 'categories', 'hours']
df_categoria_negocio.drop(columns=columns_to_drop, inplace=True)
df_categoria_negocio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45132 entries, 0 to 45131
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id_shop      45132 non-null  object 
 1   id_category  23337 non-null  float64
dtypes: float64(1), object(1)
memory usage: 705.3+ KB


In [38]:
# Eliminar filas con NaN en la columna "id_category"
df_categoria_negocio.dropna(subset=['id_category'], inplace=True)

# Convertir la columna "id_category" a tipo int64
df_categoria_negocio['id_category'] = df_categoria_negocio['id_category'].astype('int64')

df_categoria_negocio.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23337 entries, 0 to 45130
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id_shop      23337 non-null  object
 1   id_category  23337 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 547.0+ KB


In [39]:
df_categoria_negocio.head(20)

Unnamed: 0,id_shop,id_category
0,27.955269_-82.456320_2,24
1,27.955269_-82.456320_2,11
2,27.955269_-82.456320_2,22
4,27.916116_-82.760461_2,11
6,27.916116_-82.760461_2,16
7,27.916116_-82.760461_2,2
8,27.916116_-82.760461_2,22
9,39.476117_-119.789339,22
10,39.476117_-119.789339,16
11,28.010360_-82.430042,11


In [40]:
df_categoria_negocio.to_csv('c:/Users/PC/Dropbox/HENRY/DATA-FT-12/Lab 03 - PF/DiagramaER/categoria_negocio_Yelp.csv', index = False, sep=';')

4º Genero el df_dia_horarios

In [41]:
df_business_cleaned.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,id_shop
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"['Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","['Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",27.955269_-82.456320_2
14,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"['OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","['Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...",27.916116_-82.760461_2
23,9OG5YkX1g2GReZM0AskizA,Romano's Macaroni Grill,5505 S Virginia St,Reno,NV,89502,39.476117,-119.789339,2.5,339,1,"['RestaurantsGoodForGroups': 'True', 'Restaura...","Restaurants, Italian","['Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",39.476117_-119.789339
54,0qNpTGTcqPwOLi2hADx4Xw,Charlie's Market,2815 E Sligh Ave,Tampa,FL,33610,28.01036,-82.430042,3.0,9,1,"['BusinessParking': ['garage': False, 'street'...","Food, Grocery, Convenience Stores",,28.010360_-82.430042
58,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,10440 N Dale Mabry Hwy,Tampa,FL,33618,28.046203,-82.505053,4.0,23,0,"['BusinessParking': ['garage': False, 'street'...","Restaurants, American (New), Italian","['Monday': '11:30-21:0', 'Tuesday': '11:30-21:...",28.046203_-82.505053


In [43]:
days = []
open_times = []
close_times = []
business_ids = []

for idx, row in df_business_cleaned.iterrows():
    business_id = row['business_id']
    entry = row['hours']

    if isinstance(entry, str):  # Verificar si entry es una cadena
        entry = entry.replace('[', '').replace(']', '').replace("'", "") # Eliminar caracteres no deseados
        entries = entry.split(', ')
        for e in entries:
            day, time_range = e.split(': ')
            open_time, close_time = time_range.split('-')
            business_ids.append(business_id)
            days.append(day)
            open_times.append(open_time)
            close_times.append(close_time)

df_dia_horarios_Yelp = pd.DataFrame({
    'id_yelp': business_ids,
    'day_name': days,
    'open_time': open_times,
    'close_time': close_times
})

In [44]:
df_dia_horarios_Yelp.head(10)

Unnamed: 0,id_yelp,day_name,open_time,close_time
0,eEOYSgkmpB90uNA7lDOMRA,Monday,11:0,14:0
1,eEOYSgkmpB90uNA7lDOMRA,Tuesday,11:0,14:0
2,eEOYSgkmpB90uNA7lDOMRA,Wednesday,11:0,14:0
3,eEOYSgkmpB90uNA7lDOMRA,Thursday,11:0,14:0
4,eEOYSgkmpB90uNA7lDOMRA,Friday,11:0,14:0
5,eEOYSgkmpB90uNA7lDOMRA,Saturday,5:0,10:0
6,eEOYSgkmpB90uNA7lDOMRA,Sunday,15:0,18:0
7,0bPLkL0QhhPO5kt1_EXmNQ,Monday,10:0,18:0
8,0bPLkL0QhhPO5kt1_EXmNQ,Tuesday,10:0,20:0
9,0bPLkL0QhhPO5kt1_EXmNQ,Wednesday,10:0,20:0


In [45]:
# Función para formatear las horas
def format_hour(hour):
    if pd.notna(hour):
        hour_parts = hour.split(':')  # Divide la cadena en partes
        formatted_hour = f'{hour_parts[0]:0>2}:{hour_parts[1]:0<2}'  # Formato 'XX:00'
        return formatted_hour
    return None

# Aplica la función a las columnas 'open_time' y 'close_time'
df_dia_horarios_Yelp['open_time'] = df_dia_horarios_Yelp['open_time'].apply(format_hour)
df_dia_horarios_Yelp['close_time'] = df_dia_horarios_Yelp['close_time'].apply(format_hour)
df_dia_horarios_Yelp.head(10)

Unnamed: 0,id_yelp,day_name,open_time,close_time
0,eEOYSgkmpB90uNA7lDOMRA,Monday,11:00,14:00
1,eEOYSgkmpB90uNA7lDOMRA,Tuesday,11:00,14:00
2,eEOYSgkmpB90uNA7lDOMRA,Wednesday,11:00,14:00
3,eEOYSgkmpB90uNA7lDOMRA,Thursday,11:00,14:00
4,eEOYSgkmpB90uNA7lDOMRA,Friday,11:00,14:00
5,eEOYSgkmpB90uNA7lDOMRA,Saturday,05:00,10:00
6,eEOYSgkmpB90uNA7lDOMRA,Sunday,15:00,18:00
7,0bPLkL0QhhPO5kt1_EXmNQ,Monday,10:00,18:00
8,0bPLkL0QhhPO5kt1_EXmNQ,Tuesday,10:00,20:00
9,0bPLkL0QhhPO5kt1_EXmNQ,Wednesday,10:00,20:00


In [46]:
df_dia_horarios_Yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98816 entries, 0 to 98815
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id_yelp     98816 non-null  object
 1   day_name    98816 non-null  object
 2   open_time   98816 non-null  object
 3   close_time  98816 non-null  object
dtypes: object(4)
memory usage: 3.0+ MB


In [49]:
# Crear un diccionario que mapea business_id a id_shops en df_business_cleaned
business_id_to_id_shop = df_business_cleaned.set_index('business_id')['id_shop'].to_dict()

# Agregar el campo id_shop a df_dia_horarios_Yelp basado en business_id
df_dia_horarios_Yelp['id_shop'] = df_dia_horarios_Yelp['id_yelp'].map(business_id_to_id_shop)

In [50]:
df_dia_horarios_Yelp.head()

Unnamed: 0,id_yelp,day_name,open_time,close_time,id_shop
0,eEOYSgkmpB90uNA7lDOMRA,Monday,11:00,14:00,27.955269_-82.456320_2
1,eEOYSgkmpB90uNA7lDOMRA,Tuesday,11:00,14:00,27.955269_-82.456320_2
2,eEOYSgkmpB90uNA7lDOMRA,Wednesday,11:00,14:00,27.955269_-82.456320_2
3,eEOYSgkmpB90uNA7lDOMRA,Thursday,11:00,14:00,27.955269_-82.456320_2
4,eEOYSgkmpB90uNA7lDOMRA,Friday,11:00,14:00,27.955269_-82.456320_2


In [51]:
# Guardo el nuevo dataframe como '.csv'
df_dia_horarios_Yelp.to_csv('c:/Users/PC/Dropbox/HENRY/DATA-FT-12/Lab 03 - PF/DiagramaER/dia_horarios_Yelp.csv', index = False)