In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb: 3000"

import torch

# Report the CUDA setup seen by PyTorch.
print('Cuda ststus: %s' % str(torch.cuda.is_available()))
print('Divices available: %d' % torch.cuda.device_count())
# get_device_name() raises when no CUDA device is usable, which would abort
# the very first cell on a CPU-only machine; only query it when CUDA is there.
if torch.cuda.is_available():
    print('Device name: %s' % torch.cuda.get_device_name())

Cuda ststus: True
Divices available: 1
Device name: NVIDIA GeForce GTX 1650


### Spark Session

In [2]:
import findspark
# Raw string: '\s' is not a valid escape sequence and newer Python versions
# warn about it; the resulting path value is unchanged.
findspark.init(r'C:\spark')

from pyspark.sql import SparkSession

# Session shared by every Spark cell below.
spark = (SparkSession.builder
         .appName('Reviews Session')
         .config('spark.executor.memory', '4g')
         .config('spark.executor.cores', '2')
         .config('spark.driver.memory', '2g')
         .getOrCreate())

spark

### Getting unzipped files from other file path

In [3]:
from file_handling_tools import FileHandling

# Decompression of the zipped source data.
# input_directory is where the zip files are; output_directory is where the
# unzipped files will be stored.
# NOTE(review): both paths are absolute and machine-specific; adjust them
# before running on another computer.

input_directory = r"C:\Users\jdieg\Desktop\henry\proyectos\FINAL\data\google_maps"
output_directory = r"C:\Users\jdieg\Desktop\data_camp\projects\restaurant_reviews\data\google"

# file_handling = FileHandling(input_directory, output_directory)

# # Execute the decompression process (Execute only for the first time)
# file_handling.unzip_folders()

### Have a look at Data Files.
On:
* metadata-sitios (json files)
* reviews (json files)

The goal is to learn the field names, data types, and overall structure.

#### metadata-sitios (json file)

In [4]:
# Load every metadata JSON file matched by the glob into a single DataFrame
metadata_df = spark.read.format("json").load("data/google/*00[2-3]/*/metadata*/*.json")

In [5]:
# Number of rows loaded from the metadata files
metadata_df.count()

3025011

In [6]:
# Print the schema to get a look at the structure of the DataFrame
metadata_df.printSchema()

root
 |-- MISC: struct (nullable = true)
 |    |-- Accessibility: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Activities: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Amenities: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Atmosphere: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Crowd: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Dining options: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- From the business: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Getting here: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Health & safety: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Health and safety: array (nullable = true)


It seems there are a few fields of struct type, in other words nested data. These can hold anything: a simple list, a dictionary, or even a dictionary with nested dictionaries. Let's leave them for later.

It would also be nice to have a cleaner view of the nested columns. I created a simple class in the `info_spark_dataset` library that prints only the column names, as a bulleted list.

In [7]:
# from info_spark_dataset import InfoDataSet
# info_data = InfoDataSet(metadata_df._jdf, metadata_df.sparkSession)

Data Dictionary for metadata

Columns:
	
1. MISC: Nested column, left for later.
2. address: 
3. avg_rating
4. category
5. description
6. gmap_id
7. hours
8. latitude
9. longitude
10. name
11. num_of_reviews
12. price
13. relative_results
14. state
15. url

Since we are only interested in restaurants, let's begin by keeping only the places that are restaurants; this should reduce the dataset.

### Category

In [8]:
# Show the first 100 values of `category` without truncating them
metadata_df.select('category').show(100, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category                                                                                                                                                                  |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[Pharmacy]                                                                                                                                                                |
|[General store, ATM]                                                                                                                                                      |
|[Pet groomer]                                                                                                                         

There are multiple categories that are not necessarily restaurants. There are many ways to approach this, but here I will keep every row whose category list contains the word "Restaurant" at least once.

To get this done:
* Create a User Defined Function:
    * That counts the occurrences of the word "restaurant" in a row's categories
* Create a new column holding that count
* Keep only the rows whose count is greater than 0

In [9]:

import pyspark.sql.functions as F
from pyspark.sql.types import *


def filter_category(target_cat, categories):
    """
    Count the categories that contain the target as a whole word.

    Each category is split on whitespace and the target is compared,
    case-insensitively, against the resulting words. The match is therefore
    word-level: 'Restaurant' matches 'Italian Restaurant' but not
    'Restaurants' or 'Restaurant/bar'.

    Parameters:
    - target_cat (str): The word to look for. ``None`` yields 0.
    - categories (list of str): A list of categories to search within.
      ``None`` (a missing array) yields 0, and ``None`` entries inside the
      list are skipped instead of raising.

    Returns:
    - int: The number of categories containing the target word.

    Example:
    ```python
    target_category = 'Restaurant'
    categories = ['Fast Food Restaurant', 'Italian Restaurant', 'Café']
    count = filter_category(target_category, categories)
    print(count)  # Output: 2
    ```
    """
    # Nothing to count when either side is missing
    if categories is None or target_cat is None:
        return 0
    # Lower-case the target once instead of once per category
    target = target_cat.lower()
    return sum(
        1
        for category in categories
        # Array elements may be null; skip them rather than crash the UDF
        if category is not None
        and target in (word.lower() for word in category.split())
    )


# Expose filter_category to Spark as a UDF returning an integer
udf_FilterCategory = F.udf(filter_category, IntegerType())

# Add 'cat_count_temp': how many of the row's categories match 'Restaurant'
metadata_df = metadata_df.withColumn(
    'cat_count_temp',
    udf_FilterCategory(F.lit('Restaurant'), F.col('category')),
)
# Keep only the rows with at least one match
output_dataframe = metadata_df.where(F.col('cat_count_temp').cast('int') > 0)


In [10]:
# Restaurants only: rows whose category list matched at least once
restaurants_metadata = metadata_df.where(F.col('cat_count_temp').cast('int') > 0)
restaurants_metadata.count()

212008

### Finally, our dataset containing only Restaurant as category.

In [11]:
# Drop the helper column by name. Unlike dropping through a Column reference
# (which fails to resolve once the column is gone), this is a no-op when the
# column is already missing, so the cell can safely be re-run.
restaurants_metadata = restaurants_metadata.drop('cat_count_temp')

In [12]:
# restaurants_metadata.count(), len(restaurants_metadata.columns) --> (212008, 15)

In [13]:
# Preview the restaurants-only metadata
restaurants_metadata.show()

+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+--------------+-----+--------------------+--------------------+--------------------+
|                MISC|             address|avg_rating|            category|         description|             gmap_id|               hours|          latitude|         longitude|                name|num_of_reviews|price|    relative_results|               state|                 url|
+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+------------------+------------------+--------------------+--------------+-----+--------------------+--------------------+--------------------+
|{[Wheelchair acce...|The Jambalaya Sho...|       4.0|[Cajun restaurant...|Casual eatery kno...|0x862722f48f2b47e...|[[Wednesday, 10:3...|        30.50155

In [14]:
# Keep only the place identifiers; they are the key used to join the reviews
google_ids = restaurants_metadata.select('gmap_id')

In [15]:
# Rename the key so it does not clash with the reviews' own 'gmap_id' column in the join
google_ids = google_ids.withColumnRenamed('gmap_id','gmap_id_1')

### Extracting reviews from Google Json Files

In [16]:
# Read every review file found under the data folder.
# PySpark accepts glob patterns in paths, so wildcards can stand in for the
# intermediate directories and match each .json file wherever it is.
google_df = spark.read.json("data/google/*/*/reviews-estados/review-*/*.json")

In [17]:
# Row count and column names of the raw reviews
google_df.count(), google_df.columns

(89946359,
 ['gmap_id', 'name', 'pics', 'rating', 'resp', 'text', 'time', 'user_id'])

In [18]:
# Column names paired with their Spark data types
google_df.dtypes

[('gmap_id', 'string'),
 ('name', 'string'),
 ('pics', 'array<struct<url:array<string>>>'),
 ('rating', 'bigint'),
 ('resp', 'struct<text:string,time:bigint>'),
 ('text', 'string'),
 ('time', 'bigint'),
 ('user_id', 'string')]

To keep only the reviews that belong to restaurants, we take the restaurants data frame built above and join it to the reviews on the `gmap_id` field, so that only reviews whose place appears in the restaurants data frame remain.

In [135]:
# For better memory usage, broadcast the smaller dataframe so the large
# reviews table does not have to be shuffled for the join.
from pyspark.sql.functions import broadcast

# An inner join is used rather than a right join: Spark cannot broadcast the
# preserved side of an outer join, so with how='right' the broadcast hint on
# google_ids is not honoured. The only rows a right join would add are
# restaurants without any review (all review fields null), and those are
# removed by the null-text filter applied a few cells below.
restaurant_reviews = google_df.join(
    # Broadcast the smaller dataframe
    broadcast(google_ids),
    # Join on gmap_id
    google_df['gmap_id'] == google_ids['gmap_id_1'],
    # Keep only reviews whose place is in google_ids
    how='inner')

In [136]:
# Drop the columns that are not needed downstream. Passing names instead of
# Column references makes this a no-op for columns that are already gone, so
# re-running the cell does not fail.
restaurant_reviews = restaurant_reviews.drop('pics', 'resp', 'gmap_id_1')


In [137]:
# restaurant_reviews.show(20)

In [138]:
# Keep only the reviews that actually have text
text_filter = restaurant_reviews['text'].isNotNull()
restaurant_reviews = restaurant_reviews.filter(text_filter)

In [139]:
# restaurant_reviews.count()
# this counting should compute 7093988

Let's get rid of duplicates so the table is free of unnecessary rows.

In [140]:
# Remove rows that are exact duplicates across all columns
restaurant_reviews = restaurant_reviews.dropDuplicates()
# restaurant_reviews.count()

Let's convert the date field from Unix time to a date format. It will be used for partitioning.

In [141]:
from pyspark.sql.functions import from_unixtime, year

# 'time' is divided by 1000 before conversion (presumably epoch milliseconds,
# since from_unixtime works in seconds); the raw column is dropped afterwards.
restaurant_reviews = (
    restaurant_reviews
    .withColumn('date', from_unixtime(restaurant_reviews['time'] / 1000))
    .drop('time')
)

In [142]:
# Year of the review, used as the partition column when the data is written
restaurant_reviews = restaurant_reviews.withColumn('year', year("date"))

Newlines (`\n`), commas (`,`) and quotes inside a review must end up in the CSV as plain text, not as row or field delimiters, so the review text is cleaned up before it is written.

In [143]:
from pyspark.sql.functions import col, regexp_replace, regexp_extract, when

# Double quotes are deliberately NOT doubled by hand here. The CSV writer used
# later (with escape set to '"') already quotes and escapes any field that
# contains a quote, so doubling them first would leave every quote duplicated
# in the data that is read back.

restaurant_reviews = (
    restaurant_reviews
    # Replace each newline with '. ' so a review stays on a single line
    .withColumn('text', regexp_replace(col('text'), '\n', '. '))
    # Rename the column
    .withColumnRenamed('text', 'reviews')
)

In [144]:
# Regular expression capturing the text between "(Translated by Google)" and
# "(Original)"
pattern = r'\(Translated by Google\)(.*)?\(Original\)'

# Replace a translated review with just its translation.
# The pattern must actually match: regexp_extract returns an empty string when
# it does not, so a review that starts with the marker but has no "(Original)"
# part would otherwise be blanked out instead of kept as-is.
restaurant_reviews = restaurant_reviews.withColumn(
    'reviews',
    when(
        col("reviews").startswith('(Translated by Google)') & col("reviews").rlike(pattern),
        regexp_extract("reviews", pattern, 1),
    ).otherwise(col("reviews")))

In [145]:
## Write csv files partitioned by year
# NOTE(review): coalesce(1) leaves a single partition, so the whole write runs
# in one task and each year=... folder receives one file.
(
    restaurant_reviews
    .coalesce(1)
    .write
    .partitionBy("year")
    .option("header", "true")
    .option("escape", "\"")
    .mode("overwrite")
    .csv("data/reviews_data")
)

## Trying out LLMs for labeling reviews

In [146]:
from pathlib import Path

# Spark gives part files a fresh UUID on every write, so a hard-coded file
# name stops working as soon as the data is re-exported. Locate the file(s) of
# the 2017 partition by pattern instead; this also avoids backslash escapes
# in the path.
year_dir = Path("data") / "reviews_data" / "year=2017"
part_files = sorted(year_dir.glob("part-*.csv"))
if not part_files:
    raise FileNotFoundError(f"No part-*.csv files found in {year_dir}")
df = pd.concat((pd.read_csv(part) for part in part_files), ignore_index=True)

In [147]:
# Display the 2017 reviews loaded into pandas
df

Unnamed: 0,gmap_id,name,rating,reviews,user_id,date
0,0x875499753a7ad067:0x6a377b89b61fd24f,Spencer Hanks,4,"I am a fan, very friendly folks, small town fe...",105772155185063591657,2017-04-13 19:17:09
1,0x87ba516f42fd107b:0x8b7bbc86bfaadecb,Lucinda Pickell,5,Great food and people. Wonderful staff.,115549324567733574577,2017-02-17 21:03:30
2,0x87528880fabd2f47:0x373f5ae581ba6518,Cory Vore,5,Best gluten free ever. So sad they're gone now.,116697624649953496211,2017-08-05 01:58:07
3,0x879571a7455a555f:0x30a0675b1aa64d29,Lucinda Rieschick,5,Wonderful. Chicken day today.,117867467651010443439,2017-08-31 10:36:14
4,0x87547de0ed23364d:0x5b25d19bde61e5d,Michael Salisbury,4,Food was good and there are chairs an DC boths!,104357590880035433069,2017-03-18 17:27:04
...,...,...,...,...,...,...
853317,0x80db7e77fc4e4cc7:0x78d4b2dc41facbcb,Lucille Kagehokousha,5,"Fresh Udon with clams, hand rolled tuna, great...",112326634882347760965,2017-09-05 22:28:08
853318,0x80c8db302c15da39:0xaefa98832a6a0510,Lucinda Brinks,4,Great salsa bar,106063786723215410976,2017-03-09 21:46:25
853319,0x89b7c66ca8c285f1:0x2f2ca55983a6220a,Lucinda Escobar,5,Friendly staff. Latin food. .,113374007153001499740,2017-09-25 18:55:04
853320,0x89c6c88511ad8cd9:0xbbfd036ebe4061c6,Lucinda Megill Legendre,5,So Delicious. Great drinks. Fun atmosphere. Ch...,105689843109706085917,2017-04-09 18:20:37
