In [15]:
# Importing files
import csv
import os
import sys
# Spark imports
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
from pyspark.sql.functions import col, asc, desc
from pyspark.sql.functions import count, when # used to count nulls for all columns in one pass

from pyspark.sql.functions import array_contains # used to check if the array has a value

In [2]:
# Initialize a spark session.
def init_spark():
    """Return the SparkSession for this notebook.

    The session is named "Python Spark SQL basic example" and is obtained via
    ``getOrCreate()`` on the configured builder.
    """
    builder = SparkSession.builder
    builder = builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

In [3]:
# BUSINESS

In [4]:
# Initialize the Spark session used by every cell below.
spark = init_spark()

In [5]:
# Relative path of the Yelp business dataset (JSON).
path = 'data/yelp_academic_dataset_business.json'
# Load the business file into a Spark DataFrame.
df_business = spark.read.json(path)

In [6]:
# Report the business DataFrame's shape as (row count, column count).
business_shape = (df_business.count(), len(df_business.columns))
print(business_shape)

(150346, 14)


In [7]:
# Display the column names of the business DataFrame.
df_business.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [51]:
# Display each business column's name paired with its Spark data type.
df_business.dtypes

[('address', 'string'),
 ('attributes',
  'struct<AcceptsInsurance:string,AgesAllowed:string,Alcohol:string,Ambience:string,BYOB:string,BYOBCorkage:string,BestNights:string,BikeParking:string,BusinessAcceptsBitcoin:string,BusinessAcceptsCreditCards:string,BusinessParking:string,ByAppointmentOnly:string,Caters:string,CoatCheck:string,Corkage:string,DietaryRestrictions:string,DogsAllowed:string,DriveThru:string,GoodForDancing:string,GoodForKids:string,GoodForMeal:string,HairSpecializesIn:string,HappyHour:string,HasTV:string,Music:string,NoiseLevel:string,Open24Hours:string,OutdoorSeating:string,RestaurantsAttire:string,RestaurantsCounterService:string,RestaurantsDelivery:string,RestaurantsGoodForGroups:string,RestaurantsPriceRange2:string,RestaurantsReservations:string,RestaurantsTableService:string,RestaurantsTakeOut:string,Smoking:string,WheelchairAccessible:string,WiFi:string>'),
 ('business_id', 'string'),
 ('categories', 'string'),
 ('city', 'string'),
 ('hours',
  'struct<Friday:st

In [8]:
# Print how many null values each column of the business DataFrame has.
# All per-column counts are computed in a single aggregation job: `when`
# yields 1 for a null cell and null otherwise, and `count` only counts
# non-null values. This avoids one full scan of the data per column.
business_null_counts = df_business.agg(
    *[count(when(col(c).isNull(), 1)).alias(c) for c in df_business.columns]
).first()
# One "<column> <null count>" line per column, in schema order.
for column_name in df_business.columns:
    print(column_name, business_null_counts[column_name])

address 0
attributes 13744
business_id 0
categories 103
city 0
hours 23223
is_open 0
latitude 0
longitude 0
name 0
postal_code 0
review_count 0
stars 0
state 0


In [21]:
# Ad-hoc exploration queries, kept commented out for reference:
#df_business.select("city").distinct().sort("city").show(200) # distinct cities, sorted by name
#df_business.filter(df_business.latitude >= 43.0).select('state').distinct().show()
#df_business.show(1) # first row of the dataset
#df_business.select("state").distinct().show(100) # distinct states in the dataset
#df_business.filter(df_business.city == "Chicago").count()  # number of rows for this city

# Show the untruncated `categories` value of the first 25 rows.
df_business.select("categories").show(n=25, truncate=False)

+----------------------------------------------------------------------------------------------------------+
|categories                                                                                                |
+----------------------------------------------------------------------------------------------------------+
|Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists|
|Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services                            |
|Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores                        |
|Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries                                                     |
|Brewpubs, Breweries, Food                                                                                 |
|Burgers, Fast Food, Sandwiches, Food, Ice Cream & Frozen Yogurt, Restaurants                              |
|Sporting Goods, Fa

In [63]:
# Show the untruncated `attributes` struct of the first 10 rows.
df_business.select("attributes").show(n=10, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|attributes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |
+---------------

In [54]:
# `categories` is a plain comma-separated string column rather than an array,
# so the earlier array_contains attempt was dropped. Keep the rows whose
# `categories` string contains the substring 'Restaurants'.
df_business_filtered = df_business.where(col("categories").contains("Restaurants"))

In [55]:
# Report the filtered DataFrame's shape as (row count, column count).
business_filtered_shape = (df_business_filtered.count(), len(df_business_filtered.columns))
print(business_filtered_shape)

(52268, 14)


In [57]:
# Sanity-check the filter: show the untruncated `categories` value of the
# first 25 rows that passed it.
df_business_filtered.select("categories").show(n=25, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|categories                                                                                                                                                 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries                                                                                                      |
|Burgers, Fast Food, Sandwiches, Food, Ice Cream & Frozen Yogurt, Restaurants                                                                               |
|Pubs, Restaurants, Italian, Bars, American (Traditional), Nightlife, Greek                                                                                 |
|Ice Cream & Frozen Yogurt, Fast Food, Burgers, Rest

In [None]:
###############
## USER FILE ##
###############

In [66]:
# Relative path of the Yelp user dataset (JSON); reuses the `path` variable.
path = 'data/yelp_academic_dataset_user.json'
# Load the user file into a Spark DataFrame.
df_user = spark.read.json(path)

In [68]:
# Report the user DataFrame's shape as (row count, column count).
user_shape = (df_user.count(), len(df_user.columns))
print(user_shape)

(1987897, 22)


In [69]:
# Display the column names of the user DataFrame.
df_user.columns

['average_stars',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool',
 'elite',
 'fans',
 'friends',
 'funny',
 'name',
 'review_count',
 'useful',
 'user_id',
 'yelping_since']

In [88]:
# Print how many null values each column of the user DataFrame has.
# All per-column counts are computed in a single aggregation job: `when`
# yields 1 for a null cell and null otherwise, and `count` only counts
# non-null values. This avoids one full scan of the data per column.
user_null_counts = df_user.agg(
    *[count(when(col(c).isNull(), 1)).alias(c) for c in df_user.columns]
).first()
# One "<column> <null count>" line per column, in schema order.
for column_name in df_user.columns:
    print(column_name, user_null_counts[column_name])

average_stars 0
compliment_cool 0
compliment_cute 0
compliment_funny 0
compliment_hot 0
compliment_list 0
compliment_more 0
compliment_note 0
compliment_photos 0
compliment_plain 0
compliment_profile 0
compliment_writer 0
cool 0
elite 0
fans 0
friends 0
funny 0
name 0
review_count 0
useful 0
user_id 0
yelping_since 0


In [None]:
#################
## REVIEW FILE ##
#################

In [70]:
# Relative path of the Yelp review dataset (JSON); reuses the `path` variable.
path = 'data/yelp_academic_dataset_review.json'
# Load the review file into a Spark DataFrame.
df_review = spark.read.json(path)

In [71]:
# Report the review DataFrame's shape as (row count, column count).
review_shape = (df_review.count(), len(df_review.columns))
print(review_shape)

(6990280, 9)


In [72]:
# Display the column names of the review DataFrame.
df_review.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

In [89]:
# Print how many null values each column of the review DataFrame has.
# All per-column counts are computed in a single aggregation job: `when`
# yields 1 for a null cell and null otherwise, and `count` only counts
# non-null values. This avoids one full scan of the data per column.
review_null_counts = df_review.agg(
    *[count(when(col(c).isNull(), 1)).alias(c) for c in df_review.columns]
).first()
# One "<column> <null count>" line per column, in schema order.
for column_name in df_review.columns:
    print(column_name, review_null_counts[column_name])

business_id 0
cool 0
date 0
funny 0
review_id 0
stars 0
text 0
useful 0
user_id 0
