In [14]:
from pyspark.sql import *
from pyspark.sql.functions import *
import logging

# Shared Spark session used by the transformations below:
# local master, application name 'app', reused if a session already exists.
spark = (
    SparkSession.builder
    .master('local')
    .appName('app')
    .getOrCreate()
)

class pipeline:
    """
    ETL steps that turn the raw Yelp JSON datasets into cleaned parquet files.

    Every ``*_transform`` method reads one ``yelp_academic_dataset_*.json`` file
    from the input folder through the module-level ``spark`` session, cleans it
    and overwrites the matching ``*.parquet`` output in the output folder.

    input: path of input data folder
    output: path of output processed data folder

    Both folders may be given with or without a trailing "/".
    """
    def __init__(self, input, output):
        # Stored exactly as given; the separator is added when a path is built.
        self.input = input
        self.output = output

    @staticmethod
    def _join(folder, file_name):
        """Return ``file_name`` placed inside ``folder``, adding "/" only if missing."""
        # Plain string handling (not pathlib) so the folder text is passed on
        # unchanged apart from the separator; an empty folder stays relative.
        if folder and not folder.endswith("/"):
            folder = folder + "/"
        return folder + file_name

    def _read_json(self, file_name):
        """Load one JSON dataset from the input folder into a DataFrame."""
        return spark.read.json(self._join(self.input, file_name))

    def _write_parquet(self, df, file_name):
        """Write ``df`` as parquet into the output folder, overwriting existing data."""
        target = self._join(self.output, file_name)
        df.write.format("parquet").mode("overwrite").save(target)

    def business_transform(self):
        """
            Reads business dataset, cleans data and stores processed files in output destination

            Adds a ``price_range`` column taken from
            ``attributes.RestaurantsPriceRange2``, drops rows containing nulls,
            keeps only businesses whose ``categories`` contain "Restaurants",
            removes the ``attributes`` column and writes ``business.parquet``.
        """
        logging.info('Reading business dataset')
        df_b = self._read_json("yelp_academic_dataset_business.json")

        # take RestaurantsPriceRange2 value from attributes column
        df_b = df_b.withColumn("price_range", col("attributes").getField("RestaurantsPriceRange2"))
        # drop null values (this runs after price_range exists, so rows
        # without a price range are removed as well)
        df_b = df_b.na.drop()
        # filter out non restaurant businesses
        df_b = df_b.filter(col("categories").contains("Restaurants"))
        # the only attribute needed has been extracted above
        df_b = df_b.drop("attributes")

        logging.info('Saving business dataset')
        self._write_parquet(df_b, "business.parquet")

    def review_transform(self):
        """
            Reads review dataset, cleans data and stores processed files in output destination

            Rows containing nulls are dropped; the result is written to ``review.parquet``.
        """
        logging.info('Reading Review Dataset')
        df_r = self._read_json("yelp_academic_dataset_review.json")
        df_r = df_r.na.drop()
        logging.info('Saving review dataset')
        self._write_parquet(df_r, "review.parquet")

    def user_transform(self):
        """
            Reads user dataset, cleans data and stores processed files in output destination

            Rows containing nulls are dropped; the result is written to ``user.parquet``.
        """
        logging.info('Reading user dataset')
        df_u = self._read_json("yelp_academic_dataset_user.json")
        df_u = df_u.na.drop()
        logging.info('saving user dataset...')
        self._write_parquet(df_u, "user.parquet")

    def checkin_transform(self):
        """
            Reads checkin dataset, cleans data and stores processed files in output destination

            Replaces the ``date`` column by ``no_of_checkins``, the number of
            comma-separated entries in ``date``, and writes ``checkin.parquet``.
        """
        logging.info("Reading checkin dataset")
        df_c = self._read_json("yelp_academic_dataset_checkin.json")
        # NOTE(review): unlike the other datasets no null rows are dropped here;
        # confirm that a missing ``date`` cannot occur or is handled downstream.
        df_c = df_c.withColumn("no_of_checkins", size(split(col("date"), ",")))
        df_c = df_c.drop("date")
        logging.info('saving checkin dataset')
        self._write_parquet(df_c, "checkin.parquet")

    def tip_transform(self):
        """
            Reads tip dataset, cleans data and stores processed files in output destination

            Rows containing nulls are dropped; the result is written to ``tip.parquet``.
        """
        logging.info('Reading tip dataset')
        df_t = self._read_json("yelp_academic_dataset_tip.json")
        df_t = df_t.na.drop()
        logging.info('saving tip dataset...')
        self._write_parquet(df_t, "tip.parquet")






In [16]:
class Main:
    """
        Runs every transformation of the ETL pipeline when instantiated.

        Logging is configured to write DEBUG-level output to 'pipeline.log',
        then the business, review, user, checkin and tip datasets are
        processed in that order.

        Args:
            source: path of Azure blob storage containing datasets
            output: path of Azure blob storage to store processed parquet files after transforming+cleaning

        Both arguments default to "/HdiNotebooks/", so ``Main()`` reads from
        and writes to that folder.
    """
    def __init__(self, source="/HdiNotebooks/", output="/HdiNotebooks/"):
        logging.basicConfig(filename='pipeline.log', level=logging.DEBUG)
        ETL_data = pipeline(source, output)
        # Each step reads its own JSON file and writes its own parquet output.
        ETL_data.business_transform()
        ETL_data.review_transform()
        ETL_data.user_transform()
        ETL_data.checkin_transform()
        ETL_data.tip_transform()


# Run the complete ETL: constructing Main configures logging and executes
# every transformation inside its __init__.
Main()

    

An error was encountered:
Session 0 did not reach idle status in time. Current status is busy.
