In [58]:
#library import
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import *
# Module-level Spark session shared by the cells below: local master,
# application name "parquet_to_unified"; an existing session is reused if present.
spark = (
    SparkSession.builder
    .master("local")
    .appName("parquet_to_unified")
    .getOrCreate()
)

In [71]:
# Script: read the landing parquet files, join them, and write the result to the unified layer
class parquet_to_unified:
    """Join the marks, student and subject parquet datasets and persist the result.

    The class is driven by a configuration mapping (``file_parameter``). Keys
    read by the methods below:

    * ``src_base_read_path``, ``read_filename_marks``, ``read_filename_student``,
      ``read_filename_subject``, ``parquet_format`` -- concatenated to build the
      three source paths.
    * ``dest_base_write_path``, ``write_merged_df`` -- concatenated to build the
      destination path.
    * ``parquet`` -- format name passed to the DataFrame writer.
    * ``local_db_name`` -- database that is created and that holds the table.
    * ``table_name_school_unified`` -- name of the saved table.

    All Spark access goes through the module-level ``spark`` session.
    """

    def __init__(self, file_parameter):
        """Store the configuration mapping used by every other method.

        Args:
            file_parameter: dict of configuration values (see class docstring).
        """
        self.file_parameter = file_parameter

    def read_parquet(self, fileformat, file_path):
        """Read a parquet dataset with the module-level ``spark`` session.

        Args:
            fileformat: Not used by this method; kept so existing callers
                that pass it keep working.
            file_path: Path handed to ``spark.read.parquet``.

        Returns:
            The DataFrame returned by ``spark.read.parquet(file_path)``.
        """
        return spark.read.parquet(file_path)

    def write_parquet(self, df, dest_path, table_name):
        """Write ``df`` to ``dest_path`` and register it as a table.

        The data is written exactly once: ``saveAsTable`` with the ``path``
        option both stores the files at ``dest_path`` and registers the table.
        A separate plain parquet write to the same path beforehand would be
        overwritten immediately and would only execute the query plan a
        second time, so it is not performed.

        Args:
            df: DataFrame to persist.
            dest_path: Target directory for the written files.
            table_name: Table name, qualified with the configured database.
        """
        # Use the same database that main_block creates; fall back to the
        # historical hard-coded name when the key is absent.
        database = self.file_parameter.get('local_db_name', 'school_unified')
        qualified_table = "{0}.{1}".format(database, table_name)
        (
            df.coalesce(1)  # one partition before writing
            .write.format(self.file_parameter['parquet'])
            .mode("overwrite")
            .option("path", dest_path)
            .saveAsTable(qualified_table)
        )

    def create_view(self, df1, df2, df3):
        """Rename the columns of the three inputs and join them into one DataFrame.

        Despite its name, this method does not create a temporary view and
        does not add columns; it renames and joins.

        ``toDF`` renames columns by position, so each input must have exactly
        as many columns as its name list below, in the same order.
        NOTE(review): the column order of the landing files is assumed to match
        these lists -- confirm against the job that writes them.

        Both joins use the default join type (inner), so marks rows without a
        matching ``student_id`` or ``subject_id`` are not part of the result.

        Args:
            df1: Marks DataFrame (8 columns expected).
            df2: Student DataFrame (6 columns expected).
            df3: Subject DataFrame (6 columns expected).

        Returns:
            The joined DataFrame.
        """
        marks_schema = [
            "exam_date", "marks", "student_id", "subject_id",
            "marks_audit_created_username", "marks_audit_created_timestamp",
            "marks_audit_updated_username", "marks_audit_updated_timestamp",
        ]
        student_schema = [
            "student_name", "student_id",
            "student_audit_created_username", "student_audit_created_timestamp",
            "student_audit_updated_username", "student_audit_updated_timestamp",
        ]
        subject_schema = [
            "subject_name", "subject_id",
            "subject_audit_created_username", "subject_audit_created_timestamp",
            "subject_audit_updated_username", "subject_audit_updated_timestamp",
        ]

        updated_marks_df = df1.toDF(*marks_schema)
        updated_student_df = df2.toDF(*student_schema)
        updated_subject_df = df3.toDF(*subject_schema)

        unified_df = (
            updated_marks_df
            .join(updated_student_df, ['student_id'])
            .join(updated_subject_df, ['subject_id'])
        )

        # Diagnostics: (rows, columns) of the marks input and of the joined result.
        print((df1.count(), len(df1.columns)))
        # Count the joined result once and reuse it; every count() is a Spark
        # action that executes the join again.
        unified_row_count = unified_df.count()
        print((unified_row_count, len(unified_df.columns)))
        unified_df.printSchema()
        # Prints every row of the joined result, untruncated.
        unified_df.show(unified_row_count, truncate=False)
        return unified_df

    def _source_path(self, filename_key):
        """Build one source path: base read path + file name + parquet extension."""
        params = self.file_parameter
        return (
            params['src_base_read_path']
            + params[filename_key]
            + params['parquet_format']
        )

    def main_block(self):
        """Create the database, read the three sources, join them and persist the result."""
        params = self.file_parameter
        spark.sql("create database if not exists {0}".format(params['local_db_name']))

        marks_df = self.read_parquet(
            params['parquet_format'], self._source_path('read_filename_marks'))
        student_df = self.read_parquet(
            params['parquet_format'], self._source_path('read_filename_student'))
        subject_df = self.read_parquet(
            params['parquet_format'], self._source_path('read_filename_subject'))

        unified_df = self.create_view(marks_df, student_df, subject_df)
        self.write_parquet(
            unified_df,
            params['dest_base_write_path'] + params['write_merged_df'],
            params['table_name_school_unified'],
        )



In [72]:
if __name__=='__main__':

    # Configuration for the parquet_to_unified run: file extensions and format
    # names, table names, source file names, destination file, base paths and
    # the database name. Paths are relative to the notebook's working directory.
    file_read = dict(
        csv_format=".csv",
        csv="csv",
        parquet_format=".parquet",
        parquet="parquet",
        table_name_school_unified="unified_student_details",
        table_name_students="students",
        table_name_subject="subjects",
        read_filename_marks="/marks/markdetails",
        read_filename_student="/student/studentdetails",
        read_filename_subject="/subject/subjectdetails",
        write_merged_df="/school_unified/students_details.parquet",
        dest_base_write_path="../../data_files/unified",
        src_base_read_path="../../data_files/landing/school_landing",
        temp_table="school_table",
        local_db_name="school_unified",
    )

    # Build the job object from the configuration and run the whole pipeline.
    unified_obj1 = parquet_to_unified(file_read)
    unified_obj1.main_block()
   

(312, 8)
(226, 18)
root
 |-- subject_id: string (nullable = true)
 |-- student_id: string (nullable = true)
 |-- exam_date: string (nullable = true)
 |-- marks: string (nullable = true)
 |-- marks_audit_created_username: string (nullable = true)
 |-- marks_audit_created_timestamp: timestamp (nullable = true)
 |-- marks_audit_updated_username: string (nullable = true)
 |-- marks_audit_updated_timestamp: timestamp (nullable = true)
 |-- student_name: string (nullable = true)
 |-- student_audit_created_username: string (nullable = true)
 |-- student_audit_created_timestamp: timestamp (nullable = true)
 |-- student_audit_updated_username: string (nullable = true)
 |-- student_audit_updated_timestamp: timestamp (nullable = true)
 |-- subject_name: string (nullable = true)
 |-- subject_audit_created_username: string (nullable = true)
 |-- subject_audit_created_timestamp: timestamp (nullable = true)
 |-- subject_audit_updated_username: string (nullable = true)
 |-- subject_audit_updated_timestamp: timestamp (nullable = true)

22/02/07 22:34:14 WARN HadoopFSUtils: The directory file:/Users/vasanth_ku/Vasanth/spark-apache/git%20projects/big_data_project_git/etl_scripts/unified/data_files/unified/school_unified/students_details.parquet was not found. Was it deleted very recently?
