In [None]:
import sys
sys.path.insert(0, '/Users/pyro/github/HiveHelper_on_PySpark/hhop') 
# for running .ipynb files anywhere outside of a current dir using the module hhop

from functools import reduce
from importlib import reload
import pandas as pd

from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

import shutil, os, time # working with FS
from glob import glob
from shutil import copy2
from pathlib import Path

import hhop # custom module
from hhop import DFExtender, SchemaManager #main classes
import funs
from funs import read_table, write_table, union_all # useful functions
from spark_init import spark

In [None]:
display(spark)

### Creating a synth table from csv

In [None]:
df_src = spark.read.csv('./synth_data/table1.csv', header=True, sep=';')

In [None]:
df_src.show()

In [None]:
# Save the CSV sample twice: once partitioned by dt_part/group_part,
# and once unpartitioned after repartitioning the DataFrame into 4 partitions.
(
    df_src.write
    .mode('overwrite')
    .partitionBy('dt_part', 'group_part')
    .saveAsTable('default.part_table_test1')
)
(
    df_src.repartition(4)
    .write
    .mode('overwrite')
    .saveAsTable('default.nonpart_table_test1')
)

## Info about table

### Reading table from Hive

In [None]:
df = read_table('default.part_table_test1', verbose=True, cnt_files=True)

You can use the following shell command to count the parquet files in any subdirectory:

In [None]:
!hdfs dfs -ls -R file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/part_table_test1/dt_part=2022-12-19 | grep '.parquet' | wc -l

**Whenever you get a DF from DFExtender, do not forget to do one of the following:**  
1. write it to Hive using the custom function `write_table(df, table, ...)` (same as method 2, but with defaults)
2. write it to Hive using native Spark methods: `df.write.mode('overwrite').saveAsTable('schema.table')`
3. cache the DF, e.g. `df = df.cache()`


Otherwise Spark will re-read the sources of this DF every time it is used, which can be very time-consuming.

### NULL checks

In [None]:
# DFExtender doesn't change DataFrame during initialization and returns it as is
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info()

In [None]:
# this method returns a DF sorted by count of nulls in selected columns in descending order
df_check_null = df_check.get_df_with_null(['var1', 'var2'])

In [None]:
df_check_null.show()

### Primary Key checks

In [None]:
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info()

In [None]:
df_check.df_duplicates_pk.show()

## Comparing tables

Sometimes you need to compare two tables based on their primary keys.    
The `compare_tables` method does exactly that. It
1. joins two DFs
2. calculates statistics from `DFExtender.get_info()`
3. prints statistics on joining the two tables and errors in non-PK attributes
4. returns a DF with errors for manual analysis

In [None]:
def write_synth_sample(name):
    """Load ``./synth_data/{name}.csv`` and save it as the table ``default.{name}``.

    The CSV is read with a header row and ``;`` as the separator. The target
    table is overwritten and partitioned by ``dt_part`` and ``group_part``.
    """
    csv_df = spark.read.csv(f'./synth_data/{name}.csv', header=True, sep=';')
    partitioned_writer = csv_df.write.mode('overwrite').partitionBy('dt_part', 'group_part')
    partitioned_writer.saveAsTable(f'default.{name}')
    
# Write both comparison samples from ./synth_data as tables in the `default` schema
write_synth_sample('table1_comp')  
write_synth_sample('table2_comp')

# Read them back; the aliases 'main' and 'ref' are passed to read_table.
# NOTE(review): presumably the aliases tell the two sides apart in compare_tables — confirm in funs.read_table
df = read_table('default.table1_comp', alias='main')
df_ref = read_table('default.table2_comp', alias='ref')

In [None]:
# # For testing DFs without common columns outside of PK
# df=df.select(['pk1', 'pk2'])
# df_ref=df_ref.select(['pk1', 'pk2'])

The DFExtender instance is the **main DF**;   
the DF passed as an argument is the **reference DF**.

In [None]:
df_main = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_main.compare_tables(df_ref)

In [None]:
df_matching_errors = df_main.df_with_errors

In [None]:
# filter for rows that are "not in main table"
df_matching_errors.filter(col('is_joined_main').isNull())\
.count() 

In [None]:
# filter for rows that are "not in reference table"
df_matching_errors.filter(col('is_joined_ref').isNull())\
.count()

In [None]:
# filter for finding an exact difference in column
df_matching_errors.filter(col('var1_is_diff') == 1).select('var1_is_diff', 'var1_main', 'var1_ref').show()

## SchemaManager

This class provides an interface for analyzing how many tables in a schema don't have underlying folders or any data.  
Then you can drop empty or broken tables from the selected schema.

In [None]:
schema_name = 'popular_schema' # our chosen schema for inspection
spark.sql(f"drop database if exists {schema_name} cascade")
spark.sql(f'create database {schema_name}')
None

In [None]:
# DataFrameWriter is mutable: partitionBy() changes the writer it is called on and returns it.
# A single shared writer would therefore carry table1's partitioning over to table2 and table3,
# so every table starts from a fresh `df_src.write`.
df_src.write.mode('overwrite').partitionBy('dt_part', 'group_part').saveAsTable(f'{schema_name}.table1')
df_src.write.mode('overwrite').saveAsTable(f'{schema_name}.table2')
df_src.write.mode('overwrite').saveAsTable(f'{schema_name}.table3')

# Recreate the view on top of table1
spark.sql(f"drop view if exists {schema_name}.my_view")
spark.sql(f"create view {schema_name}.my_view as select * from {schema_name}.table1")
None  # suppress the cell output of the last spark.sql call

We created 3 sample tables and 1 view:

1. table1 has dir + data. It **won't** be deleted.  
2. table2 has only root dir and no data. It **will** be deleted.  
3. table3 has neither a dir nor data. It **will** be deleted.
4. my_view is a **view** and it is going to be **ignored**.

In [None]:
# Derive the table locations from `schema_name` so this cell follows the schema chosen above
# instead of repeating its name as a literal.
schema_dir = f'./spark-warehouse/{schema_name}.db'
table2_path = f'{schema_dir}/table2'
table3_path = f'{schema_dir}/table3'

# table2: remove the data but leave an empty root dir behind
shutil.rmtree(table2_path, ignore_errors=True)
os.makedirs(table2_path, exist_ok=True)

# table3: remove the dir completely
shutil.rmtree(table3_path, ignore_errors=True)

In [None]:
spark.sql(f"show tables in {schema_name}").show(10, False)

In [None]:
# Inspect the schema selected in `schema_name` rather than repeating its name as a literal
popular_schema = SchemaManager(schema_name)

In [None]:
popular_schema.find_empty_tables()

In [None]:
popular_schema.dict_of_tables

In [None]:
popular_schema.drop_empty_tables()
# Errors in the output are OK here: the table folders were removed by hand above,
# while the Metastore entries for those tables were left unchanged.

In [None]:
spark.sql(f"show tables in {schema_name}").show(10, False)

## Extra

### function `union_all`

This function performs a union of any number of Spark DataFrames.  
Requirements:
1. all DFs must have the same columns
2. if `dfs` is a list, unpack it with `*dfs`

In [None]:
# Build a list of 3 DataFrames with 5 rows each.
# The rows cover equal values, different values, a NULL on either side, and NULLs on both sides.
values = [
    ("x", "x"),
    ("x", "y"),
    ("x", None),
    (None, "x"),
    (None, None),
]
columns = ['val1', 'val2']

# One DataFrame per (is_joined_main, is_joined_ref) pair of literal flags
join_flags = [(1, 1), (1, None), (None, 1)]

list_dfs = [
    spark.createDataFrame(values, columns)
    .withColumn('is_joined_main', F.lit(flag_main))
    .withColumn('is_joined_ref', F.lit(flag_ref))
    for flag_main, flag_ref in join_flags
]

In [None]:
print('count of 1 table:', list_dfs[0].count()) # this is going to be 5 * 3 = 15 after union_all()

In [None]:
print(len(list_dfs)) # 3 DFs in the list
list_dfs

In [None]:
df_from_union = union_all(*list_dfs).cache()

# union_all(list_dfs[0], list_dfs[1], list_dfs[2]) # equivalent
print('count of table after 3 unions:', df_from_union.count())
df_from_union

In [None]:
# This is exactly the filter used in the script for comparing tables.
# Column names substituted into the SQL expression below.
dummy1, dummy2,val1,val2='is_joined_main','is_joined_ref','val1','val2'
# is_diff = 0 when
#   - the row is missing on either side (one of the is_joined_* flags is NULL), or
#   - both values are NULL, or
#   - both values are equal;
# otherwise is_diff = 1.
# A comparison with NULL is never true in SQL, so a row where only one of the
# two values is NULL falls through to `else 1`.
cond_diff = f"""case when
                ({dummy1} is null or {dummy2} is null) 
                or
                ({val1} is null and {val2} is null)
                or 
                ({val1} = {val2})
                then 0
                else 1
            end"""

# Apply the expression to the union of the three test DataFrames and show up to 100 rows
(
    df_from_union
    .withColumn('is_diff', F.expr(cond_diff))
    .show(100)
)

### reading from Hive

1. straight parquet files
2. using hive query

In [None]:
spark.read.parquet('/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/part_table_test1/dt_part=2022-12-15/*').count()

In [None]:
df_sql = spark.sql("select count(1) as cnt from default.part_table_test1 where dt_part='2022-12-15'")
df_sql.show()

### writing DataFrames to Hive

In [None]:
df_from_union_write = df_from_union.limit(2)

In [None]:
# 1
write_table(df_from_union_write, 'test_writing_1')

In [None]:
df_from_union_write.write.mode('overwrite').saveAsTable('default.test_writing_3')

In [None]:
write_table(df, 'hello_test3', partition_cols=['index', 'var1'])

In [None]:
read_table('default.hello_test3', verbose=1, cnt_files=True)

## Modification of code

1. read as you like, use DFExtender to get stats
2. use all methods from PySpark as usual (beware that PySpark methods return a DataFrame object, not DFExtender object)

Check out official documentation!
1. [pyspark.sql.DataFrame methods](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.html)
2. [PySpark functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html)

In [None]:
df = read_table('default.table1_comp', alias='main')
df_main = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)

In [None]:
print(df_main.__class__)

In [None]:
# apply PySpark method for DFExtender object
df_main_filter = df_main.filter(col('pk1').isNotNull())

In [None]:
print(df_main_filter.__class__) # the type of an object returns to Spark DataFrame

### Generating txt files for sending over email

In [None]:
# Copy every module from ./hhop into ./hhop/txt with a .txt extension.
py_files = glob('./hhop/*.py')
txt_dir = Path('./hhop/txt')
txt_dir.mkdir(parents=True, exist_ok=True)

for file in py_files:
    # Path(...).name (instead of splitting on '/') also handles Windows path separators.
    # Copying straight to the final .txt name avoids the intermediate .py copy and the
    # rename step, which raises FileExistsError on Windows when this cell is re-run.
    target = txt_dir / Path(file).with_suffix('.txt').name
    copy2(file, target)

In [None]:
spark.stop()