In [1]:
import os
import sys

# Make the local `hhop` modules importable no matter which directory the notebook
# is started from. The location can be overridden with the HHOP_PATH environment
# variable; the default is the original checkout path.
HHOP_PATH = os.environ.get('HHOP_PATH', '/Users/pyro/github/HiveHelper_on_PySpark/hhop')
sys.path.insert(0, HHOP_PATH)

from functools import reduce
from importlib import reload
import pandas as pd

from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql.window import Window as W

import shutil, os, time # working with FS

import hhop # custom module
from hhop import DFExtender, SchemaManager #main classes
import funs
from funs import read_table, write_table, union_all # useful functions
from spark_init import spark

22/12/21 00:17:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Creating a synth table from csv

In [2]:
# Load the synthetic sample: the first CSV line holds the column names, fields are ';'-separated.
# No schema is passed here; the schema printed further below shows every column as string.
df_src = spark.read.csv('./synth_data/table1.csv', header=True, sep=';')

In [3]:
df_src.show()

+-----+----+----+------+--------+----------+----------+
|index| pk1| pk2|  var1|    var2|   dt_part|group_part|
+-----+----+----+------+--------+----------+----------+
|    1|key1|   1|value1|value2_1|2022-12-15|    group1|
|    2|key1|   1|value1|value2_1|2022-12-16|    group2|
|    3|key1|   2|value1|value2_1|2022-12-16|    group3|
|    4|key2|   2|  null|value2_1|2022-12-17|    group1|
|    5|key2|   3|value1|value2_1|2022-12-18|    group2|
|    6|key2|   4|value1|    null|2022-12-19|    group3|
|    7|key2|null|value1|value2_1|2022-12-19|    group4|
|    8|null|   4|value1|value2_1|2022-12-20|    group3|
|    9|null|null|value1|value2_1|2022-12-20|    group7|
+-----+----+----+------+--------+----------+----------+



In [4]:
# Partitioned copy of the sample, partitioned by dt_part and group_part.
(
    df_src.write
    .mode('overwrite')
    .partitionBy('dt_part', 'group_part')
    .saveAsTable('default.part_table_test1')
)

# Non-partitioned copy; the DataFrame is repartitioned into 4 partitions before the write.
(
    df_src
    .repartition(4)
    .write
    .mode('overwrite')
    .saveAsTable('default.nonpart_table_test1')
)

                                                                                

## Info about table

### Reading table from Hive

In [5]:
df = read_table('default.part_table_test1', verbose=True, cnt_files=True)

root
 |-- index: string (nullable = true)
 |-- pk1: string (nullable = true)
 |-- pk2: string (nullable = true)
 |-- var1: string (nullable = true)
 |-- var2: string (nullable = true)
 |-- dt_part: string (nullable = true)
 |-- group_part: string (nullable = true)

partition columns: ['dt_part', 'group_part']


                                                                                

location: file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/part_table_test1
9 parquet files at the location


**Whenever you get a DF from DFExtender, do not forget to do one of the following:**  
1. write it to Hive using the custom function `write_table(df, table, ...)`
2. write it to Hive using native Spark methods: `df.write.mode('overwrite').saveAsTable('schema.table')`
3. cache the DF, e.g. `df = df.cache()`


Otherwise Spark will re-read the sources of this DF on every action, which can be very time-consuming.

### NULL checks

In [6]:
# DFExtender wraps the DataFrame without changing it during initialization;
# `pk` lists the columns that form the primary key.
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
# get_info() prints row / PK counts and per-column NULL statistics (see the output below)
df_check.get_info()

You can access DF with PK duplicates in an attribute `.df_duplicates_pk`

Count all:                9
Unique PK count:          8
PK with duplicates:       1
PK column 'pk1' contains empty values, be careful!
PK column 'pk2' contains empty values, be careful!

Null values in columns - {'column': [count NULL, share NULL]}:
{'pk1': [2, 0.2222], 'pk2': [2, 0.2222], 'var1': [1, 0.1111], 'var2': [1, 0.1111]}


In [7]:
# this method returns a DF sorted by count of nulls in selected columns in descending order
df_check_null = df_check.get_df_with_null(['var1', 'var2'])

In [8]:
df_check_null.show()

+-----+----+---+------+--------+----------+----------+---------+
|index| pk1|pk2|  var1|    var2|   dt_part|group_part|cnt_nulls|
+-----+----+---+------+--------+----------+----------+---------+
|    6|key2|  4|value1|    null|2022-12-19|    group3|        1|
|    4|key2|  2|  null|value2_1|2022-12-17|    group1|        1|
+-----+----+---+------+--------+----------+----------+---------+



### PK checks

In [9]:
# Same construction as in the NULL checks section (presumably repeated so that this
# section can be read and run on its own).
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
# The printed report points to the attribute `.df_duplicates_pk`, which is inspected next.
df_check.get_info()

You can access DF with PK duplicates in an attribute `.df_duplicates_pk`

Count all:                9
Unique PK count:          8
PK with duplicates:       1
PK column 'pk1' contains empty values, be careful!
PK column 'pk2' contains empty values, be careful!

Null values in columns - {'column': [count NULL, share NULL]}:
{'pk1': [2, 0.2222], 'pk2': [2, 0.2222], 'var1': [1, 0.1111], 'var2': [1, 0.1111]}


In [10]:
df_check.df_duplicates_pk.show()

+-----+----+---+------+--------+----------+----------+------+
|index| pk1|pk2|  var1|    var2|   dt_part|group_part|cnt_pk|
+-----+----+---+------+--------+----------+----------+------+
|    1|key1|  1|value1|value2_1|2022-12-15|    group1|     2|
|    2|key1|  1|value1|value2_1|2022-12-16|    group2|     2|
+-----+----+---+------+--------+----------+----------+------+



In [11]:
# Cross-check with plain PySpark: number of rows per PK combination, most frequent first.
(
    df
    .groupBy(['pk1', 'pk2'])
    .count()
    .orderBy(F.desc('count'))
    .show()
)

+----+----+-----+
| pk1| pk2|count|
+----+----+-----+
|key1|   1|    2|
|key2|   4|    1|
|key2|   3|    1|
|key2|null|    1|
|key2|   2|    1|
|null|   4|    1|
|key1|   2|    1|
|null|null|    1|
+----+----+-----+



## Comparing tables

Sometimes you need to compare two tables based on their primary keys.    
The method `DFExtender.compare_tables()` does exactly that. It
1. joins two DFs
2. calculates statistics from `DFExtender.get_info()` for both DFs
3. prints statistics on joining the two tables and on errors in non-PK attributes
4. provides a DF with errors (attribute `.df_with_errors`) for manual analysis

In [12]:
def write_synth_sample(name, schema='default', data_dir='./synth_data',
                       partition_cols=('dt_part', 'group_part')):
    """Load a ';'-separated CSV sample and save it as a partitioned table.

    Reads `{data_dir}/{name}.csv` (the first line is treated as the header) and
    overwrites the table `{schema}.{name}`, partitioned by `partition_cols`.
    With the default arguments the behaviour is the same as before the
    parameters were added.

    Args:
        name: CSV file name without extension; also used as the table name.
        schema: schema (database) the table is saved to.
        data_dir: directory that contains the CSV samples.
        partition_cols: names of the columns the saved table is partitioned by.

    Returns:
        None. The table is written as a side effect.
    """
    df_sample = spark.read.csv(f'{data_dir}/{name}.csv', header=True, sep=';')
    (
        df_sample.write
        .mode('overwrite')
        .partitionBy(*partition_cols)
        .saveAsTable(f'{schema}.{name}')
    )
    
# Save both synthetic samples as tables default.table1_comp / default.table2_comp
write_synth_sample('table1_comp')  
write_synth_sample('table2_comp')

# NOTE: `df` is rebound here - from this cell on it refers to table1_comp,
# not to part_table_test1 that was read earlier in the notebook.
df = read_table('default.table1_comp', alias='main')
df_ref = read_table('default.table2_comp', alias='ref')

In [13]:
# For testing DFs without common columns outside of PK
# df=df.select(['pk1', 'pk2'])
# df_ref=df_ref.select(['pk1', 'pk2'])

The instance of DFExtender is the **main DF**,   
the DF passed as an argument is the **reference DF**.

In [14]:
df_main = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_main.compare_tables(df_ref)

Main DF
Count all:                6
Unique PK count:          6
PK with duplicates:       0

Reference DF
Count all:                6
Unique PK count:          6
PK with duplicates:       0

Errors in columns - {'column': [count is_error, share is_error]}
{'var1': [2, 0.4], 'group_part': [2, 0.4], 'var2': [1, 0.2]}

Count stats of matching main and reference tables:
not in main table:        1
not in reference table:   1
correct matching:         5


In [15]:
df_matching_errors = df_main.df_with_errors

In [16]:
# rows reported as "not in main table": the main-side join marker is NULL
(
    df_matching_errors
    .where(col('is_joined_main').isNull())
    .count()
)

1

In [17]:
# rows reported as "not in reference table": the reference-side join marker is NULL
(
    df_matching_errors
    .where(col('is_joined_ref').isNull())
    .count()
)

1

In [18]:
# show the exact mismatches in column var1: flag, value in main, value in reference
(
    df_matching_errors
    .where(col('var1_is_diff') == 1)
    .select('var1_is_diff', 'var1_main', 'var1_ref')
    .show()
)

+------------+---------+--------------+
|var1_is_diff|var1_main|      var1_ref|
+------------+---------+--------------+
|           1|   value1|       value19|
|           1|     null|value_not_null|
+------------+---------+--------------+



## SchemaManager

This class provides an interface for analyzing how many tables in a schema don't have underlying folders or any data.  
Then you can drop empty or broken tables from the selected schema.

In [19]:
schema_name = 'popular_schema' # our chosen schema for inspection
# start from a clean state: drop the schema together with its tables, then recreate it empty
spark.sql(f"drop database if exists {schema_name} cascade")
spark.sql(f'create database {schema_name}')
None  # the last expression is None, so the cell displays no output

In [20]:
# `df_src.write` returns a new DataFrameWriter on every access, while mode() and
# partitionBy() configure that writer in place and return the same object.
# Reusing one writer for all three tables would carry the partitionBy() of table1
# over to table2 and table3, so each table gets its own writer.

# table1: partitioned by date and group
(
    df_src.write
    .mode('overwrite')
    .partitionBy('dt_part', 'group_part')
    .saveAsTable(f'{schema_name}.table1')
)

# table2 and table3: plain, non-partitioned tables
df_src.write.mode('overwrite').saveAsTable(f'{schema_name}.table2')
df_src.write.mode('overwrite').saveAsTable(f'{schema_name}.table3')

We created 3 sample tables:

1. table1 has dir + data. It **won't** be deleted.  
2. table2 has only root dir and no data. It **will** be deleted.  
3. table3 doesn't have any dir and data. It **will** be deleted.

In [21]:
# Build the paths from `schema_name` so they always point at the schema created above
# (paths are relative to the notebook's working directory).
table2_path = f'./spark-warehouse/{schema_name}.db/table2'
table3_path = f'./spark-warehouse/{schema_name}.db/table3'

# table2: remove the data but leave an empty root directory behind
shutil.rmtree(table2_path, ignore_errors=True)
os.makedirs(table2_path)

# table3: remove the directory altogether
shutil.rmtree(table3_path, ignore_errors=True)

In [22]:
spark.sql(f"show tables in {schema_name}").show(10, False)

+--------------+---------+-----------+
|database      |tableName|isTemporary|
+--------------+---------+-----------+
|popular_schema|table1   |false      |
|popular_schema|table2   |false      |
|popular_schema|table3   |false      |
+--------------+---------+-----------+



In [23]:
# inspect the schema chosen in `schema_name` instead of repeating its name as a literal
popular_schema = SchemaManager(schema_name)

3 tables in popular_schema
run find_empty_tables() on instance to find empty tables in popular_schema


In [24]:
popular_schema.find_empty_tables()

2 going to be dropped out of 3 (66.67%)
Data about tables stored in attribute '.dict_of_tables':
1 - has data, 0 - doesn't and going to be deleted

run drop_empty_tables() on instance to drop empty tables in popular_schema


In [25]:
popular_schema.dict_of_tables

{'table1': 1, 'table2': 0, 'table3': 0}

In [26]:
popular_schema.drop_empty_tables()
# The "Failed to delete" ERROR in the log below is expected: the folder of table3 was
# removed by hand earlier in this notebook, while its entry in the Metastore was still
# there, so there is no directory left to delete when the table is dropped.

After dropping tables there are 1 tables in popular_schema


22/12/21 00:18:06 ERROR FileUtils: Failed to delete file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/popular_schema.db/table3


In [27]:
spark.sql(f"show tables in {schema_name}").show(10, False)

+--------------+---------+-----------+
|database      |tableName|isTemporary|
+--------------+---------+-----------+
|popular_schema|table1   |false      |
+--------------+---------+-----------+



## Extra

### function `union_all`

This function performs a union of any number of Spark DataFrames.  
Requirements:
1. all DFs must have the same columns
2. if `dfs` is a list, unpack it with `*dfs` 

In [28]:
# generating list of 3 DataFrames (5 row count each)
values = [
    ("x", "x"),
    ("x", "y"),
    ("x", None),
    (None, "x"),
    (None, None),
]
columns = ['val1', 'val2']

# The base rows are the same for every DataFrame, so they are built once outside the loop.
df_base = spark.createDataFrame(values, columns)

# Each pair is (is_joined_main, is_joined_ref): present on both sides,
# missing in the reference DF, missing in the main DF.
# The loop variables are deliberately not called val1 / val2, which are column names here.
list_dfs = []
for flag_main, flag_ref in ((1, 1), (1, None), (None, 1)):
    df_test = (
        df_base
        .withColumn('is_joined_main', F.lit(flag_main))
        .withColumn('is_joined_ref', F.lit(flag_ref))
    )
    list_dfs.append(df_test)

In [29]:
print('count of 1 table:', list_dfs[0].count()) # this is going to be 5 * 3 = 15 after union_all()

count of 1 table: 5


In [30]:
print(len(list_dfs)) # 3 DFs in the list
list_dfs

3


[DataFrame[val1: string, val2: string, is_joined_main: int, is_joined_ref: int],
 DataFrame[val1: string, val2: string, is_joined_main: int, is_joined_ref: null],
 DataFrame[val1: string, val2: string, is_joined_main: null, is_joined_ref: int]]

In [31]:
df_from_union = union_all(*list_dfs).cache()

# union_all(list_dfs[0], list_dfs[1], list_dfs[2]) # equivalent
print('count of table after 3 unions:', df_from_union.count())
df_from_union

count of table after 3 unions: 15


DataFrame[val1: string, val2: string, is_joined_main: int, is_joined_ref: int]

In [32]:
# This is exactly the condition used in the script for comparing tables.
# is_diff = 0 when the row is missing on either side (one of the join markers is NULL),
# when both values are NULL, or when the values are equal; otherwise is_diff = 1.
# A NULL compared with a non-NULL value matches none of the branches and ends up in
# `else 1` (see the rows x/null and null/x in the output below).
dummy1, dummy2,val1,val2='is_joined_main','is_joined_ref','val1','val2'
cond_diff = f"""case when
                ({dummy1} is null or {dummy2} is null) 
                or
                ({val1} is null and {val2} is null)
                or 
                ({val1} = {val2})
                then 0
                else 1
            end"""

# apply the condition to all 15 rows of the union and print them
(
    df_from_union
    .withColumn('is_diff', F.expr(cond_diff))
    .show(100)
)

+----+----+--------------+-------------+-------+
|val1|val2|is_joined_main|is_joined_ref|is_diff|
+----+----+--------------+-------------+-------+
|   x|   x|             1|            1|      0|
|   x|   y|             1|            1|      1|
|   x|null|             1|            1|      1|
|null|   x|             1|            1|      1|
|null|null|             1|            1|      0|
|   x|   x|             1|         null|      0|
|   x|   y|             1|         null|      0|
|   x|null|             1|         null|      0|
|null|   x|             1|         null|      0|
|null|null|             1|         null|      0|
|   x|   x|          null|            1|      0|
|   x|   y|          null|            1|      0|
|   x|null|          null|            1|      0|
|null|   x|          null|            1|      0|
|null|null|          null|            1|      0|
+----+----+--------------+-------------+-------+



### reading from Hive

1. directly from parquet files
2. using a Hive query

In [33]:
# Read one partition folder directly. The path is relative to the notebook's working
# directory, like the other warehouse paths in this notebook, instead of an absolute
# path that exists on a single machine only.
spark.read.parquet('./spark-warehouse/part_table_test1/dt_part=2022-12-15/*').count()

1

In [34]:
spark.sql("select count(1) from default.part_table_test1 where dt_part='2022-12-15'").show()

+--------+
|count(1)|
+--------+
|       1|
+--------+



### writing DataFrames to Hive

**TODO:** this section needs refining.

In [35]:
df_from_union_write = df_from_union.limit(2)

In [36]:
# 1
write_table(df_from_union_write, 'test_writing_1')

DF saved as default.test_writing_1


In [37]:
df_from_union_write.write.mode('overwrite').saveAsTable('default.test_writing_3')

In [38]:
write_table(df, 'hello_test3', partition_cols=['index', 'var1'])

DF saved as default.hello_test3


In [39]:
# boolean flags, consistent with the first read_table() call in this notebook
read_table('default.hello_test3', verbose=True, cnt_files=True)

root
 |-- pk1: string (nullable = true)
 |-- pk2: string (nullable = true)
 |-- var2: string (nullable = true)
 |-- dt_part: string (nullable = true)
 |-- group_part: string (nullable = true)
 |-- index: string (nullable = true)
 |-- var1: string (nullable = true)

partition columns: ['index', 'var1']
location: file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/hello_test3
6 parquet files at the location


DataFrame[pk1: string, pk2: string, var2: string, dt_part: string, group_part: string, index: string, var1: string]

## Modification of code

1. read as you like, use DFExtender to get stats
2. use all methods from PySpark as usual (beware that PySpark methods return a DataFrame object, not DFExtender object)

Check out official documentation!
1. [pyspark.sql.DataFrame methods](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.html)
2. [PySpark functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html)

In [40]:
df = read_table('default.table1_comp', alias='main')
df_main = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)

In [41]:
print(df_main.__class__)

<class 'hhop.DFExtender'>


In [42]:
# apply PySpark method for DFExtender object
df_main_filter = df_main.filter(col('pk1').isNotNull())

In [43]:
print(df_main_filter.__class__) # the type of an object returns to Spark DataFrame

<class 'pyspark.sql.dataframe.DataFrame'>


In [44]:
spark.stop()