In [31]:
from hhop import DFExtender, SchemaManager, spark, col, F
import shutil, os, time # working with FS

from funs import read_table, write_table_through_view # useful functions
from functools import reduce

Возможно, пригодятся 
1. .config("spark.hadoop.hive.exec.dynamic.partition", "true") \
2. .config("spark.hadoop.hive.exec.dynamic.partition.mode", "nonstrict") \

## Synth table

In [2]:
df_src = spark.read.csv('./synth_data/table1.csv', header=True, sep=';')

In [3]:
df_src.show()

+-----+----+----+------+--------+----------+----------+
|index| pk1| pk2|  var1|    var2|   dt_part|group_part|
+-----+----+----+------+--------+----------+----------+
|    1|key1|   1|value1|value2_1|2022-12-15|    group1|
|    2|key1|   1|value1|value2_1|2022-12-16|    group2|
|    3|key1|   2|value1|value2_1|2022-12-16|    group3|
|    4|key2|   2|  null|value2_1|2022-12-17|    group1|
|    5|key2|   3|value1|value2_1|2022-12-18|    group2|
|    6|key2|   4|value1|    null|2022-12-19|    group3|
|    7|key2|null|value1|value2_1|2022-12-19|    group4|
|    8|null|   4|value1|value2_1|2022-12-20|    group3|
|    9|null|null|value1|value2_1|2022-12-20|    group7|
+-----+----+----+------+--------+----------+----------+



In [4]:
df_src.write.mode('overwrite').partitionBy('dt_part', 'group_part').saveAsTable('default.part_table_test1')
df_src.write.mode('overwrite').saveAsTable('default.nonpart_table_test1')

                                                                                

## Info about table

### Reading table from Hive

In [5]:
df_hive = read_table('default.part_table_test1', verbose=True)

root
 |-- index: string (nullable = true)
 |-- pk1: string (nullable = true)
 |-- pk2: string (nullable = true)
 |-- var1: string (nullable = true)
 |-- var2: string (nullable = true)
 |-- dt_part: string (nullable = true)
 |-- group_part: string (nullable = true)

location: file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/part_table_test1
partition columns: ['dt_part', 'group_part']


In [6]:
df = DFExtender(df_hive, pk=['pk1', 'pk2'], verbose=True)

### sanity checks: PK, NULL

In [7]:
df_filled = df_hive.fillna({'pk1': 'default', 'pk2': 'default'})

In [8]:
df_filled.show()

+-----+-------+-------+------+--------+----------+----------+
|index|    pk1|    pk2|  var1|    var2|   dt_part|group_part|
+-----+-------+-------+------+--------+----------+----------+
|    1|   key1|      1|value1|value2_1|2022-12-15|    group1|
|    2|   key1|      1|value1|value2_1|2022-12-16|    group2|
|    3|   key1|      2|value1|value2_1|2022-12-16|    group3|
|    5|   key2|      3|value1|value2_1|2022-12-18|    group2|
|    7|   key2|default|value1|value2_1|2022-12-19|    group4|
|    8|default|      4|value1|value2_1|2022-12-20|    group3|
|    4|   key2|      2|  null|value2_1|2022-12-17|    group1|
|    9|default|default|value1|value2_1|2022-12-20|    group7|
|    6|   key2|      4|value1|    null|2022-12-19|    group3|
+-----+-------+-------+------+--------+----------+----------+



In [9]:
DFExtender(df_filled, ['pk1', 'pk2'], verbose=True).getInfo()

Count all:           9
Unique PK count:     8
PK with duplicates:  1

Null values in columns - {'column': [count NULL, share NULL]}:
{'var1': [1, 0.1111], 'var2': [1, 0.1111]}


In [10]:
DFExtender(df_hive, ['pk1', 'pk2'], verbose=True).getInfo()

Count all:           9
PK column 'pk1' contains empty values, be careful!
PK column 'pk2' contains empty values, be careful!
Unique PK count:     8
PK with duplicates:  1

Null values in columns - {'column': [count NULL, share NULL]}:
{'pk1': [2, 0.2222], 'pk2': [2, 0.2222], 'var1': [1, 0.1111], 'var2': [1, 0.1111]}


In [11]:
df.groupBy(['pk1', 'pk2']).count().show()

+----+----+-----+
| pk1| pk2|count|
+----+----+-----+
|null|null|    1|
|key1|   1|    2|
|key2|   2|    1|
|key2|   3|    1|
|key2|null|    1|
|key1|   2|    1|
|null|   4|    1|
|key2|   4|    1|
+----+----+-----+



## Comparing tables

In [12]:
df_ref_csv = spark.read.csv('./synth_data/table2.csv', header=True, sep=';')
(
    df_ref_csv.write.mode('overwrite')
    .partitionBy('dt_part', 'group_part')
    .saveAsTable('default.part_table_test2')
)

df = read_table('default.nonpart_table_test1', alias='main')
df_ref = read_table('default.part_table_test2', alias='ref')

In [13]:
df

DataFrame[index: string, pk1: string, pk2: string, var1: string, var2: string, dt_part: string, group_part: string]

In [14]:
df_ref

DataFrame[index: string, pk1: string, pk2: string, var1: string, var2: string, dt_part: string, group_part: string]

In [32]:
def add_column_is_diff(df, col):
    return df.withColumn(col+'_is_diff', F.when(df1[col] != df2[col], 1).otherwise(0))

In [43]:
df1_cols = set(df.columns)
df2_cols = set(df_ref.columns)
df1 = df.withColumn('hhop_const_value_column', F.lit(1))
df2 = df_ref.withColumn('hhop_const_value_column', F.lit(1))
key = ['pk1', 'pk2']

df_temp = (
    df1
    .join(df2, on=key, how='full')
)

cols = df1_cols - set(key)
df_temp = reduce(add_column_is_diff, cols, df_temp)
df_temp.show(100, 5)

+----+----+-----+-----+-----+-------+----------+-----------------------+-----+-----+-----+-------+----------+-----------------------+------------+------------+---------------+------------------+-------------+
| pk1| pk2|index| var1| var2|dt_part|group_part|hhop_const_value_column|index| var1| var2|dt_part|group_part|hhop_const_value_column|var1_is_diff|var2_is_diff|dt_part_is_diff|group_part_is_diff|index_is_diff|
+----+----+-----+-----+-----+-------+----------+-----------------------+-----+-----+-----+-------+----------+-----------------------+------------+------------+---------------+------------------+-------------+
|key1|  11| null| null| null|   null|      null|                   null|    2|va...|va...|  20...|     gr...|                      1|           0|           0|              0|                 0|            0|
|key5|   5| null| null| null|   null|      null|                   null|    9|va...|va...|  20...|     gr...|                      1|           0|           0|     

In [42]:
df1_cols - set(key)

{'dt_part', 'group_part', 'index', 'var1', 'var2'}

## SchemaManager

### dropping empty tables

In [12]:
schema_name = 'popular_schema'
spark.sql(f"drop database if exists {schema_name} cascade")
spark.sql(f'create database {schema_name}')

DataFrame[]

In [13]:
df_src_write = df_src.write.mode('overwrite')
df_src_write.partitionBy('dt_part', 'group_part').saveAsTable(f'{schema_name}.table1')
df_src_write.saveAsTable(f'{schema_name}.table2')
df_src_write.saveAsTable(f'{schema_name}.table3')

table1 has dir + data   
table2 has only root dir    
table3 doesn't have any dir and data

In [14]:
shutil.rmtree('./spark-warehouse/popular_schema.db/table2', ignore_errors=True)
os.makedirs('./spark-warehouse/popular_schema.db/table2')
shutil.rmtree('./spark-warehouse/popular_schema.db/table3', ignore_errors=True)

In [15]:
spark.sql(f"show tables in {schema_name}").show(10, False)

+--------------+---------+-----------+
|database      |tableName|isTemporary|
+--------------+---------+-----------+
|popular_schema|table1   |false      |
|popular_schema|table2   |false      |
|popular_schema|table3   |false      |
+--------------+---------+-----------+



In [16]:
popular_schema = SchemaManager('popular_schema')

[Stage 70:>                                                         (0 + 3) / 3]

3 tables in popular_schema
run drop_empty_tables() to drop empty tables in popular_schema


                                                                                

In [17]:
popular_schema.drop_empty_tables()

2 going to be dropped out of 3 (66.67%). Schema: popular_schema
After dropping tables there are 1 tables in popular_schema


22/12/16 16:57:54 ERROR FileUtils: Failed to delete file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/popular_schema.db/table3


In [19]:
spark.sql(f"show tables in {schema_name}").show(10, False)

+--------------+---------+-----------+
|database      |tableName|isTemporary|
+--------------+---------+-----------+
|popular_schema|table1   |false      |
+--------------+---------+-----------+



## Extra

## Modification

1. read as you like, change class to DFExtender to get stats
2. use all methods from PySpark as usual