In [1]:
import sys
sys.path.insert(0, '/Users/pyro/github/HiveHelper_on_PySpark/hhop') 
# for running .ipynb files anywhere outside of a current dir using the module hhop

from functools import reduce
from importlib import reload
import pandas as pd

from pyspark.sql.functions import col
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window as W

import shutil, os, time # working with FS
from glob import glob
from shutil import copy2
from pathlib import Path

custom_spark_params = {
    'app_name': 'custom_app_name123',
}
from pass_spark_config import write_spark_config
write_spark_config(custom_spark_params)


from hhop import DFExtender, SchemaManager, TablePartitionDescriber #main classes
from funs import read_table, write_table, write_read_table, union_all, deduplicate_df # useful functions
from spark_init import spark
from exceptions import HhopException
display(spark)

23/07/09 02:31:25 WARN Utils: Your hostname, Pavels-MacBook-Air.local resolves to a loopback address: 127.0.0.200; using 192.168.0.103 instead (on interface en0)
23/07/09 02:31:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/07/09 02:31:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Creating a synth table from csv

In [2]:
df_src = spark.read.csv('./synth_data/table1.csv', header=True, sep=';')

In [3]:
df_src = df_src.fillna('', subset=['var1'])

In [4]:
df_src.show()

+-----+----+----+------+--------+----------+----------+
|index| pk1| pk2|  var1|    var2|   dt_part|group_part|
+-----+----+----+------+--------+----------+----------+
|    1|key1|   1|value1|value2_1|2022-12-15|    group1|
|    2|key1|   1|value1|value2_1|2022-12-16|    group2|
|    3|key1|   2|value1|value2_1|2022-12-16|    group3|
|    4|key2|   2|      |value2_1|2022-12-17|    group1|
|    5|key2|   3|value1|value2_1|2022-12-18|    group2|
|    6|key2|   4|value1|    null|2022-12-19|    group3|
|    7|key2|null|value1|value2_1|2022-12-19|    group4|
|    8|null|   4|value1|value2_1|2022-12-20|    group3|
|    9|null|null|value1|value2_1|2022-12-20|    group7|
+-----+----+----+------+--------+----------+----------+



In [5]:
df_src.write.mode('overwrite').partitionBy('dt_part', 'group_part').saveAsTable('default.part_table_test1')
df_src.repartition(4).write.mode('overwrite').saveAsTable('default.nonpart_table_test1')

## Info about table

### Reading table from Hive

In [6]:
df = read_table('default.part_table_test1', verbose=True, cnt_files=True)

root
 |-- index: string (nullable = true)
 |-- pk1: string (nullable = true)
 |-- pk2: string (nullable = true)
 |-- var1: string (nullable = true)
 |-- var2: string (nullable = true)
 |-- dt_part: string (nullable = true)
 |-- group_part: string (nullable = true)

partition columns: ['dt_part', 'group_part']
Running command: hdfs dfs -ls -R file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/part_table_test1 | grep '.parquet' | wc -l
0 parquet files in the specified above location


You can use the following shell command to count the parquet files in any subdirectory

In [7]:
!hdfs dfs -ls -R file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/part_table_test1/dt_part=2022-12-19 | grep '.parquet' | wc -l

       0


**Whenever you get a DF from DFExtender do not forget to either**  
1. write to Hive using the custom function `write_table(df, table, ...)` (same as method 2, but with defaults)
2. write to Hive using native Spark methods: `df.write.mode('overwrite').saveAsTable('schema.table')`
3. or cache DF like `df = df.cache()`


Otherwise Spark will re-read the sources of this DF every time it is used, which can be very time-consuming.

### NULL checks

In [8]:
# DFExtender doesn't change DataFrame during initialization and returns it as is
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info()

You can access DF with PK duplicates in an attribute `.df_duplicates_pk`

Count all:                9
Unique PK count:          8
PK with duplicates:       1

Null values in columns - {'column': [count NULL, share NULL]}:
{'pk1': [2, 0.2222], 'pk2': [2, 0.2222], 'var1': [1, 0.1111], 'var2': [1, 0.1111]}
Use method `.get_df_with_null(List[str])` to get a df with specified NULL columns
PK column 'pk1' contains empty values, be careful!
PK column 'pk2' contains empty values, be careful!


In [9]:
# this method returns a DF sorted by count of nulls in selected columns in descending order
df_check_null = df_check.get_df_with_null(['var1', 'var2'])

In [10]:
df_check_null.show()

+-----+----+---+------+--------+----------+----------+---------+
|index| pk1|pk2|  var1|    var2|   dt_part|group_part|cnt_nulls|
+-----+----+---+------+--------+----------+----------+---------+
|    6|key2|  4|value1|    null|2022-12-19|    group3|        1|
|    4|key2|  2|      |value2_1|2022-12-17|    group1|        1|
+-----+----+---+------+--------+----------+----------+---------+



### Primary Key checks

In [11]:
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info()

You can access DF with PK duplicates in an attribute `.df_duplicates_pk`

Count all:                9
Unique PK count:          8
PK with duplicates:       1

Null values in columns - {'column': [count NULL, share NULL]}:
{'pk1': [2, 0.2222], 'pk2': [2, 0.2222], 'var1': [1, 0.1111], 'var2': [1, 0.1111]}
Use method `.get_df_with_null(List[str])` to get a df with specified NULL columns
PK column 'pk1' contains empty values, be careful!
PK column 'pk2' contains empty values, be careful!


In [12]:
df_check.df_duplicates_pk.show()

+-----+----+---+------+--------+----------+----------+------+
|index| pk1|pk2|  var1|    var2|   dt_part|group_part|cnt_pk|
+-----+----+---+------+--------+----------+----------+------+
|    1|key1|  1|value1|value2_1|2022-12-15|    group1|     2|
|    2|key1|  1|value1|value2_1|2022-12-16|    group2|     2|
+-----+----+---+------+--------+----------+----------+------+



## Comparing tables

Sometimes you need to compare two tables based on their primary keys.  
This method does exactly that. It
1. joins two DFs
2. calculates statistics from `DFExtender.get_info()`
3. prints statistics on joining the two tables and errors in non-PK attributes
4. stores a DF with errors in the attribute `.df_with_errors` for manual analysis

In [13]:
def write_synth_sample(name):
    """Load ``./synth_data/<name>.csv`` and save it as the table ``default.<name>``.

    The CSV is read with a header row and ``;`` as the separator. The target
    table is written in ``overwrite`` mode and partitioned by ``dt_part`` and
    ``group_part``.
    """
    sample_df = spark.read.csv(f'./synth_data/{name}.csv', header=True, sep=';')
    writer = sample_df.write.mode('overwrite').partitionBy('dt_part', 'group_part')
    writer.saveAsTable(f'default.{name}')
    
write_synth_sample('table1_comp')  
write_synth_sample('table2_comp')

df = read_table('default.table1_comp', alias='main')
df_ref = read_table('default.table2_comp', alias='ref')

In [14]:
# # For testing DFs without common columns outside of PK
# df=df.select(['pk1', 'pk2'])
# df_ref=df_ref.select(['pk1', 'pk2'])

Instance of DFExtender is the **main DF**,   
DF in arguments is the **reference DF**

In [15]:
df_main = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_main.compare_tables(df_ref)

Main DF
Count all:                6
Unique PK count:          6
PK with duplicates:       0

Reference DF
Count all:                6
Unique PK count:          6
PK with duplicates:       0

Errors in columns - {'column': [count is_error, share is_error]}
{'var1': [2, 0.4], 'group_part': [2, 0.4], 'var2': [1, 0.2]}

Count stats of matching main and reference tables:
not in main table:        1
not in reference table:   1
correct matching:         5

Use DF in attribute `.df_with_errors` for further analysis
You can find alternative order of columns in attr .columns_diff_reordered_all


**You can get results in native Python data types**

In [16]:
df_main.dict_cols_with_errors

{'var1': [2, 0.4], 'group_part': [2, 0.4], 'var2': [1, 0.2]}

In [17]:
df_main.matching_results

[1, 1, 5]

In [18]:
df_matching_errors = df_main.df_with_errors

In [19]:
# filter for rows that are "not in main table"
df_matching_errors.filter(col('is_joined_main').isNull())\
.count() 

1

In [20]:
(
    df_matching_errors
    .filter(col('is_joined_main').isNull())
    .select('pk1', 'pk2', 'is_joined_main', 'is_joined_ref', 'var1_main', 'var1_ref')
).show()

+----+---+--------------+-------------+---------+--------+
| pk1|pk2|is_joined_main|is_joined_ref|var1_main|var1_ref|
+----+---+--------------+-------------+---------+--------+
|key2|  5|          null|            1|     null|  value1|
+----+---+--------------+-------------+---------+--------+



In [21]:
# filter for rows that are "not in reference table"
df_matching_errors.filter(col('is_joined_ref').isNull())\
.count()

1

In [22]:
# filter for finding an exact difference in column
df_matching_errors.filter(col('var1_is_diff') == 1).select('var1_is_diff', 'var1_main', 'var1_ref').show()

+------------+---------+--------------+
|var1_is_diff|var1_main|      var1_ref|
+------------+---------+--------------+
|           1|   value1|       value19|
|           1|     null|value_not_null|
+------------+---------+--------------+



### Alternative order of columns in `df_with_errors`

In [23]:
alt_order_cols = df_main.columns_diff_reordered_all

In [24]:
(
    df_matching_errors
    .select(*alt_order_cols)
    .filter(col('var1_is_diff') == 1)
).show()

+----+---+--------------+-------------+------------+-----------+---------------+----------+---------+-------------+---------+--------+------------+---------+--------------+------------+---------------+--------------+------------------+
| pk1|pk2|is_joined_main|is_joined_ref|dt_part_main|dt_part_ref|dt_part_is_diff|index_main|index_ref|index_is_diff|var2_main|var2_ref|var2_is_diff|var1_main|      var1_ref|var1_is_diff|group_part_main|group_part_ref|group_part_is_diff|
+----+---+--------------+-------------+------------+-----------+---------------+----------+---------+-------------+---------+--------+------------+---------+--------------+------------+---------------+--------------+------------------+
|key1|  1|             1|            1|  2022-12-15| 2022-12-15|              0|         1|        1|            0| value2_1|value2_1|           0|   value1|       value19|           1|         group2|        group7|                 1|
|key2|  1|             1|            1|  2022-12-17| 202

## TablePartitionDescriber
This class helps to get the partitions of a partitioned Hive table
in a readable and ready-to-use format


How the **default format** looks:

In [25]:
table_to_analyze_partitions = 'default.part_table_test1'

In [26]:
spark.sql(f"show partitions {table_to_analyze_partitions}").show(10, False)

+------------------------------------+
|partition                           |
+------------------------------------+
|dt_part=2022-12-15/group_part=group1|
|dt_part=2022-12-16/group_part=group2|
|dt_part=2022-12-16/group_part=group3|
|dt_part=2022-12-17/group_part=group1|
|dt_part=2022-12-18/group_part=group2|
|dt_part=2022-12-19/group_part=group3|
|dt_part=2022-12-19/group_part=group4|
|dt_part=2022-12-20/group_part=group3|
|dt_part=2022-12-20/group_part=group7|
+------------------------------------+



How you can get partitions from **this class**:

In [27]:
table_partitions = TablePartitionDescriber('default.part_table_test1')

In [28]:
table_partitions_got = table_partitions.get_partitions_parsed()
table_partitions_got.show(100, False)

+----------+----------+
|dt_part   |group_part|
+----------+----------+
|2022-12-15|group1    |
|2022-12-16|group2    |
|2022-12-16|group3    |
|2022-12-17|group1    |
|2022-12-18|group2    |
|2022-12-19|group3    |
|2022-12-19|group4    |
|2022-12-20|group3    |
|2022-12-20|group7    |
+----------+----------+



You can find the max value among the partitions for a particular column

In [29]:
max_dt = table_partitions.get_max_value_from_partitions('dt_part')
max_dt

'2022-12-20'

You can also apply a prefilter on the other partition columns if you need to. It is just a shortcut.

In [30]:
prefilter = col('group_part') == 'group1'
max_dt_group = table_partitions.get_max_value_from_partitions('dt_part', prefilter=prefilter)
max_dt_group

'2022-12-17'

## SchemaManager

This class provides an interface for analyzing how many tables in a schema don't have underlying folders or any data.  
Then you can drop empty or broken tables from the selected schema.

In [31]:
schema_name = 'popular_schema' # our chosen schema for inspection
spark.sql(f"drop database if exists {schema_name} cascade")
spark.sql(f'create database {schema_name}')
None

In [32]:
# Build a fresh writer for every table. DataFrameWriter.mode()/partitionBy()
# configure the writer object they are called on and return that same object,
# so reusing a single writer after .partitionBy(...) would also partition
# table2 and table3 by dt_part/group_part.
(
    df_src.write.mode('overwrite')
    .partitionBy('dt_part', 'group_part')
    .saveAsTable(f'{schema_name}.table1')
)
for plain_table in ('table2', 'table3'):
    df_src.write.mode('overwrite').saveAsTable(f'{schema_name}.{plain_table}')

# Drop-then-create keeps the cell re-runnable when the view already exists.
spark.sql(f"drop view if exists {schema_name}.my_view")
spark.sql(f"create view {schema_name}.my_view as select * from {schema_name}.table1")
None  # end on None so the DataFrame repr of the last spark.sql call is not displayed

We created 3 sample tables and 1 view:

1. table1 has dir + data. It **won't** be deleted.  
2. table2 has only root dir and no data. It **will** be deleted.  
3. table3 doesn't have any dir and data. It **will** be deleted.
4. my_view is a **view** and it is going to be **ignored**.

In [33]:
table2_path, table3_path = (
    './spark-warehouse/popular_schema.db/table2',
    './spark-warehouse/popular_schema.db/table3',
)

# Wipe both table directories; a missing directory is not an error.
for stale_dir in (table2_path, table3_path):
    shutil.rmtree(stale_dir, ignore_errors=True)

# table2 keeps an empty root directory, table3 is left with no directory at all.
os.makedirs(table2_path, exist_ok=True)

In [34]:
spark.sql(f"show tables in {schema_name}").show(10, False)

+--------------+---------+-----------+
|database      |tableName|isTemporary|
+--------------+---------+-----------+
|popular_schema|my_view  |false      |
|popular_schema|table1   |false      |
|popular_schema|table2   |false      |
|popular_schema|table3   |false      |
+--------------+---------+-----------+



In [35]:
popular_schema = SchemaManager('popular_schema')

3 tables in popular_schema
run find_empty_tables() on instance to find empty tables in popular_schema


In [36]:
popular_schema.find_empty_tables()

2 tables going to be dropped out of 3 (66.67%)Data about tables is stored in an attribute '.dict_of_tables':
1 - has data, 0 - doesn't and going to be deleted

run drop_empty_tables() on instance to drop empty tables in popular_schema


In [37]:
popular_schema.dict_of_tables

{'table2': 0, 'table3': 0, 'table1': 1}

In [38]:
popular_schema.drop_empty_tables()
# errors are OK, because sometimes you need to remove folders, but data in Metastore stays the same

23/07/09 02:31:49 ERROR FileUtils: Failed to delete file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/popular_schema.db/table3


After dropping tables there are 1 tables in popular_schema


In [39]:
spark.sql(f"show tables in {schema_name}").show(10, False)

+--------------+---------+-----------+
|database      |tableName|isTemporary|
+--------------+---------+-----------+
|popular_schema|my_view  |false      |
|popular_schema|table1   |false      |
+--------------+---------+-----------+



## Extra

### function `union_all`

This function performs a union of any number of Spark DataFrames.  
Requirements:
1. all DFs must have the same columns
2. if `dfs` is a list, unpack it like `*dfs`

In [40]:
# Build 3 DataFrames with the same 5 rows each; they differ only in the
# constant join-flag columns `is_joined_main` / `is_joined_ref`.
values = [
    ("x", "x"),
    ("x", "y"),
    ("x", None),
    (None, "x"),
    (None, None),
]
columns = ['val1', 'val2']
join_flag_pairs = ((1, 1), (1, None), (None, 1))

list_dfs = [
    spark.createDataFrame(values, columns)
    .withColumn('is_joined_main', F.lit(flag_main))
    .withColumn('is_joined_ref', F.lit(flag_ref))
    for flag_main, flag_ref in join_flag_pairs
]

In [41]:
print('count of 1 table:', list_dfs[0].count()) # this is going to be 5 * 3 = 15 after union_all()

count of 1 table: 5


In [42]:
print(len(list_dfs)) # 3 DFs in the list
list_dfs

3


[DataFrame[val1: string, val2: string, is_joined_main: int, is_joined_ref: int],
 DataFrame[val1: string, val2: string, is_joined_main: int, is_joined_ref: null],
 DataFrame[val1: string, val2: string, is_joined_main: null, is_joined_ref: int]]

In [43]:
df_from_union = union_all(*list_dfs).cache()

# union_all(list_dfs[0], list_dfs[1], list_dfs[2]) # equivalent
print('count of table after 3 unions:', df_from_union.count())
df_from_union

count of table after 3 unions: 15


DataFrame[val1: string, val2: string, is_joined_main: int, is_joined_ref: int]

In [44]:
# this is exactly a filter in the script for comparing tables
# (author's note; the comparison script itself is not part of this notebook)
#
# Column names substituted into the SQL expression below:
#   dummy1 / dummy2 - join-flag columns (null when the row is missing on that side)
#   val1 / val2     - the two value columns being compared
dummy1, dummy2,val1,val2='is_joined_main','is_joined_ref','val1','val2'
# SQL CASE expression: yields 0 when either join flag is null, when both
# values are null, or when the values are equal; yields 1 otherwise.
cond_diff = f"""case when
                ({dummy1} is null or {dummy2} is null) 
                or
                ({val1} is null and {val2} is null)
                or 
                ({val1} = {val2})
                then 0
                else 1
            end"""

# Evaluate the expression for every row of the unioned DF and print up to 100 rows.
(
    df_from_union
    .withColumn('is_diff', F.expr(cond_diff))
    .show(100)
)

+----+----+--------------+-------------+-------+
|val1|val2|is_joined_main|is_joined_ref|is_diff|
+----+----+--------------+-------------+-------+
|   x|   x|             1|            1|      0|
|   x|   y|             1|            1|      1|
|   x|null|             1|            1|      1|
|null|   x|             1|            1|      1|
|null|null|             1|            1|      0|
|   x|   x|             1|         null|      0|
|   x|   y|             1|         null|      0|
|   x|null|             1|         null|      0|
|null|   x|             1|         null|      0|
|null|null|             1|         null|      0|
|   x|   x|          null|            1|      0|
|   x|   y|          null|            1|      0|
|   x|null|          null|            1|      0|
|null|   x|          null|            1|      0|
|null|null|          null|            1|      0|
+----+----+--------------+-------------+-------+



### reading from Hive

1. directly from parquet files
2. using a Hive query

In [45]:
spark.read.parquet('/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/part_table_test1/dt_part=2022-12-15/*').count()

1

In [46]:
df_sql = spark.sql("select count(1) as cnt from default.part_table_test1 where dt_part='2022-12-15'")
df_sql.show()

+---+
|cnt|
+---+
|  1|
+---+



### writing DataFrames to Hive

In [47]:
# 1
df.coalesce(1).write.partitionBy(['index', 'var1']).mode('overwrite').saveAsTable('default.test_writing_1')

In [48]:
# 2 (same as 1)
# but everything is optional except DF and name of the table
write_table(
    df.coalesce(1), 'test_writing_2', 
    schema='default', 
    partition_cols=['index', 'var1'], 
    mode='overwrite', 
    format_files='parquet'
)

DF saved as default.test_writing_2


In [49]:
read_table('default.test_writing_1', verbose=1, cnt_files=True)
None

root
 |-- pk1: string (nullable = true)
 |-- pk2: string (nullable = true)
 |-- var2: string (nullable = true)
 |-- dt_part: string (nullable = true)
 |-- group_part: string (nullable = true)
 |-- index: string (nullable = true)
 |-- var1: string (nullable = true)

partition columns: ['index', 'var1']
Running command: hdfs dfs -ls -R file:/Users/pyro/github/HiveHelper_on_PySpark/spark-warehouse/test_writing_1 | grep '.parquet' | wc -l
0 parquet files in the specified above location


### Deduplication of DF using `deduplicate_df`

In [50]:
df.show()

+-----+----+---+------+--------+----------+----------+
|index| pk1|pk2|  var1|    var2|   dt_part|group_part|
+-----+----+---+------+--------+----------+----------+
|    1|key1|  1|value1|value2_1|2022-12-15|    group2|
|    2|key1|  2|value1|value2_1|2022-12-16|    group2|
|    3|key1|  3|value1|value2_1|2022-12-16|    group3|
|    5|key2|  2|value1|value2_1|2022-12-18|    group2|
|    4|key2|  1|  null|value2_1|2022-12-17|    group1|
|    6|key2|  3|value1|    null|2022-12-20|    group3|
+-----+----+---+------+--------+----------+----------+



In [51]:
df_dedup = deduplicate_df(df, pk=['pk1'], order_by_cols=[col('dt_part').desc(), col('group_part')])

In [52]:
df_dedup.show()

+-----+----+---+------+--------+----------+----------+
|index| pk1|pk2|  var1|    var2|   dt_part|group_part|
+-----+----+---+------+--------+----------+----------+
|    2|key1|  2|value1|value2_1|2022-12-16|    group2|
|    6|key2|  3|value1|    null|2022-12-20|    group3|
+-----+----+---+------+--------+----------+----------+



### Making checkpoints with `write_read_table`

In [53]:
spark.sql("create database if not exists test_checkpoint")

DataFrame[]

In [54]:
df_dedup_cp = write_read_table(df_dedup, 'table_name12', schema='test_checkpoint', verbose=1)
df_dedup_cp.count()

DF saved as test_checkpoint.table_name12


2

## Modification of code

1. read as you like, use DFExtender to get stats
2. use all methods from PySpark as usual (beware that PySpark methods return a DataFrame object, not DFExtender object)

Check out official documentation!
1. [pyspark.sql.DataFrame methods](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.html)
2. [PySpark functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html)

In [55]:
df = read_table('default.table1_comp', alias='main')
df_main = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)

In [56]:
print(df_main.__class__)

<class 'hhop.DFExtender'>


In [57]:
# apply PySpark method for DFExtender object
df_main_filter = df_main.filter(col('pk1').isNotNull())

In [58]:
print(df_main_filter.__class__) # the type of an object returns to Spark DataFrame

<class 'pyspark.sql.dataframe.DataFrame'>


### Generating txt files for sending by email

In [59]:
def export_py_as_txt(src_dir='./hhop', dst_dir='./hhop/txt'):
    """Copy every ``*.py`` file in ``src_dir`` into ``dst_dir`` with a ``.txt`` suffix.

    Each file is copied straight to its final ``<name>.txt`` path. This avoids
    a separate rename step (``Path.rename`` raises on Windows when the target
    already exists, e.g. on a re-run) and never leaves a temporary ``.py`` copy
    in ``dst_dir``. File names are taken with ``Path(...).name`` instead of
    splitting on ``'/'``, so paths using the OS-specific separator are handled.

    Parameters
    ----------
    src_dir : str or path-like
        Directory that is searched (non-recursively) for ``*.py`` files.
    dst_dir : str or path-like
        Output directory; created if it does not exist.

    Returns
    -------
    list of pathlib.Path
        Paths of the written ``.txt`` files, in sorted source-name order.
    """
    out_dir = Path(dst_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    written = []
    for py_file in sorted(glob(os.path.join(src_dir, '*.py'))):
        target = out_dir / Path(py_file).with_suffix('.txt').name
        copy2(py_file, target)  # copy2 overwrites an existing target
        written.append(target)
    return written


txt_files = export_py_as_txt()

## Some Tests

Another option would be writing tests inside modules using mocks

### Functions

In [60]:
# Negative test: writing with a partition column that is absent from the DF
# is expected to raise HhopException.
try:
    write_table(df, table='table_name1', schema='test_hhop', partition_cols=['some_column_not_existed'])
except HhopException as e:
    print('error is catched successfully\n', e, sep='')
else:
    # Without this branch the cell passes silently when no exception is raised.
    raise AssertionError('write_table did not raise HhopException for an unknown partition column')

error is catched successfully
{'some_column_not_existed'} are not in columns of provided DF


### DFExtender

In [61]:
# Negative test: a PK column that is absent from the DF is expected to make
# DFExtender raise HhopException.
try:
    DFExtender(df, pk=['some_column_not_existed'])
except HhopException as e:
    print('error is catched successfully\n', e, sep='')
else:
    # Without this branch the cell passes silently when no exception is raised.
    raise AssertionError('DFExtender did not raise HhopException for an unknown PK column')

error is catched successfully
columns {'some_column_not_existed'} are not present in provided columns: ['index', 'pk1', 'pk2', 'var1', 'var2', 'dt_part', 'group_part']


In [62]:
DFExtender(df, pk=df.columns).get_info()

Count all:                6
Unique PK count:          6
PK with duplicates:       0

Null values in columns - {'column': [count NULL, share NULL]}:
{'var1': [1, 0.1667], 'var2': [1, 0.1667]}


In [63]:
df_check1 = DFExtender(df, verbose=True)

In [64]:
df_check1.get_info()


Null values in columns - {'column': [count NULL, share NULL]}:
{'var1': [1, 0.1667], 'var2': [1, 0.1667]}
Use method `.get_df_with_null(List[str])` to get a df with specified NULL columns


In [65]:
df_check1.get_df_with_null([]).show()

No NULL values found in provided [], using all: dict_keys(['var1', 'var2'])
+-----+----+---+------+--------+----------+----------+---------+
|index| pk1|pk2|  var1|    var2|   dt_part|group_part|cnt_nulls|
+-----+----+---+------+--------+----------+----------+---------+
|    4|key2|  1|  null|value2_1|2022-12-17|    group1|        1|
|    6|key2|  3|value1|    null|2022-12-20|    group3|        1|
+-----+----+---+------+--------+----------+----------+---------+



In [66]:
# Negative test: asking for NULL rows in a column that is absent from the DF
# is expected to raise HhopException.
try:
    df_check1.get_df_with_null(['dt_part_unknown']).show()
except HhopException as e:
    print('error is catched successfully\n', e, sep='')
else:
    # Without this branch the cell passes silently when no exception is raised.
    raise AssertionError('get_df_with_null did not raise HhopException for an unknown column')

error is catched successfully
columns {'dt_part_unknown'} are not present in provided columns: ['index', 'pk1', 'pk2', 'var1', 'var2', 'dt_part', 'group_part']


`get_info` method

In [67]:
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info()

Count all:                6
Unique PK count:          6
PK with duplicates:       0

Null values in columns - {'column': [count NULL, share NULL]}:
{'var1': [1, 0.1667], 'var2': [1, 0.1667]}
Use method `.get_df_with_null(List[str])` to get a df with specified NULL columns


In [68]:
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info(pk_stats=False)


Null values in columns - {'column': [count NULL, share NULL]}:
{'var1': [1, 0.1667], 'var2': [1, 0.1667]}
Use method `.get_df_with_null(List[str])` to get a df with specified NULL columns


In [69]:
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info(null_stats=False)

Count all:                6
Unique PK count:          6
PK with duplicates:       0


In [70]:
df_check = DFExtender(df, pk=['pk1', 'pk2'], verbose=True)
df_check.get_info(null_stats=False, pk_stats=False)

Compare tables

In [71]:
df2 = df.drop('pk2').withColumn('pk2', F.lit('unknown_value'))

In [72]:
df_comp1 = DFExtender(df, pk=['pk1', 'pk2'])

In [73]:
df_comp1.compare_tables(df2)

Main DF
Count all:                6
Unique PK count:          6
PK with duplicates:       0

Reference DF
Count all:                6
Unique PK count:          2
PK with duplicates:       2

There are no errors in non PK columns

Count stats of matching main and reference tables:
not in main table:        6
not in reference table:   6
correct matching:         0


In [74]:
df_comp2 = DFExtender(df, pk=df.columns)

In [75]:
df_comp2.compare_tables(df2)

Main DF
Count all:                6
Unique PK count:          6
PK with duplicates:       0

Reference DF
Count all:                6
Unique PK count:          6
PK with duplicates:       0

There are no common columns outside of PK

Count stats of matching main and reference tables:
not in main table:        6
not in reference table:   6
correct matching:         0


In [76]:
# Two small fixtures with identical keys and `var2`; only the float column
# `var1` differs slightly between them (the null in `key4` is the same in both).
dataDictionary1 = dict(
    pk=['key1', 'key2', 'key3', 'key4'],
    var1=[0.035, 1.1368, 0.41, None],
    var2=[1, 2, 3, 4],
)

dataDictionary2 = dict(
    pk=['key1', 'key2', 'key3', 'key4'],
    var1=[0.03, 1.1361, 0.401, None],
    var2=[1, 2, 3, 4],
)

# Go through pandas to build the Spark DataFrames.
pdf1_round = pd.DataFrame(dataDictionary1)
pdf2_round = pd.DataFrame(dataDictionary2)
df1_round = spark.createDataFrame(pdf1_round)
df2_round = spark.createDataFrame(pdf2_round)

In [77]:
df1_round_check = DFExtender(df1_round, pk=['pk'], verbose=True)

In [78]:
df1_round_check.compare_tables(df2_round)

Main DF
Count all:                4
Unique PK count:          4
PK with duplicates:       0

Reference DF
Count all:                4
Unique PK count:          4
PK with duplicates:       0

Errors in columns - {'column': [count is_error, share is_error]}
{'var1': [2, 0.5]}

Count stats of matching main and reference tables:
not in main table:        0
not in reference table:   0
correct matching:         4

Use DF in attribute `.df_with_errors` for further analysis
You can find alternative order of columns in attr .columns_diff_reordered_all


In [79]:
df1_round_check.df_with_errors.filter('var1_is_diff = 1').show()

+----+--------------+-------------+---------+---------+--------+--------+------------+------------+----------+
|  pk|is_joined_main|is_joined_ref|var2_main|var1_main|var2_ref|var1_ref|var2_is_diff|var1_is_diff|sum_errors|
+----+--------------+-------------+---------+---------+--------+--------+------------+------------+----------+
|key3|             1|            1|        3|     0.41|       3|     0.4|           0|           1|         1|
|key1|             1|            1|        1|     0.04|       1|    0.03|           0|           1|         1|
+----+--------------+-------------+---------+---------+--------+--------+------------+------------+----------+



In [80]:
alternative_order_columns = df1_round_check.columns_diff_reordered_all
df1_round_check.df_with_errors[alternative_order_columns].filter('var1_is_diff = 1').show()

+----+--------------+-------------+---------+--------+------------+---------+--------+------------+
|  pk|is_joined_main|is_joined_ref|var2_main|var2_ref|var2_is_diff|var1_main|var1_ref|var1_is_diff|
+----+--------------+-------------+---------+--------+------------+---------+--------+------------+
|key3|             1|            1|        3|       3|           0|     0.41|     0.4|           1|
|key1|             1|            1|        1|       1|           0|     0.04|    0.03|           1|
+----+--------------+-------------+---------+--------+------------+---------+--------+------------+



In [81]:
spark.stop()