In [11]:
# Set up the Spark session shared by every cell in this notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("PySpark Data Audit example")
    # NOTE(review): this looks like a documentation placeholder option —
    # confirm whether it can be dropped.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Load the test dataset: comma-separated, UTF-8, first line is the header,
# and column types are inferred from the file contents.
data = spark.read.csv(path='Heart.csv',
                      sep=',', encoding='UTF-8', comment=None,
                      header=True, inferSchema=True)

# Keep roughly half of the rows (fraction is a per-row probability, so the
# exact count can vary). A fixed seed makes the draw repeatable, so
# "Restart Kernel -> Run All" selects the same subset on every run.
data = data.sample(withReplacement=False, fraction=0.5, seed=42)

In [13]:
# Preview the first five rows of the sampled dataset.
data.show(n=5)

+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
|Age|Sex|   ChestPain|RestBP|Chol|Fbs|RestECG|MaxHR|ExAng|Oldpeak|Slope| Ca|      Thal|AHD|
+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
| 67|  1|asymptomatic|   160| 286|  0|      2|  108|    1|    1.5|    2|  3|    normal|Yes|
| 67|  1|asymptomatic|   120| 229|  0|      2|  129|    1|    2.6|    2|  2|reversable|Yes|
| 37|  1|  nonanginal|   130| 250|  0|      0|  187|    0|    3.5|    3|  0|    normal| No|
| 56|  1|  nontypical|   120| 236|  0|      0|  178|    0|    0.8|    1|  0|    normal| No|
| 63|  1|asymptomatic|   130| 254|  0|      2|  147|    0|    1.4|    2|  1|reversable|Yes|
+---+---+------------+------+----+---+-------+-----+-----+-------+-----+---+----------+---+
only showing top 5 rows



In [58]:
# Small hand-written frame for the audit examples: five people, six
# attributes, no missing values.
test = spark.createDataFrame(
    data=[
        ('Joe', 67, 'F', 70000, 'asymptomatic', 286.1),
        ('Henry', 67, 'M', 80000, 'asymptomatic', 229.2),
        ('Sam', 37, 'F', 60000, 'nonanginal', 250.3),
        ('Max', 56, 'M', 90000, 'nontypical', 236.4),
        ('Mat', 56, 'F', 90000, 'asymptomatic', 254.5),
    ],
    schema=['Name', 'Age', 'Sex', 'Sallary', 'ChestPain', 'Chol'],
)

In [59]:
import pyspark.sql.functions as F

# Rebuild the toy frame with an extra creation-date column. The dates are
# supplied as plain strings and then cast so that the column carries a
# timestamp dtype.
test = spark.createDataFrame(
    data=[
        ('Joe', 67, 'F', 70000, 'asymptomatic', 286.1, '2019-6-28'),
        ('Henry', 67, 'M', 80000, 'asymptomatic', 229.2, '2019-6-29'),
        ('Sam', 37, 'F', 60000, 'nonanginal', 250.3, '2019-6-30'),
        ('Max', 56, 'M', 90000, 'nontypical', 236.4, '2019-5-28'),
        ('Mat', 56, 'F', 90000, 'asymptomatic', 254.5, '2019-4-28'),
    ],
    schema=['Name', 'Age', 'Sex', 'Sallary', 'ChestPain', 'Chol', 'CreatDate'],
).withColumn('CreatDate', F.col('CreatDate').cast('timestamp'))

In [60]:
# Display the toy frame. show() is called with its default arguments, so long
# cell values are abbreviated (see the CreatDate column in the output).
test.show()

+-----+---+---+-------+------------+-----+--------------------+
| Name|Age|Sex|Sallary|   ChestPain| Chol|           CreatDate|
+-----+---+---+-------+------------+-----+--------------------+
|  Joe| 67|  F|  70000|asymptomatic|286.1|2019-06-28 00:00:...|
|Henry| 67|  M|  80000|asymptomatic|229.2|2019-06-29 00:00:...|
|  Sam| 37|  F|  60000|  nonanginal|250.3|2019-06-30 00:00:...|
|  Max| 56|  M|  90000|  nontypical|236.4|2019-05-28 00:00:...|
|  Mat| 56|  F|  90000|asymptomatic|254.5|2019-04-28 00:00:...|
+-----+---+---+-------+------------+-----+--------------------+



In [61]:
# Import the dtype-audit helper from PySparkAudit and run it on the toy frame.
# The displayed result lists every column as `feature` alongside its `dtypes`.
from PySparkAudit import data_types
data_types(test)

Unnamed: 0,feature,dtypes
0,Name,string
1,Age,bigint
2,Sex,string
3,Sallary,bigint
4,ChestPain,string
5,Chol,double
6,CreatDate,timestamp


In [64]:
# Variant of the toy frame with deliberately imperfect values: None in Age
# (Joe), ChestPain (Max) and Sallary (Mat), plus a whitespace-only Sex for
# Max, which is a non-null string. CreatDate is left as a string here.
test = spark.createDataFrame(
    data=[
        ('Joe', None, 'F', 70000, 'asymptomatic', 286.1, '2019-6-28'),
        ('Henry', 67, 'M', 80000, 'asymptomatic', 229.2, '2019-6-29'),
        ('Sam', 37, 'F', 60000, 'nonanginal', 250.3, '2019-6-30'),
        ('Max', 56, '  ', 90000, None, 236.4, '2019-5-28'),
        ('Mat', 56, 'F', None, 'asymptomatic', 254.5, '2019-4-28'),
    ],
    schema=['Name', 'Age', 'Sex', 'Sallary', 'ChestPain', 'Chol', 'CreatDate'],
)


In [65]:
# Run PySparkAudit's `counts` on the toy frame. The displayed result reports,
# for each column, the total row count, the number of non-null entries and the
# number of distinct values. Judging by the output, nulls are not counted as a
# distinct value (Age -> 3) while the whitespace-only Sex is (Sex -> 3).
from PySparkAudit import counts
counts(test)

Unnamed: 0,feature,row_count,notnull_count,distinct_count
0,Name,5,5,5
1,Age,5,4,3
2,Sex,5,5,3
3,Sallary,5,4,4
4,ChestPain,5,4,2
5,Chol,5,5,5
6,CreatDate,5,5,5
