In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col
import pandas as pd
import numpy as np
import math

In [2]:
# Memory setting applied to both the executor and the driver below.
MAX_MEMORY = '3G'

# Spark configuration for a local master ('local[2]') with explicit
# heartbeat / network / ack-wait timeouts.
# NOTE(review): the heartbeat and network timeouts are given without a unit
# suffix -- confirm the units Spark assumes for each are the ones intended.
conf = (
    pyspark.SparkConf()
    .setMaster('local[2]')
    .set('spark.executor.heartbeatInterval', 10000)
    .set('spark.network.timeout', 10000)
    .set("spark.core.connection.ack.wait.timeout", "3600")
    .set("spark.executor.memory", MAX_MEMORY)
    .set("spark.driver.memory", MAX_MEMORY)
)
                                     
# First, define a helper that creates the SparkSession (the entry point for DataFrame work).
def init_spark():
    """Return a SparkSession named "Pyspark EDA", creating it if needed.

    The session is built from the module-level ``conf`` object; because
    ``getOrCreate()`` is used, an already-existing session is returned
    instead of a new one.
    """
    builder = SparkSession.builder.appName("Pyspark EDA").config(conf=conf)
    return builder.getOrCreate()

# Session handle used by the cells below to build DataFrames.
spark = init_spark()

In [3]:
# Sample rows for the missing-value demo; None marks a missing field.
input_data = [
    ("Shivansh", "Data Scientist", "Noida"),
    (None, "Software Developer", None),
    ("Swati", "Data Analyst", "Hyderabad"),
    (None, None, "Noida"),
    ("Arpit", "Android Developer", "Banglore"),
    (None, None, None),
]

# Column names only -- no explicit types are supplied here.
schema = ["Name", "Job Profile", "City"]

# Build the Spark DataFrame from the rows and column names above.
df = spark.createDataFrame(data=input_data, schema=schema)

df.show()

+--------+------------------+---------+
|    Name|       Job Profile|     City|
+--------+------------------+---------+
|Shivansh|    Data Scientist|    Noida|
|    null|Software Developer|     null|
|   Swati|      Data Analyst|Hyderabad|
|    null|              null|    Noida|
|   Arpit| Android Developer| Banglore|
|    null|              null|     null|
+--------+------------------+---------+



In [4]:
def _print_banner(title_line):
    """Print a section banner: a rule line, the given title line, a rule line."""
    rule = '========================================================'
    print(rule)
    print(title_line)
    print(rule)


def _print_gap():
    """Print three empty lines to separate report sections."""
    for _ in range(3):
        print()


def _is_missing(field):
    """Return a boolean Column that is true where ``field`` holds a missing value.

    ``field`` is one ``StructField`` of the DataFrame's schema. A value counts
    as missing when it is null, when it is the empty string (string columns),
    or when it is NaN (float/double columns, and string columns as before).
    """
    from pyspark.sql.functions import col, isnan
    from pyspark.sql.types import DoubleType, FloatType, StringType

    # Backtick-quote the name so a dot in a column name is read literally
    # instead of being parsed as nested-field access.
    column = col('`' + field.name.replace('`', '``') + '`')

    missing = column.isNull()
    if isinstance(field.dataType, StringType):
        missing = missing | (column == "")
    # isnan() is only applied where the original call was usable: it is
    # rejected at analysis time for e.g. date/timestamp/boolean columns.
    # String columns keep the check so existing results do not change.
    if isinstance(field.dataType, (FloatType, DoubleType, StringType)):
        missing = missing | isnan(column)
    return missing


def row_col_miss_func(df):
    """Print a missing-value report for a Spark DataFrame.

    Sections printed, in order:
      1. a two-row sample (via pandas),
      2. the schema,
      3. every row with ``miss_cnt`` (number of missing fields in the row)
         and ``miss_percent`` (that count as a percentage of the column count),
      4. the number of missing values per column,
      5. ``df.describe()`` summary statistics (via pandas).

    A value is "missing" if it is null, an empty string, or NaN.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        The DataFrame to profile. It is not modified.

    Returns
    -------
    None
        The report is written to stdout only.

    Notes
    -----
    Sets the pandas option ``display.max_columns`` to ``None`` globally, so
    that the pandas-rendered sections are not truncated.
    """
    from functools import reduce
    from operator import add

    from pyspark.sql.functions import col, concat, count, format_number, lit, when

    # Use the fully qualified key: the bare 'max_columns' pattern can match
    # more than one pandas option, which makes set_option raise OptionError.
    pd.set_option('display.max_columns', None)

    _print_banner('======================SAMPLE OF DATAFRAME===============')
    sample = df.limit(2).toPandas()
    print(sample)
    _print_gap()

    _print_banner('======================SCHEMA OF DATAFRAME===============')
    df.printSchema()
    _print_gap()

    fields = df.schema.fields
    col_cnt = len(fields)

    # Row level: add up one 0/1 flag per column. Starting the fold from
    # lit(0) keeps the result a Column even when there are no columns.
    row_flags = [when(_is_missing(field), 1).otherwise(0) for field in fields]
    missing_val = df.withColumn('miss_cnt', reduce(add, row_flags, lit(0)))
    missing_percent_df = missing_val.withColumn(
        'miss_percent',
        concat(format_number((col('miss_cnt') / col_cnt) * 100, 2), lit('%')),
    )

    _print_banner('==============MISSING % OF DATAFRAME=====================')
    missing_percent_df.show()
    _print_gap()

    # Column level: count() skips nulls, and when() without otherwise()
    # yields null for non-matching rows, so this counts the missing ones.
    missing_percent_col = df.select(
        [count(when(_is_missing(field), 1)).alias(field.name) for field in fields]
    )

    _print_banner('==============MISSING IN COLUMN OF DATAFRAME============')
    # show() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    missing_percent_col.show()
    _print_gap()

    _print_banner('======================SUMMARY OF DATAFRAME============')
    summary = df.describe().toPandas()
    print(summary)
    _print_gap()

In [5]:
# Print the missing-value report for the sample DataFrame.
row_col_miss_func(df)

       Name         Job Profile   City
0  Shivansh      Data Scientist  Noida
1      None  Software Developer   None



root
 |-- Name: string (nullable = true)
 |-- Job Profile: string (nullable = true)
 |-- City: string (nullable = true)




+--------+------------------+---------+--------+------------+
|    Name|       Job Profile|     City|miss_cnt|miss_percent|
+--------+------------------+---------+--------+------------+
|Shivansh|    Data Scientist|    Noida|       0|       0.00%|
|    null|Software Developer|     null|       2|      66.67%|
|   Swati|      Data Analyst|Hyderabad|       0|       0.00%|
|    null|              null|    Noida|       2|      66.67%|
|   Arpit| Android Developer| Banglore|       0|       0.00%|
|    null|              null|     null|       3|     100.00%|
+--------+------------------+---------+--------+------------+




+----+-----------+----+
|Name|Job Profile|City|
+----+-----------+----+
|   3|          2|   2|
+----+-----------+----+

None



  s