# In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
from pyspark.sql.functions import col, when, count, lit
# Session used by every read below; getOrCreate() hands back the active
# session when one already exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("IPLDataIngestion")
    .getOrCreate()
)
# Variant 1: the schema expressed as a DDL-formatted string
# ("<column> <TYPE>" pairs separated by commas).
string_schema = """
    match_id INT,
    season STRING,
    start_date DATE,
    venue STRING,
    innings INT,
    ball FLOAT,
    batting_team STRING,
    bowling_team STRING,
    striker STRING,
    non_striker STRING,
    bowler STRING,
    runs_off_bat INT,
    extras INT,
    wides INT,
    noballs INT,
    byes INT,
    legbyes INT,
    penalty INT,
    wicket_type STRING,
    player_dismissed STRING,
    other_wicket_type STRING,
    other_player_dismissed STRING
"""

# Load the CSV with the DDL schema applied; the first line of the file is
# treated as a header row rather than data.
ipl_df_string_schema = (
    spark.read
    .option("header", "true")
    .schema(string_schema)
    .csv("path_to_ipl_data.csv")
)
# Variant 2: the same columns and types as string_schema, built
# programmatically. Every field is declared nullable (third argument True).
struct_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("match_id", IntegerType()),
        ("season", StringType()),
        ("start_date", DateType()),
        ("venue", StringType()),
        ("innings", IntegerType()),
        ("ball", FloatType()),
        ("batting_team", StringType()),
        ("bowling_team", StringType()),
        ("striker", StringType()),
        ("non_striker", StringType()),
        ("bowler", StringType()),
        ("runs_off_bat", IntegerType()),
        ("extras", IntegerType()),
        ("wides", IntegerType()),
        ("noballs", IntegerType()),
        ("byes", IntegerType()),
        ("legbyes", IntegerType()),
        ("penalty", IntegerType()),
        ("wicket_type", StringType()),
        ("player_dismissed", StringType()),
        ("other_wicket_type", StringType()),
        ("other_player_dismissed", StringType()),
    ]
])

# Load the CSV with the StructType schema applied; the first line of the file
# is treated as a header row rather than data.
ipl_df_struct_schema = (
    spark.read
    .option("header", "true")
    .schema(struct_schema)
    .csv("path_to_ipl_data.csv")
)
# Tag every row with a boolean "is_valid" flag. A row is invalid when any of
# match_id, season or start_date is NULL, or when innings is not 1 or 2.
#
# The innings rule tests isNull() explicitly: under SQL three-valued logic
# `NULL IN (1, 2)` evaluates to NULL, not FALSE, so without the explicit test
# a row with a NULL innings would skip every branch and land in
# otherwise(True), i.e. be reported as valid.
#
# NOTE(review): innings values other than 1 and 2 are rejected; if the data
# can contain additional innings (e.g. tie-breakers), confirm that is intended.
quality_checked_df = ipl_df_struct_schema.withColumn(
    "is_valid",
    when(col("match_id").isNull(), lit(False))
    .when(col("season").isNull(), lit(False))
    .when(col("start_date").isNull(), lit(False))
    .when(col("innings").isNull() | ~col("innings").isin(1, 2), lit(False))
    .otherwise(lit(True)),
)

# "is_valid" is never NULL (every branch yields a literal and the conditions
# above cannot evaluate to NULL for the innings rule), so the two filters
# below partition the rows exactly.
valid_records = quality_checked_df.filter(col("is_valid"))
invalid_records = quality_checked_df.filter(~col("is_valid"))
