Problem Statement

We are tasked with analyzing cricket player statistics from two datasets: one containing player performance data and another containing country codes and names. The goal is to extract and aggregate specific player statistics and present them in a formatted table.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Reuse the active SparkSession if there is one; otherwise start one for this notebook.
spark = SparkSession.builder.appName("CricketAnalysis").getOrCreate()

# --- Players ------------------------------------------------------------------
# Each row is ("<name>-<country code>", runs, "<50s>/<100s>").
data = [
    ("Sachin-IND", 18694, "93/49"),
    ("Ricky-AUS", 11274, "66/31"),
    ("Lara-WI", 10222, "45/21"),
    ("Rahul-IND", 10355, "95/11"),
    ("Jhonty-SA", 7051, "43/5"),
    ("Hayden-AUS", 8722, "67/19"),
]

# Explicit schema: all three columns are nullable; "50s/100s" stays a string
# here and is split into integers later in the notebook.
schema = (
    StructType()
    .add("player", StringType(), True)
    .add("runs", IntegerType(), True)
    .add("50s/100s", StringType(), True)
)

players_df = spark.createDataFrame(data, schema=schema)

# --- Countries ----------------------------------------------------------------
# Lookup table mapping the short country code ("SRT") to the country name.
data1 = [
    ("IND", "India"),
    ("AUS", "Australia"),
    ("WI", "WestIndies"),
    ("SA", "SouthAfrica"),
]

schema_countries = (
    StructType().add("SRT", StringType(), True).add("country", StringType(), True)
)

countries_df = spark.createDataFrame(data1, schema=schema_countries)

In [0]:
# Render both input DataFrames to confirm the rows loaded as expected.
# NOTE(review): `.display()` is presumably the Databricks notebook renderer rather
# than a standard PySpark DataFrame method — confirm the target runtime.
countries_df.display()
players_df.display()

SRT,country
IND,India
AUS,Australia
WI,WestIndies
SA,SouthAfrica


player,runs,50s/100s
Sachin-IND,18694,93/49
Ricky-AUS,11274,66/31
Lara-WI,10222,45/21
Rahul-IND,10355,95/11
Jhonty-SA,7051,43/5
Hayden-AUS,8722,67/19


In [0]:
# Register both DataFrames as temporary views so they can be queried with Spark SQL.
# The view names are singular ("country", "player"); the SQL query later in the
# notebook refers to them by exactly these names.
countries_df.createOrReplaceTempView("country")
players_df.createOrReplaceTempView("player")

In [0]:
# DataFrame-API solution: build the (player_name, country, runs, 50s_100s_sum) report.
# Requires `F` (pyspark.sql.functions) from the imports cell at the top of the notebook.

# Step 1: Split "player" ("<name>-<code>") into "player_name" and "country_code".
# The split expression is built once and reused for both derived columns.
# This rebinds players_df; the original "player" column is kept, so re-running
# the cell recomputes the same two columns.
player_parts = F.split(F.col("player"), "-")
players_df = players_df.withColumn("player_name", player_parts.getItem(0)).withColumn(
    "country_code", player_parts.getItem(1)
)

# Step 2: Join players_df with countries_df on "country_code" = "SRT".
# Inner join: players whose code has no match in countries_df are dropped.
joined_df = players_df.join(
    countries_df, players_df.country_code == countries_df.SRT, "inner"
).select("player_name", "country", "runs", "50s/100s")

# Step 3: Split "50s/100s" into integer "50s" and "100s" columns and add them up.
milestone_parts = F.split(F.col("50s/100s"), "/")
joined_df = (
    joined_df.withColumn("50s", milestone_parts.getItem(0).cast("integer"))
    .withColumn("100s", milestone_parts.getItem(1).cast("integer"))
    .withColumn("50s_100s_sum", F.col("50s") + F.col("100s"))
)

# Step 4: Filter the relevant players (Ricky, Sachin, Rahul) and select the required columns
final_df = joined_df.filter(
    F.col("player_name").isin("Ricky", "Sachin", "Rahul")
).select("player_name", "country", "runs", "50s_100s_sum")

# Show the final output
final_df.display()

player_name,country,runs,50s_100s_sum
Ricky,Australia,11274,97
Sachin,India,18694,142
Rahul,India,10355,106


In [0]:
# Spark SQL solution: same report as the DataFrame-API cell, expressed as one query
# against the "player" and "country" temporary views.
#
# - The inner subquery `p` splits `player` on '-' into player_name / country_code.
# - The outer SELECT splits `50s/100s` on '/', casts both parts to INT and sums them.
# - The join matches p.country_code to c.SRT; the WHERE clause keeps three players.
# Backticks are needed around `50s/100s` because the column name contains '/'.
query = """
    SELECT 
        p.player_name, 
        c.country, 
        p.runs, 
        (CAST(SPLIT(p.`50s/100s`, '/')[0] AS INT) + CAST(SPLIT(p.`50s/100s`, '/')[1] AS INT)) AS 50s_100s_sum
    FROM (
        SELECT 
            SPLIT(player, '-')[0] AS player_name, 
            SPLIT(player, '-')[1] AS country_code, 
            runs, 
            `50s/100s`
        FROM player
    ) p
    JOIN country c
    ON p.country_code = c.SRT
    WHERE p.player_name IN ('Ricky', 'Sachin', 'Rahul')
"""

# Run the query through the active SparkSession; the result is a DataFrame.
result_df = spark.sql(query)

# Render the result
result_df.display()

player_name,country,runs,50s_100s_sum
Ricky,Australia,11274,97
Sachin,India,18694,142
Rahul,India,10355,106


Explanation:

Register Views:

players_df and countries_df are registered as SQL temporary views named "player" and "country" respectively.

SQL Query:

Split player column: Extract player_name and country_code from the player column.
Join: Join the player view with the country view on country_code = SRT.
Calculate 50s_100s_sum: Split the 50s/100s column into 50s and 100s, cast them to integers, and sum them up.
Filter: Include only the players 'Ricky', 'Sachin', and 'Rahul'.
Show Results: Displays the result DataFrame with the desired output format.