Problem Statement:

Write an SQL query to report each user's latest login in the year 2020. Do not include users who did not log in during 2020.

In [0]:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from datetime import datetime

# Schema of the Logins table: one row per login event (user id + login timestamp).
logins_schema = StructType(
    [
        StructField(name="user_id", dataType=IntegerType(), nullable=True),
        StructField(name="time_stamp", dataType=TimestampType(), nullable=True),
    ]
)

# Sample login events as (user_id, time_stamp) tuples.
# Row order is kept as-is so the displayed table matches the recorded output.
logins_data = [
    # user 6: logins in 2020, 2021 and 2019
    (6, datetime(2020, 6, 30, 15, 6, 7)),
    (6, datetime(2021, 4, 21, 14, 6, 6)),
    (6, datetime(2019, 3, 7, 0, 18, 15)),
    # user 8: two logins in 2020
    (8, datetime(2020, 2, 1, 5, 10, 53)),
    (8, datetime(2020, 12, 30, 0, 46, 50)),
    # user 2: logins in 2020 and 2019
    (2, datetime(2020, 1, 16, 2, 49, 50)),
    (2, datetime(2019, 8, 25, 7, 59, 8)),
    # user 14: no login in 2020 (only 2019 and 2021)
    (14, datetime(2019, 7, 14, 9, 0, 0)),
    (14, datetime(2021, 1, 6, 11, 59, 59)),
]

# Build the Logins DataFrame from the sample rows using the explicit schema.
logins_df = spark.createDataFrame(data=logins_data, schema=logins_schema)

# Render the table under a plain-text heading.
print("Logins Table:")
logins_df.display()

Logins Table:


user_id,time_stamp
6,2020-06-30T15:06:07.000+0000
6,2021-04-21T14:06:06.000+0000
6,2019-03-07T00:18:15.000+0000
8,2020-02-01T05:10:53.000+0000
8,2020-12-30T00:46:50.000+0000
2,2020-01-16T02:49:50.000+0000
2,2019-08-25T07:59:08.000+0000
14,2019-07-14T09:00:00.000+0000
14,2021-01-06T11:59:59.000+0000


In [0]:
# Register the DataFrame as a temporary view named "logins" so it can be queried by name from SQL.
# createOrReplaceTempView overwrites any existing view of the same name, so re-running this cell is safe.
logins_df.createOrReplaceTempView("logins")

In [0]:
%sql
-- Latest 2020 login date per user.
-- The WHERE clause is applied before aggregation, so a user with no login in 2020
-- has no surviving rows and is therefore absent from the result.
select
  user_id,
  -- to_date() converts the timestamp to a DATE directly, instead of slicing its
  -- implicit string form with substr() and casting the text back to a date.
  to_date(max(time_stamp)) as latest_login
from
  logins
where
  year(time_stamp) = 2020
group by
  user_id

USER_id,latest_login
6,2020-06-30
8,2020-12-30
2,2020-01-16


In [0]:
from pyspark.sql import functions as F

# Keep only the login events whose timestamp falls in calendar year 2020.
# Users without any 2020 login drop out here and never reach the aggregation.
is_2020_login = F.year(F.col("time_stamp")) == 2020
logins_filtered = logins_df.where(is_2020_login)

# Per user, take the most recent remaining timestamp ...
latest_per_user = logins_filtered.groupBy("user_id").agg(
    F.max("time_stamp").alias("latest_time_stamp")
)
# ... and render it as a "yyyy-MM-dd" string (date_format yields a string column).
result_df = latest_per_user.withColumn(
    "latest_login", F.date_format("latest_time_stamp", "yyyy-MM-dd")
)

# Keep only the columns required by the problem statement and display them.
final_df = result_df.select("user_id", "latest_login")
final_df.display()

user_id,latest_login
6,2020-06-30
8,2020-12-30
2,2020-01-16


Explanation:

Filter Rows:

Use filter(F.year("time_stamp") == 2020) to keep only the rows where the year in the time_stamp column is 2020.
Group By:

Group the DataFrame by user_id using .groupBy("user_id").
Aggregate:

Calculate the maximum time_stamp for each user_id using F.max("time_stamp").alias("latest_time_stamp").
Extract Date:

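Use F.date_format(F.col("latest_time_stamp"), "yyyy-MM-dd") to render the date part of the TimestampType column as a yyyy-MM-dd string. Note that the resulting latest_login column is a StringType, not a DateType; use F.to_date(F.col("latest_time_stamp")) instead if a true date column is needed.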
Select Required Columns:

Select only the user_id and the latest_login columns to match the desired output.