In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every later cell; getOrCreate() hands back
# an already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Temp_dataset")
    .getOrCreate()
)

In [2]:
# Read the input file into an RDD of raw text lines (one element per line).
data_path = "temp.csv"  # Replace with actual path
sc = spark.sparkContext
rdd = sc.textFile(data_path)

In [3]:
# The first line of the file is taken to be the CSV header.
header = rdd.first()


def _is_data_row(row):
    """Return True for any line that is not identical to the header line."""
    return row != header


# Keep only the data lines (every line equal to the header is dropped).
data_rdd = rdd.filter(_is_data_row)

In [4]:
# Split each row by comma and convert to (StationID, ItemID, desc, temp) format.
# NOTE(review): the recorded outputs show ItemID values such as 18000102, which
# look like YYYYMMDD dates - confirm what this column actually holds.
def parse_record(line):
    """Turn one raw CSV line into a (StationID, ItemID, desc, temp) tuple.

    The line is split on every comma (no quoted-field handling). The first
    three fields are kept as strings and the fourth is converted with float();
    any further fields are ignored.

    Raises:
        IndexError: if the line has fewer than four comma-separated fields.
        ValueError: if the fourth field cannot be converted to float.
    """
    cols = line.split(",")
    return (cols[0], cols[1], cols[2], float(cols[3]))  # assuming temp is in column 4


# Build the parsed RDD from the raw lines (`rdd`) and `header` rather than from
# `data_rdd` itself: reading and overwriting the same name made this cell
# non-idempotent, because a second run would try to split already-parsed tuples.
data_rdd = rdd.filter(lambda row: row != header) \
              .map(parse_record)

In [5]:
# 1. Filter for "TMIN" and find minimum temperatures.
# tmin_rdd is the input of several separate Spark jobs (the min() action below
# and every later action on the two per-key RDDs). cache() keeps the filtered
# rows after the first job so the file is not re-read and re-parsed each time.
tmin_rdd = data_rdd.filter(lambda rec: rec[2] == "TMIN").cache()

# a. Overall minimum temperature (min() is an action, so this runs a job now)
overall_min_tmin = tmin_rdd.map(lambda rec: rec[3]).min()

# b. Minimum temperature for each ItemID
#    (lazy: nothing is computed until an action such as collect() is called)
min_temp_by_item = (
    tmin_rdd
    .map(lambda rec: (rec[1], rec[3]))
    .reduceByKey(min)
)

# c. Minimum temperature for each StationID (lazy, as above)
min_temp_by_station = (
    tmin_rdd
    .map(lambda rec: (rec[0], rec[3]))
    .reduceByKey(min)
)

In [6]:
# Report the TMIN aggregates: the overall minimum first, then one line per key.
print(f"Overall minimum temperature (TMIN): {overall_min_tmin}")

# collect() brings every (ItemID, minimum) pair back to the driver.
print("Minimum temperature for each ItemID (TMIN):")
item_minima = min_temp_by_item.collect()
for item, min_temp in item_minima:
    print(f"ItemID: {item}, Min Temp: {min_temp}")

# Same report, keyed by StationID.
print("Minimum temperature for each StationID (TMIN):")
station_minima = min_temp_by_station.collect()
for station, min_temp in station_minima:
    print(f"StationID: {station}, Min Temp: {min_temp}")

Overall minimum temperature (TMIN): -148.0
Minimum temperature for each ItemID (TMIN):
ItemID: 18000102, Min Temp: -130.0
ItemID: 18000104, Min Temp: -74.0
ItemID: 18000106, Min Temp: -57.0
ItemID: 18000110, Min Temp: -75.0
ItemID: 18000111, Min Temp: -62.0
ItemID: 18000112, Min Temp: -60.0
ItemID: 18000114, Min Temp: -35.0
ItemID: 18000115, Min Temp: -23.0
ItemID: 18000116, Min Temp: -37.0
ItemID: 18000117, Min Temp: -35.0
ItemID: 18000118, Min Temp: 9.0
ItemID: 18000122, Min Temp: -16.0
ItemID: 18000124, Min Temp: -3.0
ItemID: 18000126, Min Temp: 16.0
ItemID: 18000127, Min Temp: 15.0
ItemID: 18000128, Min Temp: 33.0
ItemID: 18000130, Min Temp: 3.0
ItemID: 18000202, Min Temp: 19.0
ItemID: 18000205, Min Temp: 22.0
ItemID: 18000207, Min Temp: -22.0
ItemID: 18000208, Min Temp: -39.0
ItemID: 18000210, Min Temp: -69.0
ItemID: 18000211, Min Temp: -102.0
ItemID: 18000212, Min Temp: -78.0
ItemID: 18000213, Min Temp: -42.0
ItemID: 18000217, Min Temp: -13.0
ItemID: 18000218, Min Temp: -52.0
Ite

In [7]:
# 2. Filter for "TMAX" and find maximum temperatures.
# tmax_rdd is the input of several separate Spark jobs (the max() action below
# and every later action on the two per-key RDDs). cache() keeps the filtered
# rows after the first job so the file is not re-read and re-parsed each time.
tmax_rdd = data_rdd.filter(lambda rec: rec[2] == "TMAX").cache()

# a. Overall maximum temperature (max() is an action, so this runs a job now)
overall_max_tmax = tmax_rdd.map(lambda rec: rec[3]).max()

# b. Maximum temperature for each ItemID
#    (lazy: nothing is computed until an action such as collect() is called)
max_temp_by_item = (
    tmax_rdd
    .map(lambda rec: (rec[1], rec[3]))
    .reduceByKey(max)
)

# c. Maximum temperature for each StationID (lazy, as above)
max_temp_by_station = (
    tmax_rdd
    .map(lambda rec: (rec[0], rec[3]))
    .reduceByKey(max)
)


In [8]:
# Report the TMAX aggregates: the overall maximum first, then one line per key.
print(f"Overall maximum temperature (TMAX): {overall_max_tmax}")

# collect() brings every (ItemID, maximum) pair back to the driver.
print("Maximum temperature for each ItemID (TMAX):")
item_maxima = max_temp_by_item.collect()
for item, max_temp in item_maxima:
    print(f"ItemID: {item}, Max Temp: {max_temp}")

# Same report, keyed by StationID.
print("Maximum temperature for each StationID (TMAX):")
station_maxima = max_temp_by_station.collect()
for station, max_temp in station_maxima:
    print(f"StationID: {station}, Max Temp: {max_temp}")

Overall maximum temperature (TMAX): 323.0
Maximum temperature for each ItemID (TMAX):
ItemID: 18000102, Max Temp: -44.0
ItemID: 18000104, Max Temp: 0.0
ItemID: 18000106, Max Temp: 13.0
ItemID: 18000110, Max Temp: 46.0
ItemID: 18000111, Max Temp: 66.0
ItemID: 18000112, Max Temp: 41.0
ItemID: 18000114, Max Temp: 41.0
ItemID: 18000115, Max Temp: 54.0
ItemID: 18000116, Max Temp: 56.0
ItemID: 18000117, Max Temp: 84.0
ItemID: 18000118, Max Temp: 59.0
ItemID: 18000122, Max Temp: 81.0
ItemID: 18000124, Max Temp: 85.0
ItemID: 18000126, Max Temp: 75.0
ItemID: 18000127, Max Temp: 73.0
ItemID: 18000128, Max Temp: 79.0
ItemID: 18000130, Max Temp: 66.0
ItemID: 18000202, Max Temp: 67.0
ItemID: 18000205, Max Temp: 79.0
ItemID: 18000207, Max Temp: 73.0
ItemID: 18000208, Max Temp: 50.0
ItemID: 18000210, Max Temp: 38.0
ItemID: 18000211, Max Temp: 21.0
ItemID: 18000212, Max Temp: 13.0
ItemID: 18000213, Max Temp: 13.0
ItemID: 18000217, Max Temp: 71.0
ItemID: 18000218, Max Temp: 48.0
ItemID: 18000220, Max T