# Ex-2040 - Columns and Expressions


In [1]:
# Import SparkSession and required types
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Obtain the notebook's Spark session (an existing one is reused if present)
spark = (
    SparkSession.builder
    .appName("LargestCities")
    .getOrCreate()
)

# Explicit schema: every column is nullable; names and types are listed
# in the same order as the values inside each data tuple.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("City", StringType()),
        ("Country", StringType()),
        ("Area_km2", IntegerType()),
        ("Population", IntegerType()),
        ("Founded_Year", IntegerType()),
    )
])

# Rows are (City, Country, Area_km2, Population, Founded_Year).
# Founded_Year conventions:
#   * BCE founding dates are stored as negative years
#   * century-based dates are approximated by the middle of the century
data = [
    # BCE
    ("Chongqing", "China", 82403, 18171200, -316),
    # BCE
    ("Beijing", "China", 16411, 22596500, -1045),
    ("Brisbane", "Australia", 15842, 2560000, 1824),
    ("Tokyo", "Japan", 8230, 37036200, 1457),
    # BCE
    ("Shanghai", "China", 6340, 30482100, -960),
    # approx. mid-13th century
    ("Warsaw", "Poland", 517, 1702139, 1250),
    # approx. mid-8th century
    ("Krakow", "Poland", 327, 755050, 750),
    # approx. mid-14th century
    ("Łódź", "Poland", 293, 768755, 1350),
    # approx. mid-10th century
    ("Wroclaw", "Poland", 293, 634893, 950),
    # approx. mid-9th century
    ("Poznań", "Poland", 262, 570352, 850),
]

# Build the DataFrame from the literal rows using the explicit schema
df = spark.createDataFrame(data, schema)

# Display the rows as a text table
df.show()

+---------+---------+--------+----------+------------+
|     City|  Country|Area_km2|Population|Founded_Year|
+---------+---------+--------+----------+------------+
|Chongqing|    China|   82403|  18171200|        -316|
|  Beijing|    China|   16411|  22596500|       -1045|
| Brisbane|Australia|   15842|   2560000|        1824|
|    Tokyo|    Japan|    8230|  37036200|        1457|
| Shanghai|    China|    6340|  30482100|        -960|
|   Warsaw|   Poland|     517|   1702139|        1250|
|   Krakow|   Poland|     327|    755050|         750|
|     Łódź|   Poland|     293|    768755|        1350|
|  Wroclaw|   Poland|     293|    634893|         950|
|   Poznań|   Poland|     262|    570352|         850|
+---------+---------+--------+----------+------------+



In [2]:
# Print each column's name, data type and nullability as a tree
df.printSchema()

root
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Area_km2: integer (nullable = true)
 |-- Population: integer (nullable = true)
 |-- Founded_Year: integer (nullable = true)



In [3]:
from pyspark.sql.functions import expr, upper, concat, substring, col
from datetime import datetime

# Read the calendar year once; it is embedded as an integer literal in the SQL below
current_year = datetime.now().year

# Add five derived columns one at a time with withColumn():
#   City_Age, City_Size_Label, Area_per_Person_m2 -> SQL snippets parsed by expr()
#   City_Uppercase, CountryCity_Code              -> column functions
df = (
    df
    # Years elapsed between the founding year and the current year
    .withColumn("City_Age", expr("{} - Founded_Year".format(current_year)))
    # Population bucket; the first matching WHEN branch wins
    .withColumn(
        "City_Size_Label",
        expr(
            "CASE "
            "WHEN Population > 1000000 THEN 'GIGA-CITY' "
            "WHEN Population > 1000 THEN 'MEGA-CITY' "
            "ELSE 'KILO-CITY' "
            "END"
        ),
    )
    .withColumn("City_Uppercase", upper(col("City")))
    # km2 -> m2 (x 1,000,000), then divided by the number of inhabitants
    .withColumn(
        "Area_per_Person_m2",
        expr("(CAST(Area_km2 AS DOUBLE) * 1000000.0) / Population"),
    )
    # First three letters of the country followed by first three of the city, upper-cased
    .withColumn(
        "CountryCity_Code",
        concat(
            upper(substring(col("Country"), 1, 3)),
            upper(substring(col("City"), 1, 3)),
        ),
    )
)

# Display the DataFrame including the derived columns
df.show()


+---------+---------+--------+----------+------------+--------+---------------+--------------+------------------+----------------+
|     City|  Country|Area_km2|Population|Founded_Year|City_Age|City_Size_Label|City_Uppercase|Area_per_Person_m2|CountryCity_Code|
+---------+---------+--------+----------+------------+--------+---------------+--------------+------------------+----------------+
|Chongqing|    China|   82403|  18171200|        -316|    2341|      GIGA-CITY|     CHONGQING| 4534.813330985296|          CHICHO|
|  Beijing|    China|   16411|  22596500|       -1045|    3070|      GIGA-CITY|       BEIJING| 726.2629168234018|          CHIBEI|
| Brisbane|Australia|   15842|   2560000|        1824|     201|      GIGA-CITY|      BRISBANE|        6188.28125|          AUSBRI|
|    Tokyo|    Japan|    8230|  37036200|        1457|     568|      GIGA-CITY|         TOKYO|222.21502205949855|          JAPTOK|
| Shanghai|    China|    6340|  30482100|        -960|    2985|      GIGA-CITY|    

In [4]:
# Same derived columns as before, this time produced in a single selectExpr()
# projection where every output column is a SQL expression string.
df = df.selectExpr(
    # Pass-through source columns
    "City", "Country", "Area_km2", "Population", "Founded_Year",
    # Years elapsed between the founding year and the current year
    "{} - Founded_Year AS City_Age".format(datetime.now().year),
    # Population bucket; the first matching WHEN branch wins
    (
        "CASE "
        "WHEN Population > 1000000 THEN 'GIGA-CITY' "
        "WHEN Population > 1000 THEN 'MEGA-CITY' "
        "ELSE 'KILO-CITY' "
        "END AS City_Size_Label"
    ),
    "UPPER(City) AS City_Uppercase",
    # km2 -> m2 (x 1,000,000), then divided by the number of inhabitants
    "CAST(Area_km2 AS DOUBLE) * 1000000.0 / Population AS Area_per_Person_m2",
    # First three letters of the country followed by first three of the city, upper-cased
    "CONCAT(UPPER(SUBSTRING(Country, 1, 3)), UPPER(SUBSTRING(City, 1, 3))) AS CountryCity_Code",
)

df.show()


+---------+---------+--------+----------+------------+--------+---------------+--------------+------------------+----------------+
|     City|  Country|Area_km2|Population|Founded_Year|City_Age|City_Size_Label|City_Uppercase|Area_per_Person_m2|CountryCity_Code|
+---------+---------+--------+----------+------------+--------+---------------+--------------+------------------+----------------+
|Chongqing|    China|   82403|  18171200|        -316|    2341|      GIGA-CITY|     CHONGQING| 4534.813330985296|          CHICHO|
|  Beijing|    China|   16411|  22596500|       -1045|    3070|      GIGA-CITY|       BEIJING| 726.2629168234018|          CHIBEI|
| Brisbane|Australia|   15842|   2560000|        1824|     201|      GIGA-CITY|      BRISBANE|        6188.28125|          AUSBRI|
|    Tokyo|    Japan|    8230|  37036200|        1457|     568|      GIGA-CITY|         TOKYO|222.21502205949855|          JAPTOK|
| Shanghai|    China|    6340|  30482100|        -960|    2985|      GIGA-CITY|    