In [0]:
pip install pycountry


In [0]:
pip install pycountry_convert

In [0]:
# Builds a country dimension table. Includes: country name, ISO alpha-2 and alpha-3 codes, continent, timezone, and main language(s)

from pyspark.sql import SparkSession
import pandas as pd
import pycountry
import pytz
from pycountry_convert import (
    country_alpha2_to_continent_code,
    convert_continent_code_to_continent_name
)

# Obtain the Spark session used by the rest of this notebook
spark = (
    SparkSession.builder
    .appName("CountryDimension")
    .getOrCreate()
)

# --- Step 1: Country → Language mapping ---
# Source: UN official languages + Wikipedia (simplified main languages)
# Keys are uppercase two-letter country codes, matched against pycountry's
# alpha_2 values when the table is built; countries missing from this map are
# labelled "Unknown" there.
# Each code must appear exactly once: a repeated key in a dict literal silently
# overrides the earlier entry, so duplicates hide conflicting edits.
country_language_map = {
    "US": "English", "IN": "Hindi, English", "PH": "Filipino, English", "MX": "Spanish",
    "BR": "Portuguese", "TR": "Turkish", "MY": "Malay", "CO": "Spanish", "SA": "Arabic",
    "TH": "Thai", "CN": "Mandarin Chinese", "JP": "Japanese", "KR": "Korean", "FR": "French",
    "DE": "German", "ES": "Spanish", "IT": "Italian", "CA": "English, French", "GB": "English",
    "AU": "English", "NZ": "English, Māori", "NG": "English", "ZA": "Zulu, English, Afrikaans",
    "EG": "Arabic", "PK": "Urdu, English", "ID": "Indonesian", "RU": "Russian", "VN": "Vietnamese",
    "IR": "Persian (Farsi)", "AR": "Spanish", "CL": "Spanish", "PE": "Spanish", "NL": "Dutch",
    "SE": "Swedish", "NO": "Norwegian", "DK": "Danish", "FI": "Finnish", "PL": "Polish",
    "UA": "Ukrainian", "BD": "Bengali", "NP": "Nepali", "LK": "Sinhala, Tamil", "SG": "English, Malay, Mandarin, Tamil",
    "AE": "Arabic", "QA": "Arabic", "KW": "Arabic", "OM": "Arabic", "BH": "Arabic",
    "KE": "Swahili, English", "TZ": "Swahili, English", "UG": "English, Swahili", "GH": "English",
    "ET": "Amharic", "SD": "Arabic, English", "DZ": "Arabic", "MA": "Arabic", "TN": "Arabic",
    "IL": "Hebrew, Arabic", "HK": "Chinese, English", "TW": "Mandarin Chinese",
    "LA": "Lao", "KH": "Khmer", "MM": "Burmese", "MN": "Mongolian", "AF": "Dari, Pashto"
}

# --- Step 2: Build complete list of all countries ---
# One record per pycountry entry. Lookups that have no data for a country fall
# back to a default value instead of aborting the whole build.
countries_data = []
for country in pycountry.countries:
    alpha2 = country.alpha_2
    alpha3 = getattr(country, "alpha_3", None)
    name = country.name

    # Continent: pycountry_convert raises KeyError for codes it has no mapping
    # for; only that case is treated as "no continent". Anything else
    # (including Ctrl-C) propagates instead of being silently swallowed.
    try:
        continent_code = country_alpha2_to_continent_code(alpha2)
        continent = convert_continent_code_to_continent_name(continent_code)
    except KeyError:
        continent = None

    # Timezone: use the first zone pytz lists for the country; default to
    # "UTC" when pytz has no entry (KeyError) or an empty list (IndexError).
    try:
        timezone = pytz.country_timezones[alpha2][0]
    except (KeyError, IndexError):
        timezone = "UTC"

    # National Language
    language = country_language_map.get(alpha2, "Unknown")

    countries_data.append({
        "country_name": name,
        "country_code": alpha2.lower(),  # stored lowercase in the dimension table
        "country_alpha3": alpha3,
        "continent": continent,
        "timezone": timezone,
        "language": language
    })

# --- Step 3: Convert to Spark DataFrame ---
# Materialise the records as a pandas frame first, then hand it to Spark.
countries_pdf = pd.DataFrame(countries_data)
df_countries = spark.createDataFrame(countries_pdf)

# --- Step 4: Preview ---
# Show the first 20 rows ordered by country name, without truncating columns.
(
    df_countries
    .orderBy("country_name")
    .show(20, truncate=False)
)

# --- Step 5: Save as Delta Table (persistent dimensional table) ---
# Written in "overwrite" mode to the lingokids.silver.dim_country table.
(
    df_countries.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("lingokids.silver.dim_country")
)
