In [1]:
# Remove any previous installs to avoid collisions
!pip -q uninstall -y pyspark findspark >/dev/null

# Ensure Java 17 and set JAVA_HOME for this session
!apt-get -qq update && apt-get -qq install -y openjdk-17-jdk

import os, subprocess, sys
JAVA_HOME_CANDIDATES = [
    "/usr/lib/jvm/java-17-openjdk-amd64",
    "/usr/lib/jvm/java-17-openjdk-arm64",
    "/usr/lib/jvm/java-17-openjdk"
]
JAVA_HOME = next((p for p in JAVA_HOME_CANDIDATES if os.path.exists(p)), "")
os.environ["JAVA_HOME"] = JAVA_HOME
os.environ["PATH"] = f"{JAVA_HOME}/bin:" + os.environ["PATH"]

# Install PySpark compatible with Java 17 and Colab's Python
!pip -q install pyspark==3.5.1

# Sanity checks (useful if it still fails)
import pyspark, py4j
print("Python:", sys.version)
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))
print("java -version:")
print(subprocess.run(["bash","-lc","java -version"], capture_output=True, text=True).stderr)
print("pyspark:", pyspark.__version__, "py4j:", py4j.__version__)


[0mW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package java-common.
(Reading database ... 125082 files and directories currently installed.)
Preparing to unpack .../00-java-common_0.72build2_all.deb ...
Unpacking java-common (0.72build2) ...
Selecting previously unselected package libpcsclite1:amd64.
Preparing to unpack .../01-libpcsclite1_1.9.5-3ubuntu1_amd64.deb ...
Unpacking libpcsclite1:amd64 (1.9.5-3ubuntu1) ...
Selecting previously unselected package openjdk-17-jre-headless:amd64.
Preparing to unpack .../02-openjdk-17-jre-headless_17.0.16+8~us1-0ubuntu1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jre-headless:amd64 (17.0.16+8~us1-0ubuntu1~22.04.1) ...
Selecting previously unselected package ca-certificates-java.
Preparing to unpack .../03-ca-certificates-java_20190909ubuntu1.2_all.deb ...
Unpacking ca-cert

In [2]:
import os

from pyspark.sql import SparkSession

# Scratch directory handed to Spark as spark.local.dir; create it up front so
# the path exists before the session starts.
SPARK_LOCAL_DIR = "/content/spark-tmp"
os.makedirs(SPARK_LOCAL_DIR, exist_ok=True)

# Local-mode session for this notebook, using every available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("colab")
    .config("spark.local.dir", SPARK_LOCAL_DIR)
    .config("spark.driver.memory", "2g")      # avoid OOM-killed JVM
    .config("spark.ui.enabled", "false")      # UI not needed in Colab
    .getOrCreate()
)

# Smoke test: report the running version and materialise a tiny DataFrame
print("Spark version:", spark.version)
spark.range(5).show()


Spark version: 3.5.1
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [3]:
# Load the COVID CSV with the header option enabled, then preview five rows.
# NOTE(review): no schema or inferSchema is supplied, so the columns are
# presumably all read as strings -- confirm before doing numeric/date work.
df = spark.read.option("header", True).csv("covidmex.csv")
df.show(5)

+-----------+-----------+-------------+-------------+--------------+------+-----+------------------+-------------------+------------------+--------------------+---------+------------------+
|ID_REGISTRO|ENTIDAD_RES|MUNICIPIO_RES|FECHA_INGRESO|FECHA_SINTOMAS|covidt|delta|               lat|               long|               alt|                 qry|dayofyear|       lengthofday|
+-----------+-----------+-------------+-------------+--------------+------+-----+------------------+-------------------+------------------+--------------------+---------+------------------+
|     z526b3|          9|           12|   2020-12-21|    2020-12-18|     1|    3| 19.20155345588235| -99.20180252450989|3008.9460784313724|Cve_Ent==9 & Cve_...|      356|11.768371962176587|
|     z3d1e2|          9|            5|   2020-04-22|    2020-04-20|     1|    2|         19.482945|         -99.113471|            2229.0|Cve_Ent==9 & Cve_...|      113|13.441805501598786|
|     z21f6f|          7|            9|   2020-04-