# Apache Spark en Google Colab
Ejercicios de WordCount, DataFrame API y MLlib (clasificación)

In [None]:
# Spark / PySpark setup on Google Colab, step 1: mount Google Drive so that
# later cells can read the datasets stored under it.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.5.5/spark-3.5.5-bin-hadoop3.tgz
!tar xf spark-3.5.5-bin-hadoop3.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.5-bin-hadoop3"
import findspark
findspark.init()

## Ejemplo 1: WordCount con RDD

In [None]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

# Read every .txt file of the Gutenberg dataset as an RDD of lines.
# The path is relative; presumably it resolves against /content, where Drive
# was mounted as /content/gdrive -- confirm the working directory if it fails.
text = sc.textFile("gdrive/MyDrive/st1800-251/datasets/gutenberg-en/*.txt")
# Alternative without Drive: simulate the text file with an in-memory RDD
# text = sc.parallelize(["Hola Spark Hola Big Data", "Spark es rápido y poderoso"])

# Classic word count: tokenize, emit (word, 1) pairs, add up the ones per word.
# str.split() with no argument splits on any run of whitespace and never yields
# empty strings, so consecutive spaces or tabs do not create a bogus "" word
# (which split(" ") would).
counts = (
    text.flatMap(lambda line: line.split())
        .map(lambda word: (word, 1))
        .reduceByKey(lambda a, b: a + b)
)

# Show only the 20 most frequent words: collect() would ship every distinct
# word of the corpus to the driver and flood the cell output.
counts.takeOrdered(20, key=lambda pair: -pair[1])

## Ejemplo 2: Análisis con DataFrame API

In [None]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Small in-memory sales dataset: one (product, amount) tuple per sale.
data = [
    ("martillo", 12000),
    ("taladro", 45000),
    ("martillo", 15000),
]
columns = ["producto", "valor"]
df = spark.createDataFrame(data, schema=columns)

# Total sold per product; the dict form of agg() produces the same
# "sum(valor)" column as GroupedData.sum("valor").
totals_by_product = df.groupBy("producto").agg({"valor": "sum"})
totals_by_product.show()

## Ejemplo 3: Clasificación con MLlib

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Load the customers CSV: the first row holds the column names and the column
# types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("gdrive/MyDrive/st1800-251/datasets/clientes.csv")
)

# Pack the two predictor columns into the single vector column MLlib expects.
assembler = VectorAssembler(
    inputCols=["edad", "ingresos"],
    outputCol="features",
)
assembled = assembler.transform(df)

# Keep only the feature vector and the target column, renamed to "label".
data = assembled.select("features", df["comprador"].alias("label"))

# 80/20 train/test split with a fixed seed.
train, test = data.randomSplit([0.8, 0.2], seed=42)

# LogisticRegression is used with its default settings; it reads the
# "features" and "label" columns by default.
lr = LogisticRegression()
model = lr.fit(train)

# Score the held-out rows and show the prediction next to the true label.
predictions = model.transform(test)
predictions.select("features", "label", "prediction").show()