<a href="https://colab.research.google.com/github/romicabrera/Data-science/blob/main/Sesi%C3%B3n%20_Spark1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
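# Phone-preference analysis for South America with PySpark.
# NOTE: the phone dataset is not available, so the code below currently runs
# on the California housing sample file and uses that file's column names.
# Requires: pip install pyspark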

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import trim


import sys
import os
# Point PYSPARK_PYTHON at the interpreter that is running this script.
os.environ["PYSPARK_PYTHON"] = sys.executable

# 1. Create (or reuse) the SparkSession used by the whole analysis.
spark = (
    SparkSession.builder
    .appName("AnalisisTelefonos")
    .getOrCreate()
)


# 2. Load the CSV file
# NOTE: the original 'telefonos_sudamerica.csv' was not found, so a sample
# file is used for demonstration. Replace file_path with the real data path.
file_path = '/content/sample_data/california_housing_train.csv'
df_raw = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

# --- Column type normalization ---
# 'median_house_value' is cast to float and 'housing_median_age' to integer;
# both columns are assumed to be present in the sample data.
df = (
    df_raw
    .withColumn('median_house_value', col('median_house_value').cast('float'))
    .withColumn('housing_median_age', col('housing_median_age').cast(IntegerType()))
)


# 3. Initial exploration
# Show the first 10 rows and the schema.
df.show(10)
df.printSchema()

# Count nulls for every column in ONE aggregation instead of launching a
# separate filter().count() job per column. when(...) yields 1 only for null
# cells (and NULL otherwise), and count() ignores NULLs, so each aggregate is
# the number of null values in that column.
null_exprs = [count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]
if null_exprs:  # nothing to aggregate if the DataFrame has no columns
    null_counts = df.select(null_exprs).first()
    # Pair by position so the report follows the DataFrame's column order.
    for c, nulls in zip(df.columns, null_counts):
        print(f"Columna {c}: {nulls} valores nulos")

# 4. Data filtering
# Sample-data example: keep only rows whose median_house_value is above 300000.
precio_alto = col('median_house_value') > 300000
filtered_df = df.where(precio_alto)
filtered_df.show()


# 5. Grouping and aggregation
# NOTE(review): the result names mention country/brand, but both summaries
# below are grouped by housing_median_age (sample-data example).
por_edad = df.groupBy('housing_median_age')

# Average median_house_value for each housing_median_age.
promedio_precio_pais = por_edad.agg(
    avg('median_house_value').alias('promedio_median_house_value')
)
promedio_precio_pais.show()

# Number of records for each housing_median_age.
ventas_por_marca = por_edad.agg(
    count('*').alias('cantidad_registros')
)
ventas_por_marca.show()

# 6. Analysis by age range (built-in column expressions, no UDF)
# Sample-data example: housing_median_age stands in for the age column.
edad = col('housing_median_age')

# Keep only rows with a non-null, strictly positive age.
df_edad = df.filter(edad.isNotNull() & (edad > 0))

# Bucket the age: below 20, below 40, and everything else.
expr_rango = (
    when(edad < 20, '0-19')
    .when(edad < 40, '20-39')
    .otherwise('40+')
)
df_edad = df_edad.withColumn('rango_edad', expr_rango)

# Average median_house_value per bucket, ordered by bucket label.
precio_por_rango = (
    df_edad
    .groupBy('rango_edad')
    .agg(avg('median_house_value').alias('precio_promedio'))
    .orderBy('rango_edad')
)
precio_por_rango.show()

# 7. Correlation analysis
# Sample-data example: correlation between 'total_rooms' and
# 'median_house_value', printed as a single number.
correlacion = df.corr('total_rooms', 'median_house_value')
print(f"Correlación entre total_rooms y median_house_value: {correlacion}")

# 8. Filtering on several columns
# Sample-data example: rows with total_rooms above 1000 AND population above 500.
muchas_habitaciones = col('total_rooms') > 1000
poblacion_alta = col('population') > 500
filtro = df.where(muchas_habitaciones & poblacion_alta)

# Report how many rows satisfy both conditions.
conteo = filtro.count()
print(f"Registros con total_rooms > 1000 and population > 500: {conteo}")


# 9. Save results
# Convert the filtered rows to a pandas DataFrame and write them to a CSV
# file without the index column.
filtro_pandas = filtro.toPandas()
filtro_pandas.to_csv(
    'resultados_analisis_pandas.csv',
    index=False,
)
print('Archivo guardado como resultados_analisis_pandas.csv usando Pandas.')

# The analysis is finished: stop the Spark session.
spark.stop()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -114.31|   34.19|                15|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|
|  -114.47|    34.4|                19|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|
|  -114.56|   33.69|                17|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|
|  -114.57|   33.64|                14|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|
|  -114.57|   33.57|                20|     1454.0|         326.0|     624.0|     262.0|        1.925|           65500.0|
|  -114.58|   33.63|    