In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col,isnan, when, count
from pyspark.sql.window import Window
from pyspark.sql.types import DateType
from datetime import datetime, timedelta
import sys


# Detailed Spark configuration
conf = SparkConf().setAppName("ClusterConfigExample")

# Start (or reuse) the SparkContext. getOrCreate keeps this cell re-runnable:
# calling SparkContext(conf=conf) while a context is already active in the
# kernel raises ValueError ("Cannot run multiple SparkContexts at once").
sc = SparkContext.getOrCreate(conf=conf)

# Create the SparkSession on top of that context's configuration
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .getOrCreate()

In [2]:
# Read the raw CSV, taking column names from the first row and letting Spark
# infer the column types; then show the schema and the number of rows.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('madrid_2020_2022.csv')
)
df.printSchema()
print(df.count())

root
 |-- _c0: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- relative_humidity_2m: double (nullable = true)
 |-- dew_point_2m: double (nullable = true)
 |-- apparent_temperature: double (nullable = true)
 |-- precipitation_probability: string (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- wind_speed_10m: double (nullable = true)
 |-- temperature_80m: string (nullable = true)
 |-- soil_temperature_0cm: string (nullable = true)

17568


### Exploratorio
- Eliminar las columnas que no se vayan a utilizar.
- Cambiar el tipo de dato.
- Transformar las columnas pertinentes.

In [3]:
# Clean-up of the raw frame:
# 1. Drop the columns that will not be used.
# 2. Numeric columns: replace ',' with '.' and cast the result to double.
# 3. Date column: strip any '+' characters, then parse it as a timestamp.
columnas = [
    "_c0",
    "precipitation_probability",
    "temperature_80m",
    "soil_temperature_0cm",
    "rain",
    "dew_point_2m",
    "apparent_temperature",
]
numeric_cols = ("temperature_2m", "relative_humidity_2m", "precipitation", "wind_speed_10m")

df = df.drop(*columnas)
remaining = df.columns

# Only touch the columns that are actually present after the drop
for columna in (name for name in numeric_cols if name in remaining):
    with_dot = F.regexp_replace(F.col(columna), ",", ".")
    df = df.withColumn(columna, with_dot.cast("double"))

if "date" in remaining:
    without_plus = F.regexp_replace(F.col("date"), '\\+', "")
    df = df.withColumn("date", F.to_timestamp(without_plus, "yyyy-MM-dd HH:mm:ss"))

- Columnas finales:
    - **Date**: fecha.
    - **Temperature_2m**: temperatura a 2 metros del suelo.
    - **Relative_humidity_2m**: humedad relativa a 2 metros del suelo.
    - **Precipitation**: precipitación.
    - **Wind_speed_10m**: velocidad del viento a 10 metros del suelo.

In [4]:
# Print the schema of the cleaned frame to check the remaining columns and types
df.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- relative_humidity_2m: double (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- wind_speed_10m: double (nullable = true)



- Genero una lista de marcas de tiempo horarias (una por cada hora de cada día) entre dos fechas para comprobar que no falten fechas en el dataframe original

In [5]:
start_date = datetime(2020, 1, 1)
end_date = datetime(2022, 1, 1)

# One entry per hour, from start_date through the last hour of end_date
# (the end day itself is included, hence the "+ 1").
total_hours = ((end_date - start_date).days + 1) * 24
date_range_list = [
    (start_date + timedelta(hours=offset)).strftime("%Y-%m-%d %H:%M:%S")
    for offset in range(total_hours)
]

# Build a single-column PySpark DataFrame from the formatted strings and
# convert that "date" column to timestamp type
date_range_df = (
    spark.createDataFrame(list(zip(date_range_list)), ["date"])
    .withColumn("date", F.to_timestamp(F.col("date"), "yyyy-MM-dd HH:mm:ss"))
)

# Timestamps that are in the expected range but absent from df
observed_dates = df.select("date")
missing_dates_df = date_range_df.subtract(observed_dates)

In [6]:
# Display the timestamps missing from df; an empty table means none are missing
missing_dates_df.show()

+----+
|date|
+----+
+----+



In [7]:
# Group by every column and keep the groups that occur more than once,
# i.e. rows that are repeated across all columns
row_counts = df.groupBy(df.columns).count()
duplicados = row_counts.filter(F.col("count") > 1)

# Report the outcome, showing the duplicated rows when there are any
no_duplicates = duplicados.count() == 0
if no_duplicates:
    print("No hay valores duplicados.")
else:
    print("Hay valores duplicados")
    duplicados.show()

No hay valores duplicados.


In [8]:
# Show summary statistics (count, mean, stddev, min, max) for the columns
df.describe().show()

+-------+------------------+--------------------+-------------------+------------------+
|summary|    temperature_2m|relative_humidity_2m|      precipitation|    wind_speed_10m|
+-------+------------------+--------------------+-------------------+------------------+
|  count|             17568|               17568|              17568|             17568|
|   mean|15.099813379726596|   60.58100461316609|0.06039389799635743|10.542901945056364|
| stddev| 8.856151760575472|  24.814841148908254| 0.3648158052589198|   5.9798685253662|
|    min|          -11.1475|           6.2889233|                0.0|               0.0|
|    max|           41.4025|               100.0|               22.3|         41.439106|
+-------+------------------+--------------------+-------------------+------------------+



In [9]:
# Count, per column, the rows whose value is NaN or null.
# The "date" column is left out of this check
# (presumably because isnan does not apply to timestamps — TODO confirm).
checked_columns = [name for name in df.columns if name != "date"]
null_counts = [
    F.count(F.when(F.isnan(name) | F.col(name).isNull(), name)).alias(name)
    for name in checked_columns
]
df.select(null_counts).show()

+--------------+--------------------+-------------+--------------+
|temperature_2m|relative_humidity_2m|precipitation|wind_speed_10m|
+--------------+--------------------+-------------+--------------+
|             0|                   0|            0|             0|
+--------------+--------------------+-------------+--------------+



In [10]:
# Persist the cleaned frame as CSV with a header row, replacing any previous
# output at the same path and writing timestamps as "yyyy-MM-dd HH:mm:ss"
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
    .csv("datos_Madrid_horarios_transformado.csv")
)

In [11]:
# Stop the Spark session now that the notebook's work is done
spark.stop()