## Comprovació de l'absència de valors NA i d'outliers als DataFrames

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, expr, lit
import duckdb

In [None]:
# Create (or reuse) the Spark session used to read the data-lake parquet files.
#
# The DuckDB JDBC jar is registered here, on the first session the notebook
# creates: getOrCreate() hands back an already-running session when one exists,
# and jars listed in spark.jars are only put on the classpath when the JVM is
# launched. Without this, the later
# `SparkSession.builder.config("spark.jars", "duckdb.jar").getOrCreate()` call
# would get this session back without the driver when the notebook is run
# top-to-bottom on a fresh kernel.
sc = (
    SparkSession.builder
    .config("spark.jars", "duckdb.jar")
    .getOrCreate()
)

In [21]:
# Parquet files of the data lake that the quality checks iterate over.
paths = [
    'data_lake/compravenda_sup.parquet',
    'data_lake/rent_price.parquet',
    'data_lake/renda.parquet',
]

# For each parquet file, the columns that are scaled and screened for outliers.
col_numeriques = {
    'data_lake/compravenda_sup.parquet': ['Nombre'],
    'data_lake/renda.parquet': ['Import_Euros'],
    'data_lake/rent_price.parquet': ['Price'],
}

In [31]:
# For every parquet file in `paths`:
#   1. report how many rows contain at least one null,
#   2. report how many rows are duplicated,
#   3. min-max scale each column listed in `col_numeriques[path]` and report
#      how many rows fall inside the [Q1 - 1.5*IQR, Q3 + 1.5*IQR] range.
for path in paths:
    RDD = sc.read.parquet(path)  # despite the name, this is a Spark DataFrame

    # Every count() launches a full Spark job, so the total is computed once
    # and reused (adding columns later does not change the number of rows).
    total_rows = RDD.count()

    # Check whether any row contains a null value.
    # `na_count` is the number of rows that remain after dropping such rows.
    na_count = RDD.na.drop().count()
    if na_count == total_rows:
        print(f"No hi ha valors NA al DataFrame {path}")
    else:
        print(f"Hi ha {total_rows - na_count} valors NA al DataFrame {path}")

    # Check whether there are duplicated rows.
    print(f"Hi ha {total_rows - RDD.distinct().count()} valors duplicats al DataFrame {path} de {total_rows} valors totals")

    for columna in col_numeriques[path]:
        # Values whose text contains '-' are left out of the min/max (this
        # also leaves out negative numbers). Nulls are skipped as well, since
        # they cannot be converted to float.
        valid_values = (
            RDD.rdd
            .map(lambda row: row[columna])
            .filter(lambda value: value is not None and '-' not in str(value))
            .map(float)
        )
        # Single pass over the data; the (inf, -inf) seed makes an empty
        # selection detectable instead of raising.
        valor_min, valor_max = valid_values.aggregate(
            (float('inf'), float('-inf')),
            lambda acc, value: (min(acc[0], value), max(acc[1], value)),
            lambda left, right: (min(left[0], right[0]), max(left[1], right[1])),
        )

        if not valor_max > valor_min:
            # No usable values, or a constant column: scaling would divide by
            # zero, so there is nothing to scale and nothing to filter.
            print(f"La columna {columna} del DataFrame {path} no té rang numèric (buida o constant); s'omet l'escalat i la detecció d'outliers")
            outliersremoved = RDD
            continue

        # Min-max scaling of the column.
        RDD = RDD.withColumn(columna, (RDD[columna] - valor_min) / (valor_max - valor_min))

        # IQR method. NOTE: `IQR` holds the two quartiles [Q1, Q3];
        # `IQR_values` is the inter-quartile range itself.
        IQR = RDD.approxQuantile(columna, [0.25, 0.75], 0.001)
        IQR_values = IQR[1] - IQR[0]

        RDD = RDD.withColumn('IQR', lit(IQR_values))

        lower_bound = IQR[0] - 1.5 * IQR_values
        upper_bound = IQR[1] + 1.5 * IQR_values
        # NOTE(review): if the column is stored as text, the values containing
        # '-' presumably turn into nulls in the arithmetic above and are then
        # dropped by this filter, i.e. reported as outliers - confirm.
        # NOTE(review): with several columns per file, `outliersremoved` only
        # reflects the filter of the last column processed.
        outliersremoved = RDD.filter(
            (col(columna) >= lower_bound) & (col(columna) <= upper_bound)
        )
        print(f"El DataFrame {path} ha passat de tenir {total_rows} a {outliersremoved.count()} valors sense outliers")
    # TODO: persist `outliersremoved`. The earlier draft called
    # `outliersremoved.write.db(...)`, which does not appear to be a Spark
    # DataFrameWriter method - pick a real writer (e.g. parquet or jdbc).

No hi ha valors NA al DataFrame data_lake/compravenda_sup.parquet
Hi ha 0 valors duplicats al DataFrame data_lake/compravenda_sup.parquet de 876 valors totals
El DataFrame data_lake/compravenda_sup.parquet ha passat de tenir 876 a 518 valors sense outliers
No hi ha valors NA al DataFrame data_lake/rent_price.parquet
Hi ha 0 valors duplicats al DataFrame data_lake/rent_price.parquet de 4622 valors totals
El DataFrame data_lake/rent_price.parquet ha passat de tenir 4622 a 4617 valors sense outliers
No hi ha valors NA al DataFrame data_lake/renda.parquet
Hi ha 0 valors duplicats al DataFrame data_lake/renda.parquet de 1068 valors totals
El DataFrame data_lake/renda.parquet ha passat de tenir 1068 a 1001 valors sense outliers


In [3]:
# Session used for the DuckDB JDBC reads; "duckdb.jar" is passed via spark.jars.
# NOTE(review): getOrCreate() returns an already-running session when one
# exists; if a session was started earlier without this jar, the setting may
# not take effect - confirm on a fresh kernel.
spark = (
    SparkSession.builder
    .config("spark.jars", "duckdb.jar")
    .getOrCreate()
)

24/04/16 15:58:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [13]:
# rent_price: duplicate / null checks on the table stored in the DuckDB file.
def _read_rent_price(query):
    """Run `query` against formatted_zone/rent_price.db through the DuckDB
    JDBC driver and return the result as a Spark DataFrame."""
    return (
        spark.read
        .format("jdbc")
        .option("url", "jdbc:duckdb:formatted_zone/rent_price.db")
        .option("driver", "org.duckdb.DuckDBDriver")
        .option("query", query)
        .load()
    )


# "RDD" here is the name of the table inside the DuckDB file.
DF = _read_rent_price("SELECT * FROM RDD")

# Each count() re-runs the JDBC query, so the total is computed only once.
rent_price_rows = DF.count()

# Check for duplicated rows
print(f"Hi ha {rent_price_rows - DF.distinct().count()} valors duplicats al DataFrame 'formatted_zone/rent_price.db' de {rent_price_rows} valors totals")

# Check for rows containing nulls
print(f'Hi ha {rent_price_rows - DF.na.drop().count()} NAs')

# Scaling: fetch the min and max of Price, computed by DuckDB.
# `min_max` is not used any further within this cell.
min_max = _read_rent_price("SELECT min(Price), max(Price) FROM RDD")

Hi ha 0 valors duplicats al DataFrame 'formatted_zone/rent_price.db' de 4622 valors totals
Hi ha 0 NAs
