In [1]:
# Global configuration: sandbox identifier and the data directory derived from it
SANDBOX_NAME = 'fesc'  # Sandbox name
DATA_PATH = "/data/sandboxes/{}/data/".format(SANDBOX_NAME)

 

# Análisis de Datos Exploratorio



## Análisis Univariante

In [2]:
from pyspark.sql import functions as F

In [7]:
# Load the semicolon-separated retail CSV; the first row is the header and
# column types are inferred from the data.
online_df = (
    spark.read
    .option('sep', ';')
    .option('header', True)
    .option('inferSchema', True)
    .csv(DATA_PATH + 'online_retail.csv')
)

In [8]:
# Preview the first two rows of the raw data
online_df.show(n=2)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/2010 8:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/2010 8:26|     3,39|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 2 rows



In [9]:
# Respuesta
# Parse the 'InvoiceDate' strings (day/month/year hour:minute) into Unix epoch seconds.
# The hour uses a single 'H' because the data carries non-padded hours
# (e.g. '01/12/2010 8:26'). 'HH' only parses those under the lenient legacy
# parser; 'H' accepts both one- and two-digit hours.
online_df_2 = online_df.withColumn('timestamp', F.unix_timestamp(F.col('InvoiceDate'), 'dd/MM/yyyy H:mm'))
online_df_2.show(2)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country| timestamp|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/2010 8:26|     2,55|     17850|United Kingdom|1291191960|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/2010 8:26|     3,39|     17850|United Kingdom|1291191960|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+
only showing top 2 rows



In [10]:
# Respuesta
# Render the epoch-seconds column as a 'yyyy-MM-dd HH:mm:ss' string column
online_df_3 = online_df_2.withColumn('datetime', F.from_unixtime('timestamp'))
online_df_3.show(n=2)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+-------------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country| timestamp|           datetime|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+-------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/2010 8:26|     2,55|     17850|United Kingdom|1291191960|2010-12-01 08:26:00|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/2010 8:26|     3,39|     17850|United Kingdom|1291191960|2010-12-01 08:26:00|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+----------+-------------------+
only showing top 2 rows



In [None]:
# Respuesta
# Display the (column name, Spark type name) pairs of the raw DataFrame
online_df.dtypes



Primero identifica variables cualitativas y cuantitativas.

In [None]:
# Respuesta
# Split the columns by their Spark type name: numeric ones are treated as
# quantitative, boolean/string ones as qualitative. Columns of any other type
# end up in neither list.
# NOTE(review): the preview shows UnitPrice with a decimal comma ('2,55'), so it
# was presumably inferred as string and lands in qualitative_vars - confirm with
# online_df.dtypes.
quantitative_vars = [name for name, dtype in online_df.dtypes if dtype in ('int', 'double')]
qualitative_vars = [name for name, dtype in online_df.dtypes if dtype in ('boolean', 'string')]

In [None]:
# Respuesta
# Display the names of the columns classified as quantitative
quantitative_vars

In [None]:
# Respuesta
# Display the names of the columns classified as qualitative
qualitative_vars



### Variables cuantitativas 

Calcula métricas para una única columna

In [None]:
# Respuesta
# One aliased aggregate expression per quantitative column, for each statistic.
# The alias is '<stat>_<column>' so the values can be looked up by name later.
avgs = [F.avg(name).alias('avg_' + name) for name in quantitative_vars]
stds = [F.stddev(name).alias('std_' + name) for name in quantitative_vars]
mins = [F.min(name).alias('min_' + name) for name in quantitative_vars]
maxs = [F.max(name).alias('max_' + name) for name in quantitative_vars]

In [None]:
# Respuesta
# Flatten all aggregate expressions into a single list (averages, standard
# deviations, maxima, minima - in that order) and display it
operations = [*avgs, *stds, *maxs, *mins]
operations

In [None]:
# Respuesta
# Evaluate every aggregate in one select and keep the single resulting Row
results = online_df.select(*operations).first()

# Print the summary statistics of each quantitative column
for col in quantitative_vars:
    avg, std, mini, maxi = (results[stat + col] for stat in ('avg_', 'std_', 'min_', 'max_'))
    print('{}: avg={}, std={}, min={}, max={}'.format(col, round(avg, 2), round(std, 2), mini, maxi))



### Variables cualitativas

Para variables cualitativas se calculan tablas de frecuencia.



Calcula la tabla de frecuencia de las columnas cualitativas, y ordénalas de mayor a menor.

In [None]:
# Respuesta
# Frequency table of 'Country', most frequent value first
online_df.groupBy('Country').count().orderBy(F.desc('count')).show()

In [None]:
# Respuesta
# Frequency table of each ('Country', 'InvoiceDate') pair, most frequent first
(
    online_df
    .groupBy('Country', 'InvoiceDate')
    .count()
    .orderBy(F.col('count').desc())
    .show()
)



## Análisis Multivariante



__Matriz de correlación__

In [None]:
# Respuesta
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics
import pandas as pd

In [None]:
# Respuesta
# Preview: turn each row of the quantitative columns into a dense MLlib vector
online_df.select(*quantitative_vars).rdd.map(lambda row: Vectors.dense(row))

In [None]:
# Respuesta
# Pearson correlation matrix across the quantitative columns, computed from an
# RDD holding one dense vector per row.
# NOTE(review): null values in these columns would presumably turn into NaN
# correlations - confirm whether rows with nulls should be dropped first.
corr_matrix = Statistics.corr(
    online_df.select(*quantitative_vars).rdd.map(lambda row: Vectors.dense(row)),
    method='pearson',
)
corr_matrix



_Transforma la matriz en un DataFrame de pandas_

In [None]:
# Respuesta
# Wrap the correlation matrix in a pandas DataFrame labelled on both axes with
# the quantitative column names
df_corr_matrix = pd.DataFrame(
    data=corr_matrix,
    index=quantitative_vars,
    columns=quantitative_vars,
)
df_corr_matrix

In [None]:
# Respuesta
import numpy as np

# Boolean mask that is True on the upper triangle (diagonal included) of the
# correlation matrix and False elsewhere.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True
mask

In [None]:
# Respuesta
# Keep only the cells where the mask is False (the strictly lower triangle);
# every masked cell becomes NaN
df_corr_matrix_reduced = df_corr_matrix.where(~mask)
df_corr_matrix_reduced

In [None]:
# Respuesta
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Respuesta
%matplotlib inline

In [None]:
# Respuesta
# Annotated heatmap of the full correlation matrix on a fixed [-1, 1] colour scale
plt.figure(figsize=(8, 7))
sns.heatmap(
    data=df_corr_matrix,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.show()



# Valores Atípicos



### Detección de outliers para variables que siguen la distribución normal

In [None]:
# Respuesta
def remove_tukey_outliers(df, col, k=1.5, relative_error=0.01):
    """
    Return a new DataFrame with the outliers of column `col` removed using the Tukey test.

    A row is kept when its value lies inside the fences
    [Q1 - k * IQR, Q3 + k * IQR], where Q1 and Q3 are the approximate first and
    third quartiles of `col` and IQR = Q3 - Q1.

    Note: rows whose `col` is null do not satisfy the range predicate, so they
    are dropped as well.

    :param df: Spark DataFrame to filter (it is not modified).
    :param col: name of the numeric column to test.
    :param k: fence multiplier applied to the IQR (1.5 is Tukey's usual value).
    :param relative_error: relative error passed to `approxQuantile`.
    :return: the filtered DataFrame.
    :raises ValueError: if the quartiles of `col` cannot be computed.
    """
    quartiles = df.approxQuantile(col, [0.25, 0.75], relative_error)
    if len(quartiles) != 2:
        # Fail with an explicit message instead of an opaque unpacking error
        raise ValueError(
            'Could not compute the quartiles of column "{}" '
            '(is it empty or all null?)'.format(col)
        )

    q1, q3 = quartiles
    iqr = q3 - q1

    min_thresh = q1 - k * iqr
    max_thresh = q3 + k * iqr

    # `between` is inclusive on both ends
    return df.filter(df[col].between(min_thresh, max_thresh))

In [None]:
# Respuesta
# Filter out the rows whose 'Quantity' is flagged as an outlier by the Tukey test
online_df_no_outliers = remove_tukey_outliers(df=online_df, col='Quantity')

In [None]:
# Respuesta
# Total number of rows in the raw DataFrame
n_rows = online_df.count()

In [None]:
# Respuesta
# Number of rows left after the outlier filter
n_rows_no = online_df_no_outliers.count()
# Share of rows removed by the filter, as a percentage of the original row count
perc_outliers = 100 * (n_rows - n_rows_no) / n_rows

In [None]:
# Respuesta
# Report the outlier percentage of 'Quantity' with two decimals
print('{} has {:.2f}% outliers'.format('Quantity', perc_outliers))



# Valores nulos

In [None]:
# Respuesta
def remove_nulls(df):
    """
    Return a new DataFrame without the rows that hold a null in any column.

    Columns are processed in order. For each one a message is printed saying
    whether it has null values, and if it does, the offending rows are filtered
    out before moving on. The check runs on the rows that are still left, so a
    column whose only nulls sat in rows already removed is reported as having
    no nulls.

    :param df: Spark DataFrame to clean (it is not modified).
    :return: the filtered DataFrame.
    """
    df_no_nulls = df

    for element in df.columns:
        # Count once per column and branch on the result. Re-checking after the
        # filter always found zero nulls and printed a contradictory second message.
        n_nulls = df_no_nulls.where(df_no_nulls[element].isNull()).count()
        if n_nulls != 0:
            print('\tThe column "{}" has null values'.format(element))
            df_no_nulls = df_no_nulls.where(df_no_nulls[element].isNotNull())
        else:
            print('The column "{}" does not have null values'.format(element))

    return df_no_nulls

In [None]:
# Respuesta
def check_nulls(df):
    """
    Tell whether the DataFrame holds a null value in any column.

    Columns are checked in order, printing one message per checked column. The
    scan stops at the first column that has nulls, so later columns are neither
    checked nor reported.

    :param df: Spark DataFrame to inspect.
    :return: True if a column with null values was found, False otherwise.
    """
    for element in df.columns:
        # A single count per column is enough: the column either has nulls or not
        if df.where(df[element].isNull()).count() != 0:
            print('\tThe column "{}" has null values'.format(element))
            return True
        print('The column "{}" does not have null values'.format(element))

    return False

In [None]:
# Respuesta
# Row count before removing nulls
print(online_df.count())
# Drop the rows holding nulls, column by column (prints a per-column message)
online_df_no_nulls = remove_nulls(online_df)
# Row count after removing nulls
print(online_df_no_nulls.count())