In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

In [None]:
def create_spark_session():
    '''Build (or reuse) the Spark session used by this notebook.

    Returns:
    The SparkSession obtained from getOrCreate(), configured with a
    "local" master, the application name "MODEC" and
    spark.executor.memory set to "2gb".
    '''
    builder = SparkSession.builder.master("local").appName("MODEC")
    builder = builder.config("spark.executor.memory", "2gb")
    return builder.getOrCreate()



def read_gold(spark, file_path):
    '''
    Read parquet files
    on the gold path

    Parameters:
    spark : Spark Session
    file_path (str): Path to input data

    Returns:
    The DataFrame produced by spark.read.parquet(file_path), or None
    (after printing 'read error') when the read fails.
    '''
    try:
        return spark.read.parquet(file_path)
    except IOError:
        print('read error')
    except AnalysisException:
        # Spark reports a missing or invalid path as AnalysisException rather
        # than IOError, so without this clause that failure escaped uncaught.
        print('read error')
    # Explicit best-effort result: callers must check for None before use.
    return None

In [None]:
# Session shared by the cells below.
spark = create_spark_session()

In [None]:
# Load the fact table from the gold layer; read_gold gives None if the read fails.
fact = read_gold(spark, 'datalake/gold/fact')

In [None]:
# Convert the Spark DataFrame to pandas for plotting; toPandas() loads the
# whole dataset into driver memory, so this assumes the fact table is small.
fact_pd = fact.toPandas()

In [None]:
# Summary statistics of the fact table (rendered as the cell's output).
fact_pd.describe()

In [None]:
# Distribution of the "vibration" column as a vertical box plot.
sns.boxplot(data=fact_pd, y="vibration")
plt.show()

In [None]:
# Distribution of the "temperature" column as a vertical box plot.
sns.boxplot(data=fact_pd, y="temperature")
plt.show()

In [None]:
# Number of rows per value of the "month" column.
# The column is passed as x= explicitly: recent seaborn releases treat the
# first positional argument as `data`, and older ones warn about it.
sns.countplot(x=fact_pd['month'])
plt.show()

In [None]:
# Number of rows per value of the "weekday" column.
# The column is passed as x= explicitly: recent seaborn releases treat the
# first positional argument as `data`, and older ones warn about it.
sns.countplot(x=fact_pd['weekday'])
plt.show()

In [None]:
# Annotated correlation heatmap over the columns at positions 6..12 of fact_pd.
# NOTE(review): the slice is positional; confirm these are the intended
# numeric columns if the fact table's column order ever changes.
f, ax = plt.subplots(figsize=(8, 8))
fact_corr = fact_pd.iloc[:, 6:13]
sns.heatmap(
    fact_corr.corr(),
    annot=True,
    linewidths=.4,
    fmt='.2f',
    ax=ax,
)
plt.show()