# Sales Data Cleaning 

In [None]:
import sys
sys.path.append("..")
# Spark libs
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, lower, mean
# helpers
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

### Select the Input File

In [None]:
# Pass the relative location of the sales CSV through the project's
# translate_to_file_string helper to obtain the path handed to Spark.
relative_csv_path = "../data/sales_for_data_cleaning.csv"
inputFile = translate_to_file_string(relative_csv_path)

### Spark Session Creation

In [None]:
# Obtain the Spark session for this notebook (getOrCreate reuses an active
# session if one exists) and restrict Spark's log output to errors.
session_builder = SparkSession.builder.appName("Sales Data Cleaning")
spark = session_builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

### Create DataFrame from CSV File

In [None]:
# Read the semicolon-delimited CSV, treating the first line as the header and
# letting Spark infer the column types from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
)
# printSchema() prints the schema itself and returns None, so wrapping it in
# print() would emit an extra "None" line after the schema.
df.printSchema()

Lowercase the division column

In [None]:
# Add a lower-cased copy of the "division" column and show the result through
# the print_df helper (called with 10 as its second argument).
# NOTE(review): the new column name is spelled "devision_lowerd"; it is kept
# unchanged here because renaming it would alter the DataFrame schema.
division_lowercase = lower(col("division"))
df = df.withColumn("devision_lowerd", division_lowercase)
print_df(df, 10)

Drop rows with missing values

In [None]:
# Print the row count before and after removing every row that contains a
# null, so the number of dropped rows is visible, then show the cleaned data.
rows_before = df.count()
print(rows_before)
df_without_na = df.na.drop()
rows_after = df_without_na.count()
print(rows_after)
print_df(df_without_na, 10)

Fill missing values with the column average

In [None]:
# Report how many null values each column contains (one count job per column).
for curr_col in df.columns:
    print(curr_col, " Null values ", df.filter(col(curr_col).isNull()).count())

# Author's observation from the counts above: only "training level" and
# "work experience" contain nulls, so only those two columns are filled.

# Compute both column means in a single aggregation rather than one Spark job
# per column. mean() skips nulls and yields None when a column has no
# non-null values at all.
avg_row = df.select(
    mean(col("training level")),
    mean(col("work experience")),
).first()
avg_training_level, avg_work_experience = avg_row[0], avg_row[1]
print(avg_training_level)
print(avg_work_experience)

# Replace nulls with the rounded column mean. A column whose mean is None
# (entirely null) is left untouched, because round(None) raises a TypeError.
fill_values = {
    column_name: round(column_avg)
    for column_name, column_avg in (
        ("work experience", avg_work_experience),
        ("training level", avg_training_level),
    )
    if column_avg is not None
}
df_with_filled_nulls = df.fillna(fill_values) if fill_values else df

print_df(df_with_filled_nulls)


In [None]:
# Stop the Spark session now that the notebook no longer needs it.
spark.stop()