# Sales Data Binning

In [None]:
import sys
sys.path.append("..")
# Spark libs
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, lower, max, min
from pyspark.ml.feature import Bucketizer, QuantileDiscretizer
# helpers
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

### Select the Input File

In [None]:
# Location of the raw sales CSV, given relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it
# turns the relative path into a path string Spark can read -- confirm.
relative_csv_path = "../data/sales_for_data_cleaning.csv"
inputFile = translate_to_file_string(relative_csv_path)

### Spark Session Creation

In [None]:
# Configure the session under this notebook's application name, then obtain
# it (getOrCreate reuses an already-running session if there is one).
session_builder = SparkSession.builder.appName("Sales Data Binning")
spark = session_builder.getOrCreate()

# Only report errors so the cell outputs are not drowned in log lines.
spark.sparkContext.setLogLevel("ERROR")

### Create Dataframe from csv File

In [None]:
# Load the semicolon-delimited CSV, taking the column names from the header
# row and letting Spark infer the column types.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
)
df = csv_reader.csv(inputFile)

# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.printSchema()

Convert the division column to lowercase

In [None]:
# Add a lower-cased copy of the "division" column next to the original.
# NOTE(review): the output column name is spelled "devision_lowerd"; it is
# kept unchanged here so the resulting schema stays the same.
division_in_lowercase = lower(col("division"))
df = df.withColumn("devision_lowerd", division_in_lowercase)
# NOTE(review): print_df is a project helper; presumably it displays the
# first 10 rows -- confirm.
print_df(df, 10)

Filter out rows with missing values

In [None]:
# Row count before cleaning, for comparison with the count afterwards.
total_row_count = df.count()
print(total_row_count)

# Drop every row that has a null in any column (dropna() is the DataFrame
# alias of na.drop()).
df_without_na = df.dropna()
remaining_row_count = df_without_na.count()
print(remaining_row_count)

print_df(df_without_na, 10)

### Binning of Salary

In [None]:
# Number of equal-width salary buckets to create.
num_buckets = 10

# Fetch both bounds in a single Spark job. `min` / `max` here are the
# pyspark.sql.functions aggregates imported at the top of the notebook,
# not the Python builtins.
salary_bounds = df.agg(min(col("salary")), max(col("salary"))).first()
if salary_bounds[0] is None or salary_bounds[1] is None:
    raise ValueError("Column 'salary' contains no values to bin")

# Use the exact bounds: rounding the minimum could lift the lowest split
# above the smallest salary and push that row outside the bucket range.
min_salary = float(salary_bounds[0])
max_salary = float(salary_bounds[1])
if max_salary <= min_salary:
    raise ValueError("Column 'salary' needs at least two distinct values to bin")

# Bucketizer needs strictly increasing splits that cover every value; with
# its default handleInvalid="error" any salary outside the splits raises.
# range(min, max, step) never contains max, so the upper bound is appended
# explicitly. The last bucket is closed on the right and therefore holds the
# maximum salary itself.
bucket_width = (max_salary - min_salary) / num_buckets
splits = [min_salary + i * bucket_width for i in range(num_buckets)]
splits.append(max_salary)
print(splits)
bucketizer = Bucketizer(splits=splits, inputCol="salary", outputCol="salary_bucket")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(df)
print_df(bucketedData, 10)

In [None]:
# Quantile-based binning: instead of fixed-width ranges, the split points are
# learned from the salary distribution, targeting three buckets.
discretizer = QuantileDiscretizer(
    numBuckets=3,
    inputCol="salary",
    outputCol="salary_bucket_quantile",
)

# fit() derives the split points from df; the fitted model then appends the
# bucket index column to the same data.
quantile_model = discretizer.fit(df)
result = quantile_model.transform(df)
print_df(result, 10)


In [None]:
# Shut down the Spark session created at the top of this notebook; cells that
# use `spark` or the DataFrames derived from it cannot be re-run afterwards
# without creating a new session.
spark.stop()