# Sales Data Transformation

In [None]:
import sys
sys.path.append("..")
# Spark libs
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer, QuantileDiscretizer, VectorAssembler , Normalizer, StandardScaler, MinMaxScaler
# helpers
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

### Select the Input File

In [None]:
# Location of the sales CSV, relative to this notebook, passed through the
# project's path helper.
# NOTE(review): presumably the helper returns a path/URI string that Spark can
# read directly — confirm in helpers.path_translation.
inputFile = translate_to_file_string("../data/sales.csv")

### Spark Session Creation

In [None]:
# Obtain the Spark session for this notebook: an already running session is
# reused, otherwise a new one is created under the given application name.
spark = SparkSession.builder.appName("Sales Data Cleaning").getOrCreate()

# Keep the notebook output readable by only surfacing Spark errors.
spark.sparkContext.setLogLevel("ERROR")

### Create Dataframe from csv File

In [None]:
# Load the sales CSV into a DataFrame:
#   - header:      the first line holds the column names
#   - inferSchema: let Spark derive column types from the data
#   - delimiter:   fields are comma separated
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
    .csv(inputFile)
)

# printSchema() writes the schema to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.printSchema()

### Encoding of Qualitative (Categorical) Attributes

In [None]:
# Fit one string indexer per categorical column; each maps the column's
# string values to numeric indices.
division_indexer = StringIndexer(
    inputCol="division",
    outputCol="division_num",
).fit(df)
education_indexer = StringIndexer(
    inputCol="level of education",
    outputCol="education_num",
).fit(df)

# Apply both indexers one after the other: division first, then education.
df_indexed = division_indexer.transform(df)
df_indexed = education_indexer.transform(df_indexed)

# One-hot encode the education index; dropLast=False keeps a vector slot for
# every category instead of dropping the last one.
education_encoder = OneHotEncoder(
    inputCol="education_num",
    outputCol="education_cat_vector",
    dropLast=False,
).fit(df_indexed)
df_encoded = education_encoder.transform(df_indexed)
print_df(df_encoded, 10)

## Discretize sales

In [None]:
# Bin the continuous "sales" column into quantile-based buckets
# (10 buckets requested).
discretizer = (
    QuantileDiscretizer()
    .setNumBuckets(10)
    .setInputCol("sales")
    .setOutputCol("sales_bucket_quantile")
)

# The bucket boundaries are learned from the raw DataFrame and then applied to
# the already encoded DataFrame.
discretizer_model = discretizer.fit(df)
df_dis = discretizer_model.transform(df_encoded)
print_df(df_dis, 10)

### Build labeled point semantic vector

In [None]:
# Columns that are combined into the single "features" vector column.
feature_cols = [
    "training level",
    "work experience",
    "salary",
    "sales_bucket_quantile",
    "division_num",
    "education_cat_vector",
]

# Assemble the selected columns into one vector per row.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_lp = assembler.transform(df_dis)
print_df(df_lp, 10)

### Normalization 

In [None]:
# Row-wise normalization of the feature vector using the p-norm with p=1.0.
normalizer = Normalizer(
    p=1.0,
    inputCol="features",
    outputCol="norm_features",
)
df_norm = normalizer.transform(df_lp)
print_df(df_norm, 10)

### Standardization

In [None]:
# Standardize each feature to unit standard deviation (withStd=True) without
# centering on the mean (withMean=False).
# NOTE(review): the scaler reads the raw "features" column, not
# "norm_features", although it is fitted on df_norm — confirm this is intended.
scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)

# Learn the per-feature statistics, then append the scaled vector column.
scalerModel = scaler.fit(df_norm)
df_norm_scaled = scalerModel.transform(df_norm)
print_df(df_norm_scaled, 10)

In [None]:
# Min-max scaling: rescale each feature of the assembled vector individually
# using the scaler's configured (default) output range.
mm_estimator = MinMaxScaler().setInputCol("features").setOutputCol("nn_features")

# mm_scaler holds the fitted model, which is then applied to the same data.
mm_scaler = mm_estimator.fit(df_lp)
df_nn = mm_scaler.transform(df_lp)
print_df(df_nn, 10)

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()