# Build Features on the Sales Dataset in Spark

In [1]:
import sys
sys.path.append("..")
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number   
from pyspark.ml.feature import StringIndexer, VectorAssembler

## Select the sales file

In [2]:
# HDFS URI of the sales CSV; passed to the CSV reader when the DataFrame is created.
inputFile = "hdfs:///data/sales.csv"

## Create the Spark Session 

In [3]:
# Obtain the SparkSession for this notebook (an already running session is
# reused by getOrCreate()).
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("SalesFeatureEngineering")
    .getOrCreate()
)

# Configure a CSV reader: the first line holds the column names, column types
# are inferred from the data, and fields are comma-separated.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
)

# Load the sales file into a DataFrame.
df = csv_reader.csv(inputFile)

## Data Preparation
### Build new features from existing ones

In [4]:
# Show the inferred schema so the column names/types used below can be checked.
df.printSchema()

# Rank all rows by sales, highest first (rank 1 = best seller).
#
# row_number() hands out arbitrary ranks to rows that tie on the ordering
# key, so ordering by "sales" alone can yield a different "salesrank" on every
# run. The remaining columns are appended as tie-breakers to make the ranking
# reproducible; rows that differ in "sales" keep exactly the same order.
#
# NOTE: the window has no partitionBy, so Spark evaluates it over all rows in
# a single partition. That is acceptable for a small dataset, but it will not
# scale to large inputs.
tie_breakers = [col(name) for name in df.columns if name != "sales"]
df_window = Window.orderBy(col("sales").desc(), *tie_breakers)
result_df = df.withColumn("salesrank", row_number().over(df_window))
result_df.show()

# Fit one StringIndexer per categorical column. Each fitted model maps the
# string values of its input column to numeric indices in a new column.
division_indexer = StringIndexer(
    inputCol="division", outputCol="division_num"
).fit(result_df)
education_indexer = StringIndexer(
    inputCol="level of education", outputCol="education_num"
).fit(result_df)


root
 |-- division: string (nullable = true)
 |-- level of education: string (nullable = true)
 |-- training level: integer (nullable = true)
 |-- work experience: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- sales: integer (nullable = true)

+-----------------+------------------+--------------+---------------+------+------+---------+
|         division|level of education|training level|work experience|salary| sales|salesrank|
+-----------------+------------------+--------------+---------------+------+------+---------+
|         printers| bachelor's degree|             2|             14|136385|663851|        1|
|         printers|      some college|             3|             13|130663|655722|        2|
|  office supplies| bachelor's degree|             3|             13|143317|637779|        3|
|  office supplies|      some college|             3|             12|130435|632053|        4|
|      peripherals|      some college|             2|             12|12551

### Select the feature columns

In [5]:
# Start from every column of result_df and drop the ones that must not go
# into the feature vector: the two raw string columns and "sales".
# list.remove() raises ValueError if a column is missing, which surfaces
# schema changes early.
#
# NOTE(review): this list is derived from result_df, i.e. before the
# StringIndexer models are applied, so "division_num" and "education_num" are
# not part of it -- confirm whether the indexed columns should be features.
excluded_columns = ("division", "level of education", "sales")
featureCols = list(result_df.columns)
for column_name in excluded_columns:
    featureCols.remove(column_name)
print(featureCols)

['training level', 'work experience', 'salary', 'salesrank']


### Build the feature Vector Assembler

In [6]:
# Assembler that packs the selected feature columns into one vector column
# named "features". A copy of featureCols is passed so that later changes to
# the list do not affect the assembler's configuration.
assembler = (
    VectorAssembler()
    .setInputCols(list(featureCols))
    .setOutputCol("features")
)

## Do the Data Preparation

In [7]:

# Apply the fitted indexers one after the other: first "division", then
# "level of education". Each step appends its numeric index column.
division_indexed = division_indexer.transform(result_df)
indexed_data = education_indexer.transform(division_indexed)

# Append the "features" vector column and preview the result.
labeledPointData = assembler.transform(indexed_data)
labeledPointData.show()


+-----------------+------------------+--------------+---------------+------+------+---------+------------+-------------+--------------------+
|         division|level of education|training level|work experience|salary| sales|salesrank|division_num|education_num|            features|
+-----------------+------------------+--------------+---------------+------+------+---------+------------+-------------+--------------------+
|         printers| bachelor's degree|             2|             14|136385|663851|        1|         0.0|          3.0|[2.0,14.0,136385....|
|         printers|      some college|             3|             13|130663|655722|        2|         0.0|          0.0|[3.0,13.0,130663....|
|  office supplies| bachelor's degree|             3|             13|143317|637779|        3|         1.0|          3.0|[3.0,13.0,143317....|
|  office supplies|      some college|             3|             12|130435|632053|        4|         1.0|          0.0|[3.0,12.0,130435....|
|     

In [8]:
# Stop the SparkSession now that feature building is finished.
spark.stop()