# Build Features on the Sales Dataset in Spark

In [None]:
import sys
sys.path.append("..")
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number   
from pyspark.ml.feature import StringIndexer, VectorAssembler

## Select the sales file

In [None]:
# Path of the sales CSV, relative to the directory the notebook runs in.
inputFile = "../data/sales.csv"

## Create the Spark Session 

In [None]:
# Obtain a SparkSession (an already running one is reused) that executes
# locally; "local[*]" is the master URL handed to the builder.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SalesFeatureEngineering")
    .getOrCreate()
)

# Load the CSV into a DataFrame: the first row provides the column names,
# fields are comma separated, and column types are inferred from the data.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ",")
)
df = csv_reader.csv(inputFile)

## Data Preparation
### Build new features from existing ones

In [None]:
# Show the column names and the types that were inferred while reading.
df.printSchema()

# Number the rows by descending "sales" (1 = highest sales). The window has
# no partitionBy clause, so Spark must move all rows into a single partition
# to compute the global row number.
df_window = Window.orderBy(col("sales").desc())
sales_rank = row_number().over(df_window)
result_df = df.withColumn("salesrank", sales_rank)
result_df.show()

# Fit one StringIndexer per categorical string column; each fitted model
# writes a numeric index for the string value into the given output column.
division_indexer = StringIndexer(
    inputCol="division",
    outputCol="division_num",
).fit(result_df)
education_indexer = StringIndexer(
    inputCol="level of education",
    outputCol="education_num",
).fit(result_df)


### Select the feature columns

In [None]:
# Columns that must not be passed to the VectorAssembler: the two raw string
# columns (the assembler only accepts numeric, boolean and vector inputs)
# and "sales" itself.
excluded_cols = {"division", "level of education", "sales"}
featureCols = [name for name in result_df.columns if name not in excluded_cols]

# Use the numeric columns produced by the StringIndexer models in place of
# the raw string columns; without them the categorical information would be
# missing from the feature vector even though the indexers are applied.
featureCols += ["division_num", "education_num"]

# NOTE(review): "salesrank" stays in the feature list although it is derived
# directly from "sales" -- confirm this is intended if "sales" is the label.
print(featureCols)

### Build the feature Vector Assembler

In [None]:
# Assembler that packs the selected feature columns into one vector column
# named "features"; it receives its own copy of the column-name list.
assembler = VectorAssembler(
    inputCols=list(featureCols),
    outputCol="features",
)

## Do the Data Preparation

In [None]:

# Apply the fitted indexers one after the other: first "division", then
# "level of education".
division_indexed = division_indexer.transform(result_df)
indexed_data = education_indexer.transform(division_indexed)

# Append the assembled "features" vector column and display the result.
labeledPointData = assembler.transform(indexed_data)
labeledPointData.show()


In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()