_In this notebook, we are exploring **VectorAssembler**_

In [0]:
# Import Libs

# General 
import pandas as pd

# Pyspark
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline

_In this experiment we will be transforming numeric & array (vector) columns_

In [0]:
# Experiment #1 input: plain numeric columns plus one dense-vector column.
# `Vectors.dense` builds the 3-element `userFeatures` vector for every row.

df1 = spark.createDataFrame(
    data=[
        (0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
        (1, 20, 2.0, Vectors.dense([0.1, 11.0, 0.1]), 0.0),
        (3, 40, 2.0, Vectors.dense([0.3, 10.0, 0.1]), 1.0),
        (4, 20, 2.0, Vectors.dense([0.5, 15.0, 0.1]), 0.0),
        (5, 60, 6.0, Vectors.dense([0.8, 11.0, 0.1]), 1.0),
    ],
    schema=["id", "hour", "mobile", "userFeatures", "clicked"],
)

display(df1)

id,hour,mobile,userFeatures,clicked
0,18,1.0,"Map(vectorType -> dense, length -> 3, values -> List(0.0, 10.0, 0.5))",1.0
1,20,2.0,"Map(vectorType -> dense, length -> 3, values -> List(0.1, 11.0, 0.1))",0.0
3,40,2.0,"Map(vectorType -> dense, length -> 3, values -> List(0.3, 10.0, 0.1))",1.0
4,20,2.0,"Map(vectorType -> dense, length -> 3, values -> List(0.5, 15.0, 0.1))",0.0
5,60,6.0,"Map(vectorType -> dense, length -> 3, values -> List(0.8, 11.0, 0.1))",1.0


In [0]:
# Flatten 'hour', 'mobile' and the 3-element 'userFeatures' vector into a single
# 5-element 'Features' vector; 'id' and 'clicked' are left out of the assembly.
# handleInvalid='skip' tells the assembler to drop rows holding invalid (null/NaN)
# values instead of raising; this sample contains none, so all 5 rows survive.
vec_Assembled = VectorAssembler(
    inputCols=['hour', 'mobile', 'userFeatures'],
    outputCol='Features',
    handleInvalid='skip',
)
df1_assembled = vec_Assembled.transform(df1)
display(df1_assembled)

id,hour,mobile,userFeatures,clicked,Features
0,18,1.0,"Map(vectorType -> dense, length -> 3, values -> List(0.0, 10.0, 0.5))",1.0,"Map(vectorType -> dense, length -> 5, values -> List(18.0, 1.0, 0.0, 10.0, 0.5))"
1,20,2.0,"Map(vectorType -> dense, length -> 3, values -> List(0.1, 11.0, 0.1))",0.0,"Map(vectorType -> dense, length -> 5, values -> List(20.0, 2.0, 0.1, 11.0, 0.1))"
3,40,2.0,"Map(vectorType -> dense, length -> 3, values -> List(0.3, 10.0, 0.1))",1.0,"Map(vectorType -> dense, length -> 5, values -> List(40.0, 2.0, 0.3, 10.0, 0.1))"
4,20,2.0,"Map(vectorType -> dense, length -> 3, values -> List(0.5, 15.0, 0.1))",0.0,"Map(vectorType -> dense, length -> 5, values -> List(20.0, 2.0, 0.5, 15.0, 0.1))"
5,60,6.0,"Map(vectorType -> dense, length -> 3, values -> List(0.8, 11.0, 0.1))",1.0,"Map(vectorType -> dense, length -> 5, values -> List(60.0, 6.0, 0.8, 11.0, 0.1))"


In [0]:
# Keep only the assembled 'Features' vector and the 'clicked' column
df1_selected = df1_assembled.select('Features', 'clicked')
display(df1_selected)

Features,clicked
"Map(vectorType -> dense, length -> 5, values -> List(18.0, 1.0, 0.0, 10.0, 0.5))",1.0
"Map(vectorType -> dense, length -> 5, values -> List(20.0, 2.0, 0.1, 11.0, 0.1))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(40.0, 2.0, 0.3, 10.0, 0.1))",1.0
"Map(vectorType -> dense, length -> 5, values -> List(20.0, 2.0, 0.5, 15.0, 0.1))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(60.0, 6.0, 0.8, 11.0, 0.1))",1.0


_In this experiment we will be transforming categorical features as well_

In [0]:
# Experiment #2 input: a small table mixing string (categorical) and integer columns.

# Explicit schema, one `.add(name, type)` call per column
schema = (
    StructType()
    .add("id", "integer")
    .add("name", "string")
    .add("qualification", "string")
    .add("age", "integer")
    .add("gender", "string")
)

# Dummy records, one tuple per row in schema order
data = [
    (1, 'John',     "B.A.",   20, "Male"),
    (2, 'Martha',   "B.Com.", 20, "Female"),
    (3, 'Mona',     "B.Com.", 21, "Female"),
    (4, 'Harish',   "B.Sc.",  22, "Male"),
    (5, 'Jonny',    "B.A.",   22, "Male"),
    (6, 'Maria',    "B.A.",   23, "Female"),
    (7, 'Monalisa', "B.A.",   21, "Female"),
]

# Build the Spark DataFrame from the records and the schema
df2 = spark.createDataFrame(data=data, schema=schema)
display(df2)

id,name,qualification,age,gender
1,John,B.A.,20,Male
2,Martha,B.Com.,20,Female
3,Mona,B.Com.,21,Female
4,Harish,B.Sc.,22,Male
5,Jonny,B.A.,22,Male
6,Maria,B.A.,23,Female
7,Monalisa,B.A.,21,Female


In [0]:
# Show how often each value occurs, first for 'qualification', then for 'gender'.
for categorical_column in ('qualification', 'gender'):
    display(df2.groupBy(categorical_column).count())

qualification,count
B.A.,4
B.Com.,2
B.Sc.,1


gender,count
Male,3
Female,4


In [0]:
# Frequency of each 'gender' value.
# NOTE(review): the preceding cell already displays this exact count; this cell
# looks like a leftover duplicate and could probably be removed.
gender_counts = df2.groupBy('gender').count()
display(gender_counts)

gender,count
Male,3
Female,4


In [0]:
# Map each 'qualification' string to a numeric index in 'qualification_Index'.
# With the default ordering the most frequent label gets index 0.0
# (B.A. -> 0.0, B.Com. -> 1.0, B.Sc. -> 2.0 in the output below).

qualification_indexer = StringIndexer(
    inputCol='qualification',
    outputCol='qualification_Index',
)
qualification_indexer_model = qualification_indexer.fit(df2)
df2_indexedQualification = qualification_indexer_model.transform(df2)
display(df2_indexedQualification)



id,name,qualification,age,gender,qualification_Index
1,John,B.A.,20,Male,0.0
2,Martha,B.Com.,20,Female,1.0
3,Mona,B.Com.,21,Female,1.0
4,Harish,B.Sc.,22,Male,2.0
5,Jonny,B.A.,22,Male,0.0
6,Maria,B.A.,23,Female,0.0
7,Monalisa,B.A.,21,Female,0.0


In [0]:
# One-hot encode the numeric 'qualification_Index' produced by the StringIndexer above.
# OneHotEncoder drops the last category by default (dropLast=True), so the 3 labels
# become length-2 sparse vectors and index 2.0 (B.Sc.) is encoded as an all-zero vector.
qualification_ohe = OneHotEncoder(
    inputCol='qualification_Index',
    outputCol='qualification_Ohe',
)
qualification_ohe_model = qualification_ohe.fit(df2_indexedQualification)
df2_oneQualification = qualification_ohe_model.transform(df2_indexedQualification)
display(df2_oneQualification)

id,name,qualification,age,gender,qualification_Index,qualification_Ohe
1,John,B.A.,20,Male,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))"
2,Martha,B.Com.,20,Female,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))"
3,Mona,B.Com.,21,Female,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))"
4,Harish,B.Sc.,22,Male,2.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())"
5,Jonny,B.A.,22,Male,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))"
6,Maria,B.A.,23,Female,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))"
7,Monalisa,B.A.,21,Female,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))"


_In this experiment I am putting everything together_

In [0]:
# Experiment #3 input: two string columns (x1, x2), two numeric columns (x3, x4)
# and two more columns named y1 (integer) and y2 (string).
# The pandas frame is built row by row and then handed to Spark.
pdf = pd.DataFrame(
    [
        ('a', 'apple',  1, 2.4, 1, 'yes'),
        ('a', 'orange', 1, 2.5, 0, 'no'),
        ('b', 'orange', 2, 3.5, 1, 'no'),
        ('b', 'orange', 2, 1.4, 0, 'yes'),
        ('b', 'peach',  2, 2.1, 0, 'yes'),
        ('c', 'peach',  4, 1.5, 1, 'yes'),
    ],
    columns=['x1', 'x2', 'x3', 'x4', 'y1', 'y2'],
)

df3 = spark.createDataFrame(pdf)
display(df3)

x1,x2,x3,x4,y1,y2
a,apple,1,2.4,1,yes
a,orange,1,2.5,0,no
b,orange,2,3.5,1,no
b,orange,2,1.4,0,yes
b,peach,2,2.1,0,yes
c,peach,4,1.5,1,yes


In [0]:
# Print df3's column names and the Spark types inferred from the pandas DataFrame.
df3.printSchema()

root
 |-- x1: string (nullable = true)
 |-- x2: string (nullable = true)
 |-- x3: long (nullable = true)
 |-- x4: double (nullable = true)
 |-- y1: long (nullable = true)
 |-- y2: string (nullable = true)



In [0]:
# Configure string indexing followed by one-hot encoding for every string-typed column.
# NOTE(review): the dtype filter also selects 'y2'; if y2 is meant to be a label
# rather than a feature, it probably should be excluded here - confirm intent.
categorical_cols = [col_name for col_name, col_type in df3.dtypes if col_type == 'string']
index_output_cols = [col_name + '_Index' for col_name in categorical_cols]
ohe_output_cols = [col_name + '_ohe' for col_name in categorical_cols]

# The plural `inputCols` / `outputCols` parameters handle several columns in one stage.
string_indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=index_output_cols,
)
onehot_encoding = OneHotEncoder(
    inputCols=index_output_cols,
    outputCols=ohe_output_cols,
)

In [0]:
# Configure the VectorAssembler stage.

# Every column whose Spark type is not 'string' is treated as numeric.
# NOTE(review): this filter also selects 'y1'; if y1 is meant to be a label
# rather than a feature, it probably should be excluded here - confirm intent.
numeric_cols = [col_name for col_name, col_type in df3.dtypes if col_type != 'string']

# Assembler inputs: the numeric columns first, then the one-hot encoded vectors.
assembler_input = numeric_cols + ohe_output_cols

# Assemble all inputs into the single vector column 'Features_to_Model'.
Vec_Assembler = VectorAssembler(
    inputCols=assembler_input,
    outputCol='Features_to_Model',
)

In [0]:
# Chain the three stages into one Pipeline, fit it on df3 and transform df3.

# Stage order matters: index the strings, one-hot encode the indices, then assemble.
all_stages = [string_indexer, onehot_encoding, Vec_Assembler]

feature_pipeline = Pipeline(stages=all_stages)
feature_pipeline_model = feature_pipeline.fit(df3)
Final_df = feature_pipeline_model.transform(df3)
display(Final_df)

x1,x2,x3,x4,y1,y2,x1_Index,x2_Index,y2_Index,x1_ohe,x2_ohe,y2_ohe,Features_to_Model
a,apple,1,2.4,1,yes,1.0,2.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 8, values -> List(1.0, 2.4, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0))"
a,orange,1,2.5,0,no,1.0,0.0,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 8, indices -> List(0, 1, 4, 5), values -> List(1.0, 2.5, 1.0, 1.0))"
b,orange,2,3.5,1,no,0.0,0.0,1.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(), values -> List())","Map(vectorType -> dense, length -> 8, values -> List(2.0, 3.5, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0))"
b,orange,2,1.4,0,yes,0.0,0.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 8, values -> List(2.0, 1.4, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0))"
b,peach,2,2.1,0,yes,0.0,1.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 8, values -> List(2.0, 2.1, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0))"
c,peach,4,1.5,1,yes,2.0,1.0,0.0,"Map(vectorType -> sparse, length -> 2, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 2, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 1, indices -> List(0), values -> List(1.0))","Map(vectorType -> dense, length -> 8, values -> List(4.0, 1.5, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0))"
