In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the already-running SparkSession if there is one; otherwise start a
# new session with the builder's default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the CSV: the first row supplies the column names (header=True) and
# Spark infers each column's type from the data (inferSchema=True).
df = spark.read.csv(
    path="random.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded data; the defaults are spelled out (first 20 rows,
# long cell values truncated).
df.show(n=20, truncate=True)

+---+------+------+-----------+
|age|salary|weight|phone_price|
+---+------+------+-----------+
| 88| 70389|    67|       2577|
| 33| 67417|    50|       2823|
| 34| 85433|    53|       3889|
| 17| 59561|    67|       8168|
|  9| 40765|    67|       5536|
| 26| 38629|    56|       9887|
| 36| 65461|    16|       7963|
| 66| 35043|    54|       4247|
| 82| 66486|   100|       7640|
| 27| 76244|    28|      14348|
| 70| 66831|    18|       2662|
| 30| 40403|    18|      14887|
| 76| 84224|    57|       3989|
| 90| 34930|    32|       6256|
| 80| 15924|    87|       9385|
| 96| 53667|    31|       8926|
| 37| 55749|    75|       7339|
| 89| 99245|    40|       4948|
| 39| 94754|    22|       1873|
| 74| 38650|    41|       5996|
+---+------+------+-----------+
only showing top 20 rows



In [5]:
# Print the column names, inferred types and nullability as a tree, to confirm
# that inferSchema produced numeric columns.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- phone_price: integer (nullable = true)



In [6]:
from pyspark.ml.feature import StandardScaler

In [7]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [8]:
# List the column names; these are the inputs passed to the VectorAssembler below.
df.columns

['age', 'salary', 'weight', 'phone_price']

In [9]:
# Pack the four numeric columns into one vector column called 'features',
# which is the input column the StandardScaler below is configured to read.
assembler = VectorAssembler(
    inputCols=["age", "salary", "weight", "phone_price"],
    outputCol="features",
)

In [10]:
# Append the assembled 'features' vector column to the original DataFrame.
output = assembler.transform(dataset=df)

In [11]:
# Preview the assembled data (default 20 rows, long vectors truncated).
output.show(n=20, truncate=True)

+---+------+------+-----------+--------------------+
|age|salary|weight|phone_price|            features|
+---+------+------+-----------+--------------------+
| 88| 70389|    67|       2577|[88.0,70389.0,67....|
| 33| 67417|    50|       2823|[33.0,67417.0,50....|
| 34| 85433|    53|       3889|[34.0,85433.0,53....|
| 17| 59561|    67|       8168|[17.0,59561.0,67....|
|  9| 40765|    67|       5536|[9.0,40765.0,67.0...|
| 26| 38629|    56|       9887|[26.0,38629.0,56....|
| 36| 65461|    16|       7963|[36.0,65461.0,16....|
| 66| 35043|    54|       4247|[66.0,35043.0,54....|
| 82| 66486|   100|       7640|[82.0,66486.0,100...|
| 27| 76244|    28|      14348|[27.0,76244.0,28....|
| 70| 66831|    18|       2662|[70.0,66831.0,18....|
| 30| 40403|    18|      14887|[30.0,40403.0,18....|
| 76| 84224|    57|       3989|[76.0,84224.0,57....|
| 90| 34930|    32|       6256|[90.0,34930.0,32....|
| 80| 15924|    87|       9385|[80.0,15924.0,87....|
| 96| 53667|    31|       8926|[96.0,53667.0,3

In [12]:
# Configure the scaler: withStd=True scales each feature to unit standard
# deviation; withMean=False leaves the data un-centred.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

In [13]:
# Fit the scaler on the assembled data to learn the per-feature statistics
# it needs for rescaling.
scalerModel = scaler.fit(dataset=output)

In [14]:
# Apply the fitted model: adds a 'scaledFeatures' column in which every
# feature has been rescaled to unit standard deviation.
scaledData = scalerModel.transform(dataset=output)

# Preview the result (default 20 rows, long vectors truncated).
scaledData.show(n=20, truncate=True)

+---+------+------+-----------+--------------------+--------------------+
|age|salary|weight|phone_price|            features|      scaledFeatures|
+---+------+------+-----------+--------------------+--------------------+
| 88| 70389|    67|       2577|[88.0,70389.0,67....|[3.19421932670098...|
| 33| 67417|    50|       2823|[33.0,67417.0,50....|[1.19783224751286...|
| 34| 85433|    53|       3889|[34.0,85433.0,53....|[1.23413019440719...|
| 17| 59561|    67|       8168|[17.0,59561.0,67....|[0.61706509720359...|
|  9| 40765|    67|       5536|[9.0,40765.0,67.0...|[0.32668152204896...|
| 26| 38629|    56|       9887|[26.0,38629.0,56....|[0.94374661925256...|
| 36| 65461|    16|       7963|[36.0,65461.0,16....|[1.30672608819585...|
| 66| 35043|    54|       4247|[66.0,35043.0,54....|[2.39566449502573...|
| 82| 66486|   100|       7640|[82.0,66486.0,100...|[2.97643164533500...|
| 27| 76244|    28|      14348|[27.0,76244.0,28....|[0.98004456614689...|
| 70| 66831|    18|       2662|[70.0,6

In [15]:
# Show only the scaled vectors, printed in full (truncate=False) so every
# component is visible.
(
    scaledData
    .select("scaledFeatures")
    .show(truncate=False)
)

+-----------------------------------------------------------------------------+
|scaledFeatures                                                               |
+-----------------------------------------------------------------------------+
|[3.1942193267009857,2.85720140154862,2.513069759784753,0.7089136205740189]   |
|[1.1978322475128698,2.736563197206997,1.8754251938692186,0.7765863992551243] |
|[1.2341301944071992,3.4678612757462566,1.9879507055013717,1.0698351068732477]|
|[0.6170650972035996,2.417675669176112,2.513069759784753,2.246956326289711]   |
|[0.3266815220489645,1.654716150735619,2.513069759784753,1.5229126129211363]  |
|[0.9437466192525641,1.5680125153137796,2.100476217133525,2.7198404992686553] |
|[1.306726088195858,2.6571660479162116,0.60013606203815,2.190562344055457]    |
|[2.3956644950257395,1.4224510749473396,2.0254592093787562,1.1683182563359946]|
|[2.9764316453350097,2.6987724272736013,3.750850387738437,2.101707435461973]  |
|[0.9800445661468934,3.094865158755956,1