<a href="https://colab.research.google.com/github/sanjaynishanth/pyspark-tutorial/blob/main/pysparksimalritysearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preprocess the data using MinMaxScaler

In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType


# Obtain the Spark session for this notebook: getOrCreate() returns an
# already-running session if there is one, otherwise it starts a new one
# under the given application name.
spark = (
    SparkSession.builder
    .appName("Preprocessing with MinMax Scaling")
    .getOrCreate()
)


# Read the customer CSV: the first row supplies the column names and Spark
# infers each column's type from the data instead of treating all as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/Mall_Customers.csv")
)

# Print the inferred schema so the column names/types used below are visible.
df.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Annual Income (k$): integer (nullable = true)
 |-- Spending Score (1-100): integer (nullable = true)



In [21]:
# Encode the string "Gender" column as a numeric index in a new
# "Gender_Indexed" column so it can be placed in a feature vector.
gender_indexer = StringIndexer(
    inputCol="Gender",
    outputCol="Gender_Indexed",
)

# Fitting learns the label -> index mapping from df; transforming appends
# the indexed column to the same rows.
gender_indexer_model = gender_indexer.fit(df)
df_indexed = gender_indexer_model.transform(df)


In [22]:
# Cast the three integer-typed numeric columns to double, replacing each
# column in place (same name, same position) on df_indexed.
for numeric_column in ("Age", "Annual Income (k$)", "Spending Score (1-100)"):
    df_indexed = df_indexed.withColumn(
        numeric_column,
        df_indexed[numeric_column].cast("double"),
    )

In [23]:
# Columns that make up the feature vector, in vector order.
input_columns = [
    "Gender_Indexed",
    "Age",
    "Annual Income (k$)",
    "Spending Score (1-100)",
]

# Pack the listed columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=input_columns,
    outputCol="features",
)
df_assembled = assembler.transform(df_indexed)

In [24]:
# Min-max scale the assembled "features" vector into "scaledFeatures".
scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

# fit() computes the per-feature statistics from df_assembled; transform()
# then appends the rescaled vector column to those same rows.
scaler_model = scaler.fit(df_assembled)
scaled_data = scaler_model.transform(df_assembled)

In [25]:
# Preview the original customer fields next to the scaled feature vector;
# truncate=False keeps the full vector text visible in each row.
preview_columns = [
    "CustomerID",
    "Gender",
    "Age",
    "Annual Income (k$)",
    "Spending Score (1-100)",
    "scaledFeatures",
]
scaled_data.select(*preview_columns).show(truncate=False)

+----------+------+----+------------------+----------------------+-----------------------------------------------------------------+
|CustomerID|Gender|Age |Annual Income (k$)|Spending Score (1-100)|scaledFeatures                                                   |
+----------+------+----+------------------+----------------------+-----------------------------------------------------------------+
|1         |Male  |19.0|15.0              |39.0                  |[1.0,0.019230769230769232,0.0,0.3877551020408163]                |
|2         |Male  |21.0|15.0              |81.0                  |[1.0,0.057692307692307696,0.0,0.8163265306122448]                |
|3         |Female|20.0|16.0              |6.0                   |[0.0,0.038461538461538464,0.00819672131147541,0.0510204081632653]|
|4         |Female|23.0|16.0              |77.0                  |[0.0,0.09615384615384616,0.00819672131147541,0.7755102040816326] |
|5         |Female|31.0|17.0              |40.0                  |[0.