In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.pipeline import Pipeline


In [2]:
# Start (or reuse) the Spark session for this notebook under the app name "demo02".
spark = (
    SparkSession.builder
    .appName("demo02")
    .getOrCreate()
)

24/12/27 10:49:37 WARN Utils: Your hostname, nilesh-pc resolves to a loopback address: 127.0.1.1; using 192.168.1.101 instead (on interface wlp0s20f3)
24/12/27 10:49:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/27 10:49:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the customers CSV: the first line is a header row and column types are
# inferred by Spark while reading.
customers_csv = "file:///home/nilesh/dbda-aug24/BigData/day11/customers.csv"
data = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(customers_csv)
)

# Display the inferred column names and types.
data.printSchema()

root
 |-- CustID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Purchased: integer (nullable = true)



In [4]:
# Rename the target column to "label", the column name the classifier and the
# evaluator are configured to read further down.
in_data = data.withColumnRenamed(existing="Purchased", new="label")
in_data.show(n=3, truncate=False)

+--------+------+---+------+-----+
|CustID  |Gender|Age|Salary|label|
+--------+------+---+------+-----+
|15624510|Male  |19 |19000 |0    |
|15810944|Male  |35 |20000 |0    |
|15668575|Female|26 |43000 |0    |
+--------+------+---+------+-----+
only showing top 3 rows



In [5]:
# Encode the string column "Gender" as a numeric index column "GenderIndexed"
# so it can be placed into the feature vector.
genderIndexer = StringIndexer(inputCol="Gender", outputCol="GenderIndexed")

In [6]:
# Combine the numeric inputs (including the indexed gender) into a single
# vector column named "features".
vectAssembler = VectorAssembler(
    inputCols=["Age", "Salary", "GenderIndexed"],
    outputCol="features",
)

In [8]:
# Logistic-regression estimator reading the assembled "features" vector and
# the "label" column; all other parameters stay at their defaults.
logisticModel = LogisticRegression(featuresCol="features", labelCol="label")

In [10]:
# Chain the stages in execution order: index gender -> assemble features -> classify.
pipeline_stages = [genderIndexer, vectAssembler, logisticModel]
pipeline = Pipeline().setStages(pipeline_stages)
print(pipeline)

Pipeline_7f58b7d7197b


In [11]:
# 80/20 train/test split; the fixed seed makes the split repeatable.
train_df, test_df = in_data.randomSplit([0.8, 0.2], seed=2809)

In [12]:
# Fit every pipeline stage on the training split, producing a PipelineModel.
model = pipeline.fit(train_df)

# Run the fitted pipeline over the held-out split; this appends the
# intermediate and prediction columns to the test rows.
predictions = model.transform(test_df)

# Inspect the resulting schema and a small sample of scored rows.
predictions.printSchema()
predictions.show(3, truncate=False)

24/12/27 10:58:23 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


root
 |-- CustID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- GenderIndexed: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)

+--------+------+---+------+-----+-------------+------------------+----------------------------------------+-----------------------------------------+----------+
|CustID  |Gender|Age|Salary|label|GenderIndexed|features          |rawPrediction                           |probability                              |prediction|
+--------+------+---+------+-----+-------------+------------------+----------------------------------------+-----------------------------------------+----------+
|15571059|Female|33 |41000 |0    |0.0          |[33.0,41000.0,0.0]|[3.365546577267626,-3.365546577267

In [13]:
# BinaryClassificationEvaluator's default metric is the area under the ROC
# curve, not classification accuracy. The metric and the columns it reads are
# named explicitly here so the reported number is labelled for what it is.
auc_roc = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
).evaluate(predictions)

# Kept only so any cell that still reads `accuracy` keeps working;
# note that it holds the AUC value, not an accuracy.
accuracy = auc_roc

print(f"Model areaUnderROC: {auc_roc}")

Model Accuracy: 0.9309090909090909


In [14]:
# Persist the fitted PipelineModel. overwrite() keeps this cell re-runnable:
# a plain save() raises an error when the target path already exists.
model.write().overwrite().save("file:///tmp/model2")
print("Model is saved...")

                                                                                

Model is saved...


In [None]:
# See the coefficients of the model -- load the parquet data of the saved model
# and display it.
# The pipeline above is saved to /tmp/model2. A saved PipelineModel keeps each
# stage in its own directory under stages/<index>_<uid>/, so the glob below
# selects the data of the LogisticRegression stage.
# NOTE(review): the glob relies on the stage uid starting with
# "LogisticRegression_" (the default uid prefix) -- confirm if a custom uid is used.
modelPath = "file:///tmp/model2/stages/*_LogisticRegression_*/data"
df = spark.read.parquet(modelPath)

df.show(truncate=False)