In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [None]:
# Entry point for all DataFrame work in this notebook; getOrCreate() hands back
# an already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("demo01")
    .getOrCreate()
)

In [None]:
# Load the customers CSV from the local filesystem. The first row supplies the
# column names and Spark infers each column's type from the data.
data = spark.read.csv(
    "file:///home/nilesh/dbda-aug24/BigData/day11/customers.csv",
    inferSchema="true",
    header="true",
)

# Confirm the inferred column names and types.
data.printSchema()

In [None]:
# No cleaning step yet: data1 is simply another name for the raw frame.
data1 = data
# Peek at the first three rows without truncating wide cell values.
data1.show(3, truncate=False)

In [None]:
# Encode the string column "Gender" as a numeric index column "GenderIndexed"
# so that it can later be fed into a feature vector.
# The chain is wrapped in parentheses rather than ended with a trailing
# backslash: a dangling "\" only parses while the following line stays blank.
genderIndexer = (
    StringIndexer()
    .setInputCol("Gender")
    .setOutputCol("GenderIndexed")
)

# fit() learns the label-to-index mapping from data1; transform() appends the
# indexed column to a new frame.
data2 = genderIndexer.fit(data1).transform(data1)

data2.printSchema()
data2.show(truncate=False, n=3)

In [None]:
# Pack the three predictor columns into the single vector column "features".
vectAssembler = VectorAssembler(
    inputCols=["Age", "Salary", "GenderIndexed"],
    outputCol="features",
)

# VectorAssembler is a pure transformer, so there is no fit() step.
data3 = vectAssembler.transform(data2)

data3.printSchema()
data3.show(3, truncate=False)

In [None]:
# Rename the target column "Purchased" to "label", the column name the
# classifier and evaluator below use for the ground truth.
data4 = data3.withColumnRenamed(existing="Purchased", new="label")

data4.printSchema()
data4.show(3, truncate=False)

In [10]:
# Random train/test split with weights 0.8 / 0.2; the fixed seed keeps the
# split repeatable across runs.
train_df, test_df = data4.randomSplit([0.8, 0.2], seed=2809)

In [None]:
# Train a logistic-regression classifier on the training split. The column
# names are spelled out here; they are the same as the estimator's defaults.
model = LogisticRegression(featuresCol="features", labelCol="label").fit(train_df)

# Show the fitted model's summary representation.
print(model)

In [None]:
# Score the held-out split; the fitted model appends its prediction columns
# to a copy of test_df.
predictions = model.transform(test_df)

predictions.printSchema()
predictions.show(3, truncate=False)

In [None]:
# BinaryClassificationEvaluator's default metric is areaUnderROC, not accuracy,
# so request the metric by name and report it under its real name.
area_under_roc = BinaryClassificationEvaluator(metricName="areaUnderROC")\
                .evaluate(predictions)
print(f"Model Area Under ROC: {area_under_roc}")

# Accuracy proper: the fraction of test rows whose predicted class equals the
# true label. Guard against an empty test split to avoid dividing by zero.
total_rows = predictions.count()
correct_rows = predictions\
                .filter(predictions["prediction"] == predictions["label"])\
                .count()
accuracy = correct_rows / total_rows if total_rows else float("nan")
print(f"Model Accuracy: {accuracy}")

In [None]:
# Persist the fitted model to the local filesystem. overwrite() replaces any
# copy left behind by a previous run; a plain save() raises when the target
# path already exists, which would break re-running the notebook.
model.write().overwrite().save("file:///tmp/model1")
print("Model is saved...")

In [None]:
# Inspect the coefficients of the saved model: read back the Parquet data
# stored under the model's "data" directory and display it in full.
modelPath = "file:///tmp/model1/data"
df = spark.read.parquet(modelPath)

df.show(20, truncate=False)