# Ingest data from GitHub

In [0]:
%sh
# Recreate the lab folder from scratch so the cell can be re-run safely:
#   -f  : do not error when the folder does not exist yet (fresh workspace)
#   -p  : do not error when the folder already exists
rm -rf /dbfs/ml_lab
mkdir -p /dbfs/ml_lab
# Download the customer CSV from GitHub into DBFS (URL quoted so the shell never interprets it).
wget -O /dbfs/ml_lab/customer1.csv "https://raw.githubusercontent.com/sachin365123/CSV-files-for-Data-Science-and-Machine-Learning/refs/heads/main/Telco-Customer-Churn%20with%20Registry%20Models.csv"

--2025-08-01 14:52:42--  https://raw.githubusercontent.com/sachin365123/CSV-files-for-Data-Science-and-Machine-Learning/refs/heads/main/Telco-Customer-Churn%20with%20Registry%20Models.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 189150 (185K) [text/plain]
Saving to: ‘/dbfs/ml_lab/customer1.csv’

     0K .......... .......... .......... .......... .......... 27%  696K 0s
    50K .......... .......... .......... .......... .......... 54%  982K 0s
   100K .......... .......... .......... .......... .......... 81% 3.20M 0s
   150K .......... .......... .......... ....                 100% 5.55M=0.1s

2025-08-01 14:52:43 (1.25 MB/s) - ‘/dbfs/ml_lab/customer1.csv’ saved [189150/189150]



# Read dataset

In [0]:
%python
from pyspark.sql.types import *
from pyspark.sql.functions import *
   
# Load the downloaded CSV, treating its first row as column names.
# No schema is supplied here; columns are cast to their intended types in the cleaning step.
data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("dbfs:/ml_lab/customer1.csv")
)

# Display columns

In [0]:
# List the column names taken from the CSV header row.
data.columns

['SeniorCitizen',
 'PropertyValue',
 'tenure',
 'PhoneService',
 'MonthlyCharges',
 'TotalCharges']

# clean dataset: Remove Null values

In [0]:
# Cast every column to its intended type FIRST, then drop incomplete rows.
# Order matters: a value that cannot be parsed as a number only becomes null
# after the cast, so calling dropna() before casting would let such rows slip
# through into the feature pipeline.
data = (
    data.select(
        col("PhoneService").astype("string"),
        col("MonthlyCharges").astype("float"),
        col("TotalCharges").astype("float"),
        col("PropertyValue").astype("float"),
        col("SeniorCitizen").astype("int"),
        col("tenure").astype("int"),
    )
    .dropna()
)
# Preview a random ~20% sample of the cleaned rows.
display(data.sample(0.2))

PhoneService,MonthlyCharges,TotalCharges,PropertyValue,SeniorCitizen,tenure
Yes,56.95,1889.5,3.0,0,34
No,42.3,1840.75,5.0,0,45
Yes,70.7,151.65,6.0,0,2
Yes,18.95,326.8,13.0,0,16
Yes,55.2,528.35,20.0,0,10
Yes,19.8,202.25,23.0,0,12
Yes,59.6,2970.3,26.0,0,49
Yes,55.3,1530.6,27.0,0,30
Yes,64.7,1093.1,31.0,0,17
Yes,95.5,181.65,33.0,1,2


# split my Data into Train and Test

In [0]:
# 70/30 train/test split. A fixed seed makes the split repeatable, so the row
# counts and the metrics reported further down stay comparable between runs.
splits = data.randomSplit([0.7, 0.3], seed=42)
train, test = splits
print ("Training Rows:", train.count(), " Testing Rows:", test.count())

Training Rows: 4913  Testing Rows: 2119


# Segregate into Category and numerical columns

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
   
# Categorical input column (indexed before it is used as a feature).
catFeature = "PhoneService"
# Numeric input columns (assembled into one vector and scaled).
numFeatures = [
    "MonthlyCharges",
    "TotalCharges",
    "PropertyValue",
    "tenure",
]

# Define the feature engineering and model training algorithm steps

In [0]:
# 1. Index the categorical column: each distinct string becomes a numeric index.
catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
# 2. Pack the numeric columns into a single vector column.
numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
# 3. Rescale that numeric vector with MinMaxScaler.
numScaler = MinMaxScaler(inputCol=numVector.getOutputCol(), outputCol="normalizedFeatures")
# 4. Combine the indexed category and the scaled numerics into the model input.
#    Input names are taken from the upstream stages instead of being repeated as
#    string literals, so renaming an output column above cannot silently break this stage.
featureVector = VectorAssembler(
    inputCols=[catIndexer.getOutputCol(), numScaler.getOutputCol()],
    outputCol="Features",
)


#### using the StringIndexer from PySpark to convert a categorical feature into a numerical index. 

#### inputCol=catFeature: This is the name of the categorical column you want to index.

#### outputCol=catFeature + "Idx": The name of the new column that will hold the indexed values (integers representing each category).

🧠 Example (VectorAssembler, the next step after indexing):
#### If your DataFrame looks like:

#### tenure	MonthlyCharges	TotalCharges
#### 1	      29.85	          29.85
#### 34	    56.95	          1889.50

#### Then after applying VectorAssembler, the output will be:

| numericFeatures        |
| ---------------------- |
| \[1.0, 29.85, 29.85]   |
| \[34.0, 56.95, 1889.5] |


### Combines multiple numerical columns into a single vector column called "numericFeatures"
#### Scales the "numericFeatures" vector to a 0–1 range, producing "normalizedFeatures"
#### Combines both categorical (now indexed) and scaled numerical features into one feature vector "Features" to be used in the model.


# Logistic Regression

In [0]:
# Logistic regression that predicts SeniorCitizen from the assembled "Features" vector.
# NOTE(review): in the recorded run the class 1 precision/recall are 0.0, i.e. the
# model predicted class 0 for every test row — consider revisiting regParam / class balance.
algo = LogisticRegression(
    featuresCol="Features",
    labelCol="SeniorCitizen",
    regParam=0.3,
    maxIter=10,
)

StringIndexer_bfa8c6cf5ad1

# Chain the steps as stages in a pipeline

In [0]:
# Stages run in list order when the pipeline is fitted or applied.
pipeline = Pipeline(
    stages=[
        catIndexer,     # categorical column -> index
        numVector,      # numeric columns -> vector
        numScaler,      # scale the numeric vector
        featureVector,  # index + scaled vector -> "Features"
        algo,           # logistic regression
    ]
)

#### You’re creating a sequence of transformations and a model training step, combined into a single pipeline. Here's what each stage does:

#### catIndexer:
StringIndexer to convert a categorical column to an index.

#### numVector:
VectorAssembler to merge numeric columns into a single vector.

#### numScaler (optional but common):
MinMaxScaler in this notebook (StandardScaler is a common alternative) to normalize the numeric features.

#### featureVector:
Another VectorAssembler that combines:

- The scaled numeric vector
- The indexed categorical feature(s)

into a final features vector.

#### algo:
The ML model you’re training, like:

#### LogisticRegression()



# Use the pipeline to prepare data and fit the model algorithm

In [0]:
# Fit every pipeline stage (feature preparation + logistic regression) on the training rows.
model = pipeline.fit(train)
print("Model trained!")

Downloading artifacts:   0%|          | 0/45 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Model trained!


In [0]:
# Score the held-out rows with the fitted pipeline.
prediction = model.transform(test)

# Keep only what is needed to compare the prediction with the actual label.
predicted = prediction.select(
    "Features",
    "probability",
    col("prediction").astype("Int"),
    col("SeniorCitizen").alias("trueLabel"),
)
display(predicted)

Features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06069652120865399, 0.015259457439860666, 0.7058907228229938, 0.08450704225352113))","Map(vectorType -> dense, length -> 2, values -> List(0.8693606526012754, 0.13063934739872463))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06119402605502759, 6.47125808714733E-4, 0.46272054638588506, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(0.8699397317711933, 0.13006026822880667))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06119402605502759, 6.47125808714733E-4, 0.6590779738190097, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(0.8694173549622236, 0.1305826450377764))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06119402605502759, 6.47125808714733E-4, 0.9285714285714286, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(0.8686975099834627, 0.13130249001653727))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.061691549879994555, 6.529038495733589E-4, 0.5334376778599886, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(0.869716109618651, 0.13028389038134902))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06218905472636816, 0.003189405818753267, 0.386881047239613, 0.014084507042253521))","Map(vectorType -> dense, length -> 2, values -> List(0.8700726576286649, 0.12992734237133508))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06318408339770872, 0.01979511570090298, 0.1730221969265794, 0.11267605633802817))","Map(vectorType -> dense, length -> 2, values -> List(0.8706176180333846, 0.12938238196661545))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06517412176179649, 6.933490335351306E-4, 0.02618099032441662, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(0.870811423262365, 0.12918857673763495))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06517412176179649, 6.933490335351306E-4, 0.03856004553215709, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(0.8707787310053043, 0.12922126899469566))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.06616915043313705, 7.049048948426605E-4, 0.8524473534433694, 0.0))","Map(vectorType -> dense, length -> 2, values -> List(0.8685418800496828, 0.1314581199503172))",0,0


# Generate evaluation metrics

In [0]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
   
# Evaluator that compares the "prediction" column against the true SeniorCitizen label.
# The metric to compute is chosen per call further down.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="SeniorCitizen",
)

# Simple accuracy

In [0]:
# Select the "accuracy" metric for this call only; the evaluator itself is unchanged.
accuracy_params = {evaluator.metricName: "accuracy"}
accuracy = evaluator.evaluate(prediction, accuracy_params)
print("Accuracy:", accuracy)
   

Accuracy: 0.8381311939594148


# Class metrics

In [0]:

labels = [0, 1]

# (printed heading, evaluator metric name), in the order they are reported.
per_class_metrics = [
    ("\tPrecision:", "precisionByLabel"),
    ("\tRecall:", "recallByLabel"),
    ("\tF1 Score:", "fMeasureByLabel"),
]

print("\nIndividual class metrics:")
for label in sorted(labels):
    print("Class %s" % label)
    for heading, metric in per_class_metrics:
        # metricLabel selects which class the by-label metric is computed for.
        score = evaluator.evaluate(
            prediction,
            {evaluator.metricLabel: label, evaluator.metricName: metric},
        )
        print(heading, score)
   



Individual class metrics:
Class 0
	Precision: 0.8381311939594148
	Recall: 1.0
	F1 Score: 0.9119383825417201
Class 1
	Precision: 0.0
	Recall: 0.0
	F1 Score: 0.0


# Weighted (overall) metrics

In [0]:
# Compute the three weighted metrics in one pass, then report them.
weighted_scores = {
    metric: evaluator.evaluate(prediction, {evaluator.metricName: metric})
    for metric in ("weightedPrecision", "weightedRecall", "weightedFMeasure")
}
overallPrecision = weighted_scores["weightedPrecision"]
overallRecall = weighted_scores["weightedRecall"]
overallF1 = weighted_scores["weightedFMeasure"]

print("Overall Precision:", overallPrecision)
print("Overall Recall:", overallRecall)
print("Overall F1 Score:", overallF1)

Overall Precision: 0.7024638982878342
Overall Recall: 0.8381311939594148
Overall F1 Score: 0.7643240053771094


# now stop your running Cluster

#### --------------------------------------------------------------

# Select Experiments on the left pane.

####  Select the experiment generated with your notebook’s name and view the details page for the most recent experiment run.

####   Use the Register Model button to register the model that was logged in that experiment and when prompted, create a new model named Penguin Predictor.

####  When the model has been registered, view the Models page (in the navigation bar on the left) and select the Penguin Predictor model.
####  In the page for the Penguin Predictor model, use the Use model for inference button to create a new real-time endpoint with the following settings:
####  Model: Penguin Predictor
####  Model version: 1
####  Endpoint: predict-penguin
####  Compute size: Small
####  The serving endpoint is hosted in a new cluster, which may take several minutes to create.


####   When the endpoint has been created, use the Query endpoint button at the top right to open an interface from which you can test the endpoint. 

#### Then in the test interface, on the Browser tab, enter the following JSON request and use the Send Request button to call the endpoint and generate a prediction.


##### 
#### Paste the sample data into "Serving" from left pane -> Serving endpoints -> Browser -> Request --> Send Request

## Sample Data

In [0]:

{"dataframe_records": [
        {   "PhoneService": "No",
            "MonthlyCharges": 29.85,
            "TotalCharges": 29.85,
            "PropertyValue": 2,
            "tenure": 1
        } 
    ]
}

### Code copied from the Query endpoint option, available once the registered model is ready for "Use model for inference"


# Copy a token from the Databricks Settings option
## Setting -> User -> Developer -> Access tokens -> Manage -> Generate New Token

In [0]:

import os

# SECURITY: never paste a personal access token into a notebook. Anyone who can
# read the notebook (or its history/exports) can use it. The token that was
# previously hard-coded in this cell must be treated as leaked and revoked under
# Settings -> User -> Developer -> Access tokens.
#
# The token is taken from the environment when already provided; otherwise it is
# read from a Databricks secret scope. "ml_lab" / "databricks_token" are
# placeholder names — replace them with the scope and key you created.
if not os.environ.get("DATABRICKS_TOKEN"):
    os.environ["DATABRICKS_TOKEN"] = dbutils.secrets.get(scope="ml_lab", key="databricks_token")

In [0]:
import os
import requests
import numpy as np
import pandas as pd
import json

def create_tf_serving_json(data):
    """Wrap ``data`` in an ``{'inputs': ...}`` request body.

    A dict is converted value by value (each value must provide ``.tolist()``);
    any other object is converted with its own ``.tolist()``.
    """
    if isinstance(data, dict):
        payload = {key: data[key].tolist() for key in data.keys()}
    else:
        payload = data.tolist()
    return {'inputs': payload}

def score_model(dataset, url=None, timeout=300):
    """Send ``dataset`` to the model serving endpoint and return the parsed JSON reply.

    Args:
        dataset: A pandas DataFrame (sent as ``dataframe_split``) or any object
            accepted by ``create_tf_serving_json`` (sent as ``inputs``).
        url: Invocation URL of the serving endpoint. Defaults to the endpoint
            this notebook was written against; pass your own if your workspace
            or endpoint name differs.
        timeout: Seconds to wait for the HTTP call before giving up. Generous by
            default because a serving endpoint can take a while to respond.

    Returns:
        The decoded JSON response body.

    Raises:
        Exception: If DATABRICKS_TOKEN is not set, or the endpoint answers with
            a status other than 200.
    """
    if url is None:
        url = 'https://adb-2596735487416449.9.azuredatabricks.net/serving-endpoints/End_Point_DevOps_Aug_01/invocations'

    # Fail early with a clear message instead of sending "Bearer None".
    token = os.environ.get("DATABRICKS_TOKEN")
    if not token:
        raise Exception('DATABRICKS_TOKEN environment variable is not set')

    headers = {'Authorization': f'Bearer {token}', 'Content-Type': 'application/json'}
    if isinstance(dataset, pd.DataFrame):
        ds_dict = {'dataframe_split': dataset.to_dict(orient='split')}
    else:
        ds_dict = create_tf_serving_json(dataset)
    data_json = json.dumps(ds_dict, allow_nan=True)

    # Without a timeout, requests would wait forever on an unresponsive endpoint.
    response = requests.request(method='POST', headers=headers, url=url, data=data_json, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f'Request failed with status {response.status_code}, {response.text}')
    return response.json()

# Test the endpoint above with a sample input

In [0]:
import pandas as pd

# A single example customer, using the same feature columns the model was trained on.
# For this test data the expected output is SeniorCitizen: 0
sample_record = {
    "PhoneService": "No",
    "MonthlyCharges": 29.85,
    "TotalCharges": 29.85,
    "PropertyValue": 2,
    "tenure": 1,
}
input_data = {"dataframe_records": [sample_record]}

# Turn the record list into a one-row DataFrame and show it.
df_input = pd.DataFrame(input_data["dataframe_records"])
display(df_input)

PhoneService,MonthlyCharges,TotalCharges,PropertyValue,tenure
No,29.85,29.85,2,1


# Call the score_model function

In [0]:
# Send the one-row DataFrame to the serving endpoint.
response = score_model(df_input)

# Show the parsed JSON reply returned by the endpoint.
print(response)

{'predictions': [0.0]}


# Delete the endpoint
## When the endpoint is no longer required, you should delete it to avoid unnecessary costs.

#### In the predict-penguin endpoint page, in the ⁝ menu, select Delete.