In [0]:
%pip install mlflow=="2.19"

In [0]:
%run /Workspace/Users/saahiti.prayaga@gmail.com/test_end2end_ml_flow/00_setup

In [0]:
# pandas API on Spark exposes a pandas-like API on top of PySpark, so familiar
# pandas code can be used while still leveraging Spark's distributed engine:
# the pandas instructions are converted to Spark operations under the hood and
# executed at scale.
# NOTE(review): `spark` and `display` are not defined in this section; presumably
# they come from the notebook runtime or the 00_setup notebook -- confirm.
df = spark.table("mlops_churn_bronze_customers").pandas_api()
display(df)

# Feature Engineering, data preprocessing

In [0]:
%sql
-- Return every column and row of the bronze customers table (the same table
-- that is loaded into `df` through spark.table(...) in the Python cell above).
SELECT * FROM mlops_churn_bronze_customers

In [0]:
# Count rows per distinct `internet_service` value and draw the counts as a pie
# chart; the plot object is the cell's last expression, so it is rendered as output.
df["internet_service"].value_counts().plot.pie()

In [0]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame

def clean_churn_features(data_psdf: "pyspark.pandas.DataFrame") -> DataFrame:
  """
  Clean the raw churn data with the pandas API on Spark and return a Spark DataFrame.

  Steps, in order:
    1. Cast ``senior_citizen`` to string and recode "1"/"0" to "Yes"/"No".
    2. Parse ``total_charges`` to float; null or blank values become 0.0.
    3. Fill missing ``tenure``, ``monthly_charges`` and ``total_charges`` with 0.0.
    4. Add ``num_optional_services``: how many of the optional-service columns
       are equal to "Yes" for each row.

  Args:
    data_psdf: pandas-on-Spark DataFrame (for example the result of
      ``spark.table(...).pandas_api()``). It must contain the columns
      ``senior_citizen``, ``total_charges``, ``tenure``, ``monthly_charges``
      and the six optional-service columns listed in ``sum_optional_services``.

  Returns:
    The cleaned data converted back to a Spark DataFrame via ``to_spark()``.

  Raises:
    ValueError: if a non-blank ``total_charges`` value cannot be parsed as a
      float (raised inside the Spark job when the result is evaluated).
  """

  def parse_total_charges(raw) -> float:
      """Parse one ``total_charges`` value; null or blank text yields 0.0.

      The ``-> float`` annotation gives pandas-on-Spark the output type up
      front, so it does not have to run the function on a sample to infer it.
      """
      if raw is None:
          return 0.0
      # str() keeps this safe if the column is already numeric rather than text.
      text = str(raw).strip()
      return float(text) if text else 0.0

  def sum_optional_services(df):
      """Count number of optional services enabled, like streaming TV"""
      cols = ["online_security", "online_backup", "device_protection", "tech_support",
              "streaming_tv", "streaming_movies"]
      # Each comparison is a boolean column; summing them counts the "Yes" flags per row.
      return sum((df[c] == "Yes") for c in cols)

  # Convert some columns. astype() returns a new frame, so the column
  # assignments below do not modify the caller's DataFrame.
  data_psdf = data_psdf.astype({"senior_citizen": "string"})
  data_psdf["senior_citizen"] = data_psdf["senior_citizen"].map({"1" : "Yes", "0" : "No"})

  data_psdf["total_charges"] = data_psdf["total_charges"].apply(parse_total_charges)

  # Fill some missing numerical values with 0 (single pass over all three columns)
  data_psdf = data_psdf.fillna({"tenure": 0.0, "monthly_charges": 0.0, "total_charges": 0.0})

  data_psdf["num_optional_services"] = sum_optional_services(data_psdf)

  # Return the cleaned Spark dataframe
  return data_psdf.to_spark()

# Run the cleaning step on the pandas-on-Spark frame; the result is a Spark DataFrame.
churn_features = clean_churn_features(df)
# Display the cleaned features rather than the raw `df`, so the cell output
# reflects the transformation that was just applied.
display(churn_features)

Now we save the cleaned features and their labels as a Delta Lake table.