In [0]:
%pip install mlflow>=3.0 --upgrade

In [0]:
%pip install databricks-feature_engineering

In [0]:
%restart_python

In [0]:
import pandas as pd
import pickle
import joblib
import os
from datetime import datetime, date 
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.lightgbm
from sklearn.metrics import classification_report,roc_auc_score,f1_score
import numpy as np
from mlflow.models.signature import infer_signature

import numpy as np
from databricks.feature_engineering import FeatureEngineeringClient

from sklearn.preprocessing import LabelEncoder
import json
from mlflow.tracking import MlflowClient
import requests
from sklearn.metrics import accuracy_score

**Loading data**

We use dbutils.widgets to parameterize the notebook. This allows the CSV path to be passed as a job parameter at runtime, enabling the notebook to execute automatically without any manual modifications.

In [0]:
# Declare a text widget named "training_csv" (empty default) so the path of the
# training CSV can be supplied as a job parameter at run time
dbutils.widgets.text("training_csv", "")

In [0]:
# Retrieve the training CSV path passed via Databricks job parameters.
# Surrounding whitespace is stripped, and an empty value (the widget default)
# is rejected here so the run stops with a clear message instead of failing
# later inside pd.read_csv.
input_path = dbutils.widgets.get("training_csv").strip()
if not input_path:
    raise ValueError(
        "The 'training_csv' widget/job parameter is empty; "
        "pass the path of the training CSV to this notebook."
    )

In [0]:
# Echo the resolved CSV path so it shows up in the run output
print(input_path)

In [0]:
# Read the CSV at input_path into a pandas DataFrame and show its (rows, columns) shape
df=pd.read_csv(input_path)
df.shape

In [0]:
# Preview the first rows of the loaded data
df.head()

In [0]:
# Write the rows of each split (selected through the `type` column) to its own
# CSV file, without the DataFrame index.
for split_label, csv_path in (
    ('training', "/Volumes/ispl_databricks/default/training/ff_bd/train.csv"),
    ('validation', "/Volumes/ispl_databricks/default/training/ff_bd/validation.csv"),
):
    split_mask = df['type'] == split_label
    df[split_mask].to_csv(csv_path, index=False)

In [0]:
# Export DataFrame column names and their data types to a CSV file for schema validation and auditing.
# NOTE(review): the same path is written again further down from base_4.dtypes,
# which overwrites this export of the raw schema — confirm whether both snapshots are needed.
df.dtypes.reset_index().to_csv("/Volumes/ispl_databricks/default/training/ff_bd/columns_datatypes.csv",index=False)

In [0]:
# Cast every object-dtype column to pandas' category dtype in one astype call.
# NOTE(review): this also covers string columns that are parsed later
# (e.g. bue_dob, status_date if they were read as strings), so those steps
# operate on categoricals.
object_columns = df.select_dtypes(include='object').columns
df = df.astype({column_name: 'category' for column_name in object_columns})

In [0]:
# Parse 'status_date' into datetimes; values that cannot be parsed become NaT
# because of errors='coerce'.
status_ts = pd.to_datetime(df['status_date'], errors='coerce')
df['status_date'] = status_ts
# Month-year label (MM-YYYY) used for the time-based grouping below
df['month_year'] = status_ts.dt.strftime('%m-%Y')

In [0]:
# Proportion of each target value within every month_year group
df.groupby('month_year')['target'].value_counts(normalize=True)

In [0]:
# Number of rows per value of `type`, counting missing values as their own bucket
df['type'].value_counts(dropna=False)

In [0]:
# Function to calculate age from date of birth (DOB)
def age(born, today=None):
    """Return the age in completed years for a DOB string, or -1 if unusable.

    Parameters
    ----------
    born : str
        Date of birth formatted as ``DD-MM-YYYY``.
    today : datetime.date, optional
        Reference date for the calculation. Defaults to ``date.today()``,
        which is what the original single-argument call used.

    Returns
    -------
    int
        The age in whole years when it is 18 or more; ``-1`` when the person
        is younger than 18 or when ``born`` is missing or not a valid
        ``DD-MM-YYYY`` string.
    """
    try:
        dob = datetime.strptime(born, "%d-%m-%Y").date()
    except (TypeError, ValueError):
        # TypeError: non-string input such as None/NaN.
        # ValueError: a string that does not match DD-MM-YYYY.
        return -1

    if today is None:
        today = date.today()

    # Subtract one year when this year's birthday has not been reached yet
    # (the tuple comparison yields a bool, which counts as 0 or 1).
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    years = today.year - dob.year - birthday_pending
    return years if years >= 18 else -1
    
# String columns were cast to category earlier in the notebook. Applying a
# function over a categorical column does not pass missing values to the
# function (they stay NaN instead of becoming age()'s -1 sentinel) and may
# return a categorical result. Cast back to object so age() is called on every
# element and bue_age comes out as a plain integer column.
df['bue_age'] = df['bue_dob'].astype(object).apply(age)

In [0]:
# Derive ratio-based loan distribution features to normalize loan counts
# and capture portfolio composition across different loan types.
# Each entry maps: new feature -> (numerator column, denominator column).
# The values are fractions rounded to 2 decimals (they are not multiplied by 100).
loan_share_features = {
    # Share of open loans out of total loans
    'bue_perc_no_of_open_loans': ('bue_no_of_open_loans', 'bue_no_of_loans'),
    # Share of open loans in the last 6 months out of total loans
    'bue_perc_no_of_open_loans_lst_6months': ('bue_no_of_open_loans_lst_6months', 'bue_no_of_loans'),
    # Share of credit card loans out of total loans
    'bue_perc_no_of_cc_loans': ('bue_no_of_cc_loans', 'bue_no_of_loans'),
    # Share of open credit card loans out of total open loans
    'bue_perc_no_of_cc_open_loans': ('bue_no_of_cc_open_loans', 'bue_no_of_open_loans'),
    # Share of open auto loans out of total open loans
    'bue_perc_no_of_auto_open_loans': ('bue_no_of_auto_open_loans', 'bue_no_of_open_loans'),
    # Share of consumer loans out of total loans
    'bue_perc_no_of_consumer_loans': ('bue_no_of_consumer_loans', 'bue_no_of_loans'),
    # Share of open consumer loans out of total open loans
    'bue_perc_no_of_consumer_open_loans': ('bue_no_of_consumer_open_loans', 'bue_no_of_open_loans'),
    # Share of personal loans out of total loans
    'bue_perc_no_of_personal_loans': ('bue_no_of_personal_loans', 'bue_no_of_loans'),
    # Share of gold loans out of total loans
    'bue_perc_no_of_gold_loans': ('bue_no_of_gold_loans', 'bue_no_of_loans'),
}

for feature_name, (numerator_col, denominator_col) in loan_share_features.items():
    # A zero denominator would produce +/-inf, which the fillna(0) applied to
    # the numeric columns afterwards does not clean up. Turn zeros into NaN so
    # the ratio is treated as missing instead.
    safe_denominator = df[denominator_col].where(df[denominator_col] != 0)
    df[feature_name] = (df[numerator_col] / safe_denominator).round(2)

In [0]:
# Numeric columns (float64 / int64 / int32 only) with missing values replaced by 0
df_num = df.select_dtypes(include=['float64','int64','int32']).fillna(0)

In [0]:
# Selecting the non-numeric columns: object, boolean and category, plus datetime.
# 'datetime' is included because status_date is converted with pd.to_datetime
# above and is read again through .dt further down; a datetime64 column matches
# neither the numeric selection nor object/boolean/category, so without it the
# column would be left out of base_df2.
df_cat = df.select_dtypes(include=['object','boolean','category','datetime'])

In [0]:
# Create a new DataFrame by placing the numerical and categorical selections side by side
# (columns of df that are in neither selection are not carried over)
base_df2 = pd.concat([df_num,df_cat],axis=1)
base_df2.shape

In [0]:
# Drop columns in which at least 40% of the values are null.
# NOTE(review): the numeric columns in base_df2 were already filled with 0 when
# df_num was built, so only the non-numeric columns can reach the threshold
# here — confirm that is intended.
null_percentages = base_df2.isna().mean().mul(100)
columns_to_drop = null_percentages.index[null_percentages >= 40]
base_df2 = base_df2.drop(columns=columns_to_drop)
len(columns_to_drop)

In [0]:
# Shape after removing the high-null columns
base_df2.shape

In [0]:
# Remove columns that are not used as features: bue_dob (bue_age was derived
# from it above), bue_name and imsi.
base_4 = base_df2.drop(columns=['bue_dob', 'bue_name', 'imsi'])

In [0]:
# Export the column names and dtypes of base_4.
# NOTE(review): this writes to the same file as the earlier df.dtypes export and replaces it.
base_4.dtypes.reset_index().to_csv(r"/Volumes/ispl_databricks/default/training/ff_bd/columns_datatypes.csv",index=False)

In [0]:
# Distribution of phone_phoneFootprintStrengthOverall values, including missing ones
base_4['phone_phoneFootprintStrengthOverall'].value_counts(dropna=False)

In [0]:
# Distribution of numberBillingType values, including missing ones
base_4['numberBillingType'].value_counts(dropna=False)

In [0]:
# Allowed values per categorical column; any other value is replaced by a
# fallback label (see fallback_labels below).
allowed_categories = {
    'currentNetworkRegion': [
        'Andhra Pradesh', 'Karnataka', 'Tamil Nadu', 'Maharashtra', 'Gujarat',
        'Delhi', 'Mumbai', 'Kerala', 'Bihar and Jharkhand', 'Rajasthan',
        'Madhya Pradesh', 'UP-East', 'UP-West', 'Chennai', 'Orissa',
        'West Bengal', 'Kolkata', 'Punjab', 'Assam', 'Haryana',
        'North East', 'Jammu & Kashmir', 'Himachal Pradesh'
    ],
    'email_emailFootprintStrength': [
        'Low', 'Very Low', 'Medium', 'High', 'Very High'
    ],
    'bue_score_comments': [
        'J-High Risk', 'I-Medium Risk', 'K-High Risk', 'H-Medium Risk',
        'M-Very High Risk', 'G-Low Risk', 'F-Low Risk', 'L-Very High Risk',
        'E-Low Risk', 'D-Very Low Risk', 'C-Very Low Risk',
        'Not Scored: Sufficient History Not Available',
        'Not Scored: Not Enough Info available on the customer',
        'Not Scored: No Activity seen on the customer (Inactive)',
        'B-Very Low Risk', 'A-Very Low Risk',
        'Not Scored: Cannot be classified',
        'Not Scored: No Updates available in last 36 months',
        'Not Scored: More than 50 active Accounts found',
        'Not Scored: Only a Guarantor'
    ],
    'phone_phoneFootprintStrengthOverall': [
        'High', 'Medium', 'Low', 'Very High', 'Very Low'
    ],
    'whatsapp': ['Account Found', 'Account Not Found', 'Error'],
    'numberBillingType': ['prepaid', 'postpaid', 'unknown']
}

# Column-specific fallback labels; every column not listed here uses 'others'.
fallback_labels = {'whatsapp': 'Error', 'numberBillingType': 'unknown'}

# Replace values outside the allowed list with the column's fallback label.
# Columns from allowed_categories that are absent from base_4 are skipped.
# NOTE(review): for category-dtype columns, Series.apply does not pass missing
# values to the function, so NaN stays NaN rather than becoming the fallback —
# confirm this is the intended handling.
for col, allowed_vals in allowed_categories.items():
    if col not in base_4.columns:
        continue
    fallback = fallback_labels.get(col, 'others')
    base_4[col] = base_4[col].apply(
        lambda x: x if x in allowed_vals else fallback
    )


In [0]:
# Drop the `status` column; it is not kept as a feature
base_4 = base_4.drop(columns=['status'])

In [0]:
# Extract the calendar year and month of status_date as separate columns
base_4["year"] = base_4["status_date"].dt.year
base_4["month"] = base_4["status_date"].dt.month

In [0]:
# Drop status_date and month_year now that year and month have been extracted
base_4 = base_4.drop(columns=['status_date','month_year'])

In [0]:
# Label-encode every category/object column into integers, because the model
# works on numerical inputs. Two columns are left untouched:
#   - loan_id: used as the primary key of the feature store table
#   - type:    used later to split the data
# The fitted encoder of each column is kept in label_encoders.
# (LabelEncoder is already imported in the notebook's import cell.)
label_encoders = {}
passthrough_columns = ("loan_id", "type")

for col in base_4.columns:
    needs_encoding = (
        col not in passthrough_columns
        and base_4[col].dtype.name in ("category", "object")
    )
    if not needs_encoding:
        continue

    le = LabelEncoder()
    # Values are stringified first, so missing values are encoded as their string form
    base_4[col] = le.fit_transform(base_4[col].astype(str))
    label_encoders[col] = le

**Feature Store table and Delta table creation**

This section creates both a Feature Store table and a Delta table.
The Feature Store table makes it easy to fetch features during training. The Delta table saves the current preprocessed data, including the target variable, for the later stages of the pipeline: it acts as a saved features table, so we don't have to preprocess these features again and again — we can simply load this Delta table and continue.

In [0]:
# Convert the pandas DataFrame to a Spark DataFrame; the Delta table and the
# feature store table below are both written from Spark DataFrames
base_4_spark = spark.createDataFrame(base_4)

In [0]:
# Replace spaces in column names with underscores
# (presumably because the target tables do not accept spaces in column names — confirm)
base_4_spark = base_4_spark.toDF(
  *[col.replace(" ", "_") for col in base_4_spark.columns]
)

In [0]:
# Client used below to create and write the feature store table
fe = FeatureEngineeringClient()

In [0]:
# Keep a single row per loan_id, which is used as the feature table's primary key
base_4_spark = base_4_spark.dropDuplicates(["loan_id"])


In [0]:
# Build the DataFrame for the Feature Store table by dropping the columns that
# are not input features: 'target', 'type' and 'target_source'.
# These columns stay in base_4_spark, which is saved as the Delta table for downstream processing.
# Duplicate records are removed based on 'loan_id', which serves as the primary key for the Feature Store table.
# NOTE(review): base_4_spark was already de-duplicated on loan_id in the previous
# cell, so both drop_duplicates calls below look redundant — confirm before removing.
base_df_feature_store = base_4_spark.drop('target','type','target_source')
base_4_feature_store = base_df_feature_store.drop_duplicates(['loan_id'])
base_4_spark = base_4_spark.drop_duplicates(['loan_id'])

In [0]:
# Create the feature store table only if it does not exist yet.
# This notebook is meant to run repeatedly as a parameterised job, and an
# unconditional create_table is expected to fail on every run after the first
# because the table is already there (TODO confirm against the client docs).
# On later runs the table is populated by the write_table cell that follows.
feature_table_name = "ispl_databricks.model_logs.bd_500_features_store_training"

if not spark.catalog.tableExists(feature_table_name):
    fe.create_table(
        name=feature_table_name,
        primary_keys=["loan_id"],
        df=base_4_feature_store,
        description="Base features for model training",
    )

In [0]:
# Write the current feature rows to the feature store table using mode="merge"
# (per the Feature Engineering client docs this upserts on the primary key — confirm)
fe.write_table(
    name = 'ispl_databricks.model_logs.bd_500_features_store_training',
    df = base_4_feature_store,
    mode = "merge"
)

In [0]:
# Persist the preprocessed data (features plus target) as a Delta table,
# replacing any previous contents, so later pipeline stages can load it directly.
(
    base_4_spark.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ispl_databricks.model_logs.bd_500_features_sample_training")
)