In [0]:
#Problem Statement: Clustering Customers Based on Riskiness

#A bank wants to identify risky customers to optimize its lending policies and reduce potential defaults. Using customer financial and behavioral data, the goal is to group customers into different risk levels: Low Risk, Medium Risk, and High Risk.

#Dataset Details:
#The bank has historical data on its customers, including the following features:

#Credit Score: A measure of the customer's creditworthiness.
#Outstanding Loan Amount: The total amount of loans the customer currently owes.
#Monthly Income: The customer’s average monthly income.
#Loan-to-Income Ratio (LTI): The ratio of total loans to monthly income.
#Payment History: The percentage of on-time payments made by the customer.
#Number of Defaults: The number of times the customer has defaulted on a loan in the past.

#Goal:
#Use K-Means Clustering to classify customers into different risk categories:

#Low Risk: Customers with high credit scores, low loan-to-income ratios, and good payment histories.
#Medium Risk: Customers with moderate credit scores and loan-to-income ratios but no severe defaults.
#High Risk: Customers with low credit scores, high loan-to-income ratios, and frequent defaults.

#-----------------------------------------------------

#Steps to Solve: 6 Steps
#Data Collection:
#Gather historical data for existing and past customers.
#Ensure features like credit score, income, and defaults are included.

#Data Preparation:
#Handle missing values (e.g., missing payment history or income).
#Normalize numerical features to scale them uniformly.

#Feature Selection:
#Select features like Credit Score, LTI Ratio, Payment History, and Defaults for clustering.

#K-Means Clustering:
#Use K=3 (Low Risk, Medium Risk, High Risk) to group customers.
#Train the K-Means model to form clusters.

#Interpret Results:
#Analyze the characteristics of each cluster.
#Label clusters based on riskiness.

#Deploy Insights:
#Flag high-risk customers for additional scrutiny.
#Offer personalized loan policies based on cluster assignments.


In [0]:
# Databricks notebook: Customer Risk Clustering with PySpark

# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col

# Step 1: Spark session
# `spark` is expected to already exist (as it does in a Databricks notebook).
# When running elsewhere, build one first:
#   spark = SparkSession.builder.appName("CustomerRiskClustering").getOrCreate()

# Step 2: Load the customer risk data
# The rows are read from the `risk_data_r` table. Alternative kept for reference,
# loading the same data from a CSV file (adjust the path if needed):
#   file_path = "/dbfs/FileStore/customer_risk_data.csv"
#   data_df = spark.read.csv(file_path, header=True, inferSchema=True)
data_df = spark.table("risk_data_r")

# Print the leading rows of the DataFrame as a text table for a quick sanity check.
data_df.show()


+----------+-----------+---------------+-------------+--------------+-----------+
|CustomerID|CreditScore|OutstandingLoan|MonthlyIncome|PaymentHistory|NumDefaults|
+----------+-----------+---------------+-------------+--------------+-----------+
|         1|        750|          10000|         5000|            98|          0|
|         2|        680|          20000|         4500|            95|          1|
|         3|        620|          50000|         4000|            85|          2|
|         4|        590|          60000|         3500|            75|          3|
|         5|        800|          15000|         6000|            99|          0|
|         6|        720|          25000|         4800|            96|          0|
|         7|        640|          40000|         4200|            88|          1|
|         8|        580|          70000|         3000|            70|          4|
|         9|        700|          30000|         4700|            94|          1|
|        10|    

In [0]:

# Step 3: Prepare the data for clustering
# Numeric inputs that describe each customer's risk profile.
feature_columns = ["CreditScore", "OutstandingLoan", "MonthlyIncome", "PaymentHistory", "NumDefaults"]

# Cast every feature to double so the assembler receives one uniform numeric type:
#   col(c)          -> references column `c` of the DataFrame by name
#   .cast("double") -> converts the column to floating point
#   .alias(c)       -> keeps the original column name after the cast
# Example: with feature_columns = ['age', 'height', 'weight'] stored as strings or
# integers, the result holds the same three columns converted to double.
#
# CustomerID is carried through unchanged (it is an identifier, NOT a feature) so
# that every cluster assignment can later be traced back to its customer, which is
# required for flagging high-risk customers.
data_df = data_df.select(
    "CustomerID",
    *[col(c).cast("double").alias(c) for c in feature_columns]
)

# Combine the feature columns into a single vector column named "features".
# Only `feature_columns` feed the vector, so CustomerID does not influence clustering.
# NOTE(review): the features are assembled on their raw scales. K-Means is
# distance-based, so OutstandingLoan (tens of thousands) will outweigh NumDefaults
# (single digits). The plan at the top of the notebook calls for normalization;
# consider adding a scaler here -- doing so will change the resulting clusters.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data_prepared = assembler.transform(data_df)


In [0]:

# Step 4: Apply K-Means Clustering
# Three clusters are requested, one per intended risk tier (Low, Medium, High).
# The fixed seed keeps the centroid initialisation repeatable between runs.
kmeans = KMeans(
    k=3,
    seed=42,
    featuresCol="features",
    predictionCol="risk_cluster",
)

# Fit the model on the assembled feature vectors.
kmeans_model = kmeans.fit(data_prepared)

# clusterCenters() returns the centroid of every cluster found during training;
# each centroid is a point in the feature space, with values in the same order
# as the assembled feature columns.
cluster_centers = kmeans_model.clusterCenters()

# Print one line per cluster: its index followed by its centroid coordinates.
print("Cluster Centers:")
for cluster_id, centroid in enumerate(cluster_centers):
    print(f"Cluster {cluster_id}: {centroid}")


Cluster Centers:
Cluster 0: [7.30e+02 2.00e+04 5.00e+03 9.64e+01 4.00e-01]
Cluster 1: [5.85e+02 6.50e+04 3.25e+03 7.25e+01 3.50e+00]
Cluster 2: [6.43333333e+02 4.50000000e+04 4.20000000e+03 8.76666667e+01
 1.66666667e+00]


#Cluster 0: Centroid coordinates are [730, 20000, 5000, 96.4, 0.4] for its features.
#Cluster 1: Centroid coordinates are [585, 65000, 3250, 72.5, 3.5].
#Cluster 2: Centroid coordinates are [643.33, 45000, 4200, 87.67, 1.67].
#Each centroid represents the "average" characteristics of the data points within that cluster. The KMeans algorithm clusters data points based on their similarity, with points in each cluster being close to the corresponding centroid.

In [0]:

# Step 5: Predict clusters for customers
# transform() appends the "risk_cluster" column holding each row's assigned cluster.
predictions = kmeans_model.transform(data_prepared)

# Show the original feature values next to the assigned cluster.
columns_to_show = feature_columns + ["risk_cluster"]
predictions.select(columns_to_show).show()


+-----------+---------------+-------------+--------------+-----------+------------+
|CreditScore|OutstandingLoan|MonthlyIncome|PaymentHistory|NumDefaults|risk_cluster|
+-----------+---------------+-------------+--------------+-----------+------------+
|      750.0|        10000.0|       5000.0|          98.0|        0.0|           0|
|      680.0|        20000.0|       4500.0|          95.0|        1.0|           0|
|      620.0|        50000.0|       4000.0|          85.0|        2.0|           2|
|      590.0|        60000.0|       3500.0|          75.0|        3.0|           1|
|      800.0|        15000.0|       6000.0|          99.0|        0.0|           0|
|      720.0|        25000.0|       4800.0|          96.0|        0.0|           0|
|      640.0|        40000.0|       4200.0|          88.0|        1.0|           2|
|      580.0|        70000.0|       3000.0|          70.0|        4.0|           1|
|      700.0|        30000.0|       4700.0|          94.0|        1.0|      

In [0]:
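#risk_cluster: The cluster label assigned by the KMeans algorithm, indicating the risk group the individual belongs to. The numeric labels are arbitrary, so each is interpreted from its centroid (see the cluster centers printed in Step 4):
#Cluster 0: Likely low-risk individuals (highest credit score ~730, smallest loans, ~0.4 defaults).
#Cluster 1: Likely high-risk individuals (lowest credit score ~585, largest loans, ~3.5 defaults).
#Cluster 2: Likely medium-risk individuals (credit score ~643, mid-sized loans, ~1.7 defaults).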

In [0]:

# Step 6: Save the results
# NOTE(review): on Databricks, Spark readers/writers resolve paths against DBFS,
# whereas the "/dbfs/..." prefix is the local-filesystem mount of DBFS. This path
# may therefore end up under dbfs:/dbfs/FileStore/... rather than dbfs:/FileStore/...
# -- confirm the intended location before relying on it.
output_path = "/dbfs/FileStore/customer_risk_clusters.csv"  # Adjust the path if needed

# Spark writes a directory of part files at `output_path`, not a single CSV file.
# mode("overwrite") replaces the output of a previous run; without it the write
# fails with "path already exists" whenever this cell is executed a second time.
(
    predictions
    .select("CreditScore", "OutstandingLoan", "MonthlyIncome", "PaymentHistory", "NumDefaults", "risk_cluster")
    .write
    .mode("overwrite")
    .csv(output_path, header=True)
)

print(f"Results saved to: {output_path}")


Results saved to: /dbfs/FileStore/customer_risk_clusters.csv
