In [1]:
# Initialization: point this kernel at the local Spark installation.
import os
import sys

os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Python interpreters handed to PySpark
# (use /usr/bin/python2.7 for both if you want to run on Python 2).
os.environ.update({
    "PYSPARK_PYTHON": "/usr/bin/python3.6",
    "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
})

# Prepend the bundled archives to the import path in one step; pyspark.zip
# ends up ahead of the py4j archive, both ahead of everything already there.
sys.path[:0] = [
    os.environ["PYLIB"] + "/pyspark.zip",
    os.environ["PYLIB"] + "/py4j-0.10.7-src.zip",
]

# Extra packages (spark-xml, spark-avro) requested via the submit arguments.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'

In [2]:
# Entry point (Spark 2.x): build a Hive-enabled session and expose its context.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("KMeans_Modeling")
    .enableHiveSupport()
    .getOrCreate()
)
sc = spark.sparkContext

# Step 1: Import required libraries

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Step 2: Initialize a Spark session

In [4]:
# NOTE(review): an earlier cell already obtained a session via getOrCreate(),
# so this call presumably returns that same session rather than a new local
# one — confirm whether the master/appName set here are still needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("ClusteringApp")
    .getOrCreate()
)

# Step 3: Load the CSV file into a DataFrame

In [5]:
# Read the transactions CSV: first row is the header, column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("file:///home/talentum/shared/C-DAC_Project/Bank_Transaction_clean.csv")
)

# Quick look at the first rows and at the inferred schema.
df.show(5)
df.printSchema()

+-------------+----------+----------+------------------+-------------------+--------------------+-----------+------------+
|TransactionID|CustomerID|CustGender|CustAccountBalance|    TransactionDate|TransactionAmountINR|CustomerAge|CustLocation|
+-------------+----------+----------+------------------+-------------------+--------------------+-----------+------------+
|           T1|  C5841053|         F|          17819.05|2016-08-02 00:00:00|                25.0|         22|  JAMSHEDPUR|
|           T3|  C4417068|         F|          17874.44|2016-08-02 00:00:00|               459.0|         19|      MUMBAI|
|           T4|  C5342380|         F|          866503.2|2016-08-02 00:00:00|              2060.0|         43|      MUMBAI|
|           T5|  C9031234|         F|           6714.43|2016-08-02 00:00:00|              1762.5|         28|      MUMBAI|
|           T6|  C1536588|         F|           53609.2|2016-08-02 00:00:00|               676.0|         44|    ITANAGAR|
+-------------+-

# Step 4: Select the relevant columns for clustering

In [6]:
# Keep only the three numeric columns that feed the clustering.
df = df.select([
    "CustAccountBalance",
    "TransactionAmountINR",
    "CustomerAge",
])

# Step 5: Data Preparation

In [7]:
# Pack every remaining column of df into one vector column named "features".
assembler = VectorAssembler(
    inputCols=df.columns,
    outputCol="features",
)
assembled_data = assembler.transform(df)

# Step 6: Train KMeans model with k=6

In [8]:
# Six clusters, fixed seed; reads "features" and writes "prediction".
kmeans = KMeans(
    k=6,
    seed=42,
    featuresCol="features",
    predictionCol="prediction",
)
model = kmeans.fit(assembled_data)

# Step 7: Make predictions and add them to the DataFrame

In [9]:
# Apply the fitted model to the assembled rows; the result carries the
# "prediction" column next to the original columns and "features".
predictions = model.transform(assembled_data)

# Preview the labelled rows.
predictions.show()

+------------------+--------------------+-----------+--------------------+----------+
|CustAccountBalance|TransactionAmountINR|CustomerAge|            features|prediction|
+------------------+--------------------+-----------+--------------------+----------+
|          17819.05|                25.0|         22|[17819.05,25.0,22.0]|         0|
|          17874.44|               459.0|         19|[17874.44,459.0,1...|         0|
|          866503.2|              2060.0|         43|[866503.2,2060.0,...|         2|
|           6714.43|              1762.5|         28|[6714.43,1762.5,2...|         0|
|           53609.2|               676.0|         44|[53609.2,676.0,44.0]|         0|
|            973.46|               566.0|         24| [973.46,566.0,24.0]|         0|
|          95075.54|               148.0|         34|[95075.54,148.0,3...|         0|
|          14906.96|               833.0|         28|[14906.96,833.0,2...|         0|
|           4279.22|              289.11|         32|[

# Step 8: Save the model locally

In [10]:
# Persist the fitted model to the local filesystem, replacing any previous copy.
(
    model.write()
    .overwrite()
    .save("file:///home/talentum/newp/Final_Model")
)

# Step 9: Predict the cluster for new user input

In [11]:
def predict_cluster(customerbalance, transactionamount, custlocation):
    """Return the cluster predicted by the fitted KMeans model for one record.

    Uses the notebook-level ``spark`` session, ``assembler`` and ``model``
    objects, which must already exist when this function is called.

    Args:
        customerbalance: Value placed in the ``CustAccountBalance`` column.
        transactionamount: Value placed in the ``TransactionAmountINR`` column.
        custlocation: Value placed in the ``CustomerAge`` column. Despite its
            name, this argument carries the customer's age (the caller passes
            ``CustomerAge`` here); the name is kept so existing calls keep
            working.

    Returns:
        The value of the ``prediction`` column for the single input row.
    """
    # Build the one-row DataFrame from the ARGUMENTS. The previous version read
    # the notebook globals CustAccountBalance / TransactionAmountINR /
    # CustomerAge instead, so it ignored whatever was passed in and raised
    # NameError when those globals had not been defined yet.
    new_data = spark.createDataFrame(
        [(customerbalance, transactionamount, custlocation)],
        ["CustAccountBalance", "TransactionAmountINR", "CustomerAge"],
    )
    # Assemble the same "features" vector the model was trained on.
    new_data_transformed = assembler.transform(new_data)
    # Score the row and pull the prediction out of the single collected row.
    prediction = model.transform(new_data_transformed)
    return prediction.select("prediction").collect()[0][0]

# Example prediction for user input

In [13]:
# Sample customer record used to exercise predict_cluster.
CustAccountBalance, TransactionAmountINR, CustomerAge = 866503.2, 2060.0, 40

predicted_cluster = predict_cluster(
    CustAccountBalance,
    TransactionAmountINR,
    CustomerAge,
)
print(f"The predicted cluster for the user input is: {predicted_cluster}")

The predicted cluster for the user input is: 2
