In [26]:
# Relative path to the raw credit-card CSV that the next cell loads with Spark.
data_path = "src/resources/data/credit_card.csv"

In [27]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("customer_look_alike_modelling")
    .getOrCreate()
)

# Read the CSV using its first line as column names. No schema is supplied,
# so every column comes back as a string (see the printSchema output later on).
df = (
    spark.read
    .option("header", "true")
    .csv(data_path)
)

# Preview the first five rows.
df.limit(5).show()

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|Naive_Bayes_Classifier_Attrit

In [28]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType
# Rename the client key to customer_id and discard the two precomputed
# Naive Bayes classifier columns, all in one chained expression.
df = (
    df.withColumnRenamed("CLIENTNUM", "customer_id")
    .drop(
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
    )
)
df.show()


+-----------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|customer_id|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+-----------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---

In [29]:
# List the column names that remain after the rename and the two drops.
df.columns

['customer_id',
 'Attrition_Flag',
 'Customer_Age',
 'Gender',
 'Dependent_count',
 'Education_Level',
 'Marital_Status',
 'Income_Category',
 'Card_Category',
 'Months_on_book',
 'Total_Relationship_Count',
 'Months_Inactive_12_mon',
 'Contacts_Count_12_mon',
 'Credit_Limit',
 'Total_Revolving_Bal',
 'Avg_Open_To_Buy',
 'Total_Amt_Chng_Q4_Q1',
 'Total_Trans_Amt',
 'Total_Trans_Ct',
 'Total_Ct_Chng_Q4_Q1',
 'Avg_Utilization_Ratio']

In [30]:

# TODO: rename this variable to cols_to_convert; it holds column names, not rows.
# Start from every column and take out the identifier and the label, leaving
# only the feature columns. list.remove raises ValueError if a name is absent.
rows_to_convert = list(df.columns)
for excluded_name in ("customer_id", "Attrition_Flag"):
    rows_to_convert.remove(excluded_name)
rows_to_convert

['Customer_Age',
 'Gender',
 'Dependent_count',
 'Education_Level',
 'Marital_Status',
 'Income_Category',
 'Card_Category',
 'Months_on_book',
 'Total_Relationship_Count',
 'Months_Inactive_12_mon',
 'Contacts_Count_12_mon',
 'Credit_Limit',
 'Total_Revolving_Bal',
 'Avg_Open_To_Buy',
 'Total_Amt_Chng_Q4_Q1',
 'Total_Trans_Amt',
 'Total_Trans_Ct',
 'Total_Ct_Chng_Q4_Q1',
 'Avg_Utilization_Ratio']

In [31]:
",".join(rows_to_convert)

'Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio'

In [32]:
# Total number of customer rows loaded from the CSV.
df.count()

10127

In [33]:
from src.utils.functions import get_row_as_text, hf_embeddings, get_ars_retrieved_df
# Build a text representation of each customer from the selected columns.
# Judging by the outputs below, the result keeps the original columns and adds
# a `row_as_text` column of "name: value; ..." pairs. The helper's
# implementation is not visible here, so confirm this against src/utils/functions.
train_df = get_row_as_text(df, rows_to_convert)

In [34]:
# Show the generated text for the first two rows without truncating the column.
train_df.select("row_as_text").show(2, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|row_as_text                                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+---------------------------------------------------------------------------------

In [35]:
# Print column names and types of train_df.
train_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- Attrition_Flag: string (nullable = true)
 |-- Customer_Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Dependent_count: string (nullable = true)
 |-- Education_Level: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Income_Category: string (nullable = true)
 |-- Card_Category: string (nullable = true)
 |-- Months_on_book: string (nullable = true)
 |-- Total_Relationship_Count: string (nullable = true)
 |-- Months_Inactive_12_mon: string (nullable = true)
 |-- Contacts_Count_12_mon: string (nullable = true)
 |-- Credit_Limit: string (nullable = true)
 |-- Total_Revolving_Bal: string (nullable = true)
 |-- Avg_Open_To_Buy: string (nullable = true)
 |-- Total_Amt_Chng_Q4_Q1: string (nullable = true)
 |-- Total_Trans_Amt: string (nullable = true)
 |-- Total_Trans_Ct: string (nullable = true)
 |-- Total_Ct_Chng_Q4_Q1: string (nullable = true)
 |-- Avg_Utilization_Ratio: string (nullable 

In [36]:
# `step` is the batch size used by the ingestion loop further down.
# `k` is not referenced in the cells visible here.
step = 1000
k = 4000

# Bring every row of train_df to the driver as a list of Row objects.
texts_list = train_df.collect()

# Peek at the first collected Row.
texts_list[0]

Row(customer_id='768805383', Attrition_Flag='Existing Customer', Customer_Age='45', Gender='M', Dependent_count='3', Education_Level='High School', Marital_Status='Married', Income_Category='$60K - $80K', Card_Category='Blue', Months_on_book='39', Total_Relationship_Count='5', Months_Inactive_12_mon='1', Contacts_Count_12_mon='3', Credit_Limit='12691', Total_Revolving_Bal='777', Avg_Open_To_Buy='11914', Total_Amt_Chng_Q4_Q1='1.335', Total_Trans_Amt='1144', Total_Trans_Ct='42', Total_Ct_Chng_Q4_Q1='1.625', Avg_Utilization_Ratio='0.061', row_as_text='Customer_Age: 45; Gender: M; Dependent_count: 3; Education_Level: High School; Marital_Status: Married; Income_Category: $60K - $80K; Card_Category: Blue; Months_on_book: 39; Total_Relationship_Count: 5; Months_Inactive_12_mon: 1; Contacts_Count_12_mon: 3; Credit_Limit: 12691; Total_Revolving_Bal: 777; Avg_Open_To_Buy: 11914; Total_Amt_Chng_Q4_Q1: 1.335; Total_Trans_Amt: 1144; Total_Trans_Ct: 42; Total_Ct_Chng_Q4_Q1: 1.625; Avg_Utilization_R

In [37]:
import os

# Directory that the Chroma client and vector store below use as their storage path.
db_dir = "src/resources/embeddings/credit"

# Create the directory (and any missing parents); exist_ok=True keeps the cell
# re-runnable when the directory is already there.
os.makedirs(db_dir, exist_ok=True)

In [38]:
from langchain.vectorstores import Chroma
from src.utils.functions import hf_embeddings
import chromadb
import os

# Create a persistent Chroma client rooted at db_dir and wrap it in a
# LangChain vector store that embeds with hf_embeddings and is configured
# with "hnsw:space" set to "cosine".
client = chromadb.PersistentClient(path=db_dir)

vdb = Chroma(
    client=client,
    persist_directory=db_dir,
    embedding_function=hf_embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)

In [39]:
from langchain.vectorstores import Chroma
# Embed every collected row and write it to the Chroma collection in batches
# of `step` rows, tagging each entry with its customer_id as metadata.
#
# The store is built on the `client` created in the previous cell, matching
# the other cells, so no second implicit Chroma client is opened on db_dir.
#
# NOTE(review): add_texts is called without explicit ids, so re-running this
# cell presumably appends the same rows again rather than replacing them.
# Confirm, and clear db_dir before re-ingesting if so.
vdb = Chroma(
    client=client,
    persist_directory=db_dir,
    embedding_function=hf_embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)
for start in range(0, len(texts_list), step):
    # Slice once per batch so texts and metadata come from the same rows.
    batch = texts_list[start:start + step]
    texts = [row.row_as_text for row in batch]
    metadata = [{"customer_id": str(row.customer_id)} for row in batch]
    vdb.add_texts(texts, metadata)
    vdb.persist()

In [40]:
from langchain.vectorstores import Chroma
from src.utils.functions import hf_embeddings
import chromadb

# Re-create the persistent client and the LangChain wrapper on db_dir with
# the same settings as before; the next cell counts the stored entries.
client = chromadb.PersistentClient(path=db_dir)

vdb = Chroma(
    client=client,
    persist_directory=db_dir,
    embedding_function=hf_embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)

In [41]:
# Number of entries in the underlying Chroma collection (read through the
# wrapper's underscore-prefixed `_collection` attribute).
vdb._collection.count()

10127

In [42]:
# Row count of train_df, for comparison with the collection size.
train_df.count()

10127

In [44]:
# Export all customers, minus the label column, as the master CSV
# (header row written, pandas index omitted).
(
    df.drop("Attrition_Flag")
    .toPandas()
    .to_csv("src/resources/data/credit_master.csv", header=True, index=False)
)

In [43]:
# Count customers per Attrition_Flag value to see the class balance.
df.groupBy("Attrition_Flag").count().show()

+-----------------+-----+
|   Attrition_Flag|count|
+-----------------+-----+
|Existing Customer| 8500|
|Attrited Customer| 1627|
+-----------------+-----+



In [None]:
# spark.read.option("header", "true").csv("src/resources/data/movie_test.csv").show()

In [47]:
# Write up to 100 attrited customers, without the label column, as the test CSV.
#
# index=False keeps the pandas row index out of the file. Without it, to_csv
# adds an unnamed leading index column, so the file's columns would not match
# credit_master.csv, which is written with index=False.
#
# NOTE(review): limit(100) is applied without an explicit ordering, so which
# 100 rows are selected may not be stable across runs. Confirm whether a
# deterministic sample is needed.
(
    df.filter(F.col("Attrition_Flag") == "Attrited Customer")
    .limit(100)
    .drop("Attrition_Flag")
    .toPandas()
    .to_csv("src/resources/data/credit_test.csv", header=True, index=False)
)