In [1]:
# Relative path to the superstore CSV that is loaded into Spark below.
data_path = "src/resources/data/superstore.csv"

In [2]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("customer_look_alike_modelling")
    .getOrCreate()
)

# Read the CSV using its first line as column names, then preview five rows.
df = spark.read.csv(data_path, header=True)
df.limit(5).show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/12 08:29:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/12 08:29:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+-----+----------+----------+--------------+------+-------+--------+-----------+-------+--------+---------+---------------+---------------+----------------+------------+-----------------+---------------+-------------------+-----------------+-----------------+--------+--------+
|   Id|Year_Birth| Education|Marital_Status|Income|Kidhome|Teenhome|Dt_Customer|Recency|MntWines|MntFruits|MntMeatProducts|MntFishProducts|MntSweetProducts|MntGoldProds|NumDealsPurchases|NumWebPurchases|NumCatalogPurchases|NumStorePurchases|NumWebVisitsMonth|Response|Complain|
+-----+----------+----------+--------------+------+-------+--------+-----------+-------+--------+---------+---------------+---------------+----------------+------------+-----------------+---------------+-------------------+-----------------+-----------------+--------+--------+
| 1826|      1970|Graduation|      Divorced| 84835|      0|       0|  6/16/2014|      0|     189|      104|            379|            111|             189|         2

In [3]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Ask Spark for today's date once and keep it as a driver-side value, so every
# row is compared against the same date. The result depends on the run date.
current_date = spark.sql("SELECT CURRENT_DATE()").first()[0]

# Replace the raw sign-up date with the number of months between it and today
# (cast to an integer column); the helper date column is dropped again.
df = (
    df.withColumn("Curr_Date", F.lit(current_date))
    .withColumn(
        "MonthsCustomer",
        F.months_between(
            F.col("Curr_Date"),
            F.to_date(F.col("DT_customer"), "M/d/yyyy"),
        ).cast(IntegerType()),
    )
    .drop("DT_customer", "Curr_Date")
)

In [4]:
# List the column names after the date transformation (Dt_Customer is gone, MonthsCustomer is new).
df.columns

['Id',
 'Year_Birth',
 'Education',
 'Marital_Status',
 'Income',
 'Kidhome',
 'Teenhome',
 'Recency',
 'MntWines',
 'MntFruits',
 'MntMeatProducts',
 'MntFishProducts',
 'MntSweetProducts',
 'MntGoldProds',
 'NumDealsPurchases',
 'NumWebPurchases',
 'NumCatalogPurchases',
 'NumStorePurchases',
 'NumWebVisitsMonth',
 'Response',
 'Complain',
 'MonthsCustomer']

In [5]:

# Columns that are passed to get_row_as_text: every column except the customer
# identifier and the Response label.
# TODO: rename to cols_to_convert — the list holds column names, not rows.
rows_to_convert = df.columns
for excluded_column in ("Id", "Response"):
    rows_to_convert.remove(excluded_column)
rows_to_convert

['Year_Birth',
 'Education',
 'Marital_Status',
 'Income',
 'Kidhome',
 'Teenhome',
 'Recency',
 'MntWines',
 'MntFruits',
 'MntMeatProducts',
 'MntFishProducts',
 'MntSweetProducts',
 'MntGoldProds',
 'NumDealsPurchases',
 'NumWebPurchases',
 'NumCatalogPurchases',
 'NumStorePurchases',
 'NumWebVisitsMonth',
 'Complain',
 'MonthsCustomer']

In [13]:
# Show the same column names as a single comma-separated string.
",".join(rows_to_convert)

'Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,MonthsCustomer'

In [14]:
# Class balance as a (non-responders, responders) tuple.
tuple(df.filter(F.col("Response") == label).count() for label in (0, 1))

(1906, 334)

In [20]:
# Export the full customer table, without the Response label, as the master CSV.
(
    df.drop("Response")
    .toPandas()
    .to_csv("src/resources/data/superstore_master.csv", header=True, index=False)
)

In [17]:
from src.utils.functions import get_row_as_text, hf_embeddings, get_ars_retrieved_df
# Build train_df: the input frame plus a `row_as_text` column (see the schema printed
# below) holding "column: value; ..." text for each row.
# NOTE(review): the sample row printed further down contains "Response: 1" inside
# row_as_text even though rows_to_convert excludes "Response". That looks like the label
# leaking into the embedded text (stale kernel state, or the helper not honouring the
# column list?) — confirm before trusting anything built on these embeddings.
train_df = get_row_as_text(df, rows_to_convert)

In [28]:
# Preview the generated text for two rows without truncating the column.
train_df.select("row_as_text").show(2, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|row_as_text                                                                                                                                                                                                                                                                                                                                                                                            |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [29]:
# Print the column types; the CSV columns are all strings, only the derived MonthsCustomer is an integer.
train_df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Year_Birth: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Income: string (nullable = true)
 |-- Kidhome: string (nullable = true)
 |-- Teenhome: string (nullable = true)
 |-- Recency: string (nullable = true)
 |-- MntWines: string (nullable = true)
 |-- MntFruits: string (nullable = true)
 |-- MntMeatProducts: string (nullable = true)
 |-- MntFishProducts: string (nullable = true)
 |-- MntSweetProducts: string (nullable = true)
 |-- MntGoldProds: string (nullable = true)
 |-- NumDealsPurchases: string (nullable = true)
 |-- NumWebPurchases: string (nullable = true)
 |-- NumCatalogPurchases: string (nullable = true)
 |-- NumStorePurchases: string (nullable = true)
 |-- NumWebVisitsMonth: string (nullable = true)
 |-- Response: string (nullable = true)
 |-- Complain: string (nullable = true)
 |-- MonthsCustomer: integer (nullable = true)
 |-- row_as_text: string (nullable = 

In [31]:
# Number of rows written to the vector store per add_texts call.
step = 1000
# NOTE(review): `k` is not referenced by any cell visible here — presumably a
# retrieval depth used elsewhere; confirm or remove.
k = 4000

# Bring every row to the driver as a list of Row objects. DataFrame.collect()
# returns the same Rows as going through `.rdd`, without the extra
# DataFrame -> RDD conversion.
# Despite the name, each element is a full Row (Id, feature columns, row_as_text),
# not a bare string.
texts_list = train_df.collect()
texts_list[0]

Row(Id='1826', Year_Birth='1970', Education='Graduation', Marital_Status='Divorced', Income='84835', Kidhome='0', Teenhome='0', Recency='0', MntWines='189', MntFruits='104', MntMeatProducts='379', MntFishProducts='111', MntSweetProducts='189', MntGoldProds='218', NumDealsPurchases='1', NumWebPurchases='4', NumCatalogPurchases='4', NumStorePurchases='6', NumWebVisitsMonth='1', Response='1', Complain='0', MonthsCustomer=116, row_as_text='Year_Birth: 1970; Education: Graduation; Marital_Status: Divorced; Income: 84835; Kidhome: 0; Teenhome: 0; Recency: 0; MntWines: 189; MntFruits: 104; MntMeatProducts: 379; MntFishProducts: 111; MntSweetProducts: 189; MntGoldProds: 218; NumDealsPurchases: 1; NumWebPurchases: 4; NumCatalogPurchases: 4; NumStorePurchases: 6; NumWebVisitsMonth: 1; Response: 1; Complain: 0; MonthsCustomer: 116')

In [33]:
import os

# Directory that holds the persisted Chroma store for the superstore dataset.
db_dir = "src/resources/embeddings/superstore"

# Create the directory (including missing parents); exist_ok=True makes re-running the cell harmless.
os.makedirs(db_dir, exist_ok=True)

In [34]:
from langchain.vectorstores import Chroma
from src.utils.functions import hf_embeddings
import chromadb
import os

# Open an on-disk Chroma client rooted at db_dir.
client = chromadb.PersistentClient(path=db_dir)

# LangChain wrapper over that client; the collection is configured for cosine distance.
vdb = Chroma(
    client=client,
    persist_directory=db_dir,
    embedding_function=hf_embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)

In [35]:
from langchain.vectorstores import Chroma
# Open the persisted collection (cosine distance) that the rows are written into.
vdb = Chroma(persist_directory=db_dir, embedding_function=hf_embeddings, collection_metadata={"hnsw:space": "cosine"})

# Use the customer Id as the Chroma document id. Without explicit ids every run of
# this cell stores the whole dataset again under freshly generated ids, so the
# collection grows by len(texts_list) on each re-run. With stable ids a re-run
# overwrites the same records instead.
# Records written by earlier runs with auto-generated ids are NOT replaced; clear
# db_dir once if the store was already populated that way.
customer_ids = [str(row.Id) for row in texts_list]
if len(set(customer_ids)) != len(customer_ids):
    # Duplicate ids would silently overwrite each other across batches.
    raise ValueError("Id values are not unique; cannot use them as vector store ids")

# Write in batches of `step` rows; each record carries its customer Id as metadata.
for i in range(0, len(texts_list), step):
    batch = texts_list[i:i + step]
    texts = [x.row_as_text for x in batch]
    metadata = [{"Id": str(x.Id)} for x in batch]
    vdb.add_texts(texts, metadata, ids=customer_ids[i:i + step])

vdb.persist()

In [36]:
from langchain.vectorstores import Chroma
from src.utils.functions import hf_embeddings
import chromadb

# Re-open the persisted store through an explicit on-disk client.
client = chromadb.PersistentClient(path=db_dir)

# Same configuration as when the store was created: cosine distance, shared embedding function.
vdb = Chroma(
    client=client,
    persist_directory=db_dir,
    embedding_function=hf_embeddings,
    collection_metadata={"hnsw:space": "cosine"},
)

In [37]:
# Count the records stored in the collection; `_collection` is an underscore-prefixed (non-public) attribute of the wrapper.
vdb._collection.count()

2240

In [38]:
# Show how many rows fall under each Response value.
# NOTE(review): the output saved under this cell is a path string, not the table this
# call prints — the cell looks stale and should be re-run.
train_df.groupBy("Response").count().show()

'src/resources/embeddings/superstore'

In [41]:
# Write up to 50 responders (Response == 1) to the test CSV, without the row_as_text column.
# NOTE(review): limit(50) has no explicit ordering, so which rows are picked may
# not be stable across runs — confirm this is acceptable.
(
    train_df.filter(F.col("Response") == 1)
    .drop("row_as_text")
    .limit(50)
    .toPandas()
    .to_csv("src/resources/data/superstore_test.csv", header=True, index=False)
)

In [26]:
# spark.read.option("header", "true").csv("src/resources/data/movie_test.csv").show()

In [39]:
# Show how many rows fall under each Response value (0 vs 1).
train_df.groupBy("Response").count().show()

+--------+-----+
|Response|count|
+--------+-----+
|       0| 1906|
|       1|  334|
+--------+-----+



In [11]:
# Load the previously written test CSV, using its first line as column names.
df_input = spark.read.csv("src/resources/data/superstore_test.csv", header=True)

In [19]:
# Remove the Response label and write the result back over the same CSV that
# df_input was read from. toPandas() pulls the rows to the driver before
# to_csv() opens the path for writing.
(
    df_input.drop("Response")
    .toPandas()
    .to_csv("src/resources/data/superstore_test.csv", header=True, index=False)
)