In [1]:
data_path = "src/resources/data/movie.csv"

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("customer_look_alike_modelling").getOrCreate()
df = spark.read.option("header", "true").csv(data_path)
df.limit(5).show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/12 20:54:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+---+-------------+-----------------+-------------+--------------------------+----------------+------+
|Age|FrequentFlyer|AnnualIncomeClass|ServicesOpted|AccountSyncedToSocialMedia|BookedHotelOrNot|Target|
+---+-------------+-----------------+-------------+--------------------------+----------------+------+
| 34|           No|    Middle Income|            6|                        No|             Yes|     0|
| 34|          Yes|       Low Income|            5|                       Yes|              No|     1|
| 37|           No|    Middle Income|            3|                       Yes|              No|     0|
| 30|           No|    Middle Income|            2|                        No|              No|     0|
| 30|           No|       Low Income|            1|                        No|              No|     0|
+---+-------------+-----------------+-------------+--------------------------+----------------+------+



In [3]:
from pyspark.sql import functions as F
df = df.withColumnRenamed("BookedHotelOrNot", "BookedFoodOrNot")
df = df.withColumnRenamed("FrequentFlyer", "FrequentWatcher")
df = df.withColumn("Id", F.monotonically_increasing_id())
df = df.withColumn("temp", F.col("ServicesOpted") + 2)
df = df.drop("ServicesOpted").withColumn("ServicesOpted", F.col("temp")).drop("temp")
df.limit(20).show()

+---+---------------+-----------------+--------------------------+---------------+------+---+-------------+
|Age|FrequentWatcher|AnnualIncomeClass|AccountSyncedToSocialMedia|BookedFoodOrNot|Target| Id|ServicesOpted|
+---+---------------+-----------------+--------------------------+---------------+------+---+-------------+
| 34|             No|    Middle Income|                        No|            Yes|     0|  0|          8.0|
| 34|            Yes|       Low Income|                       Yes|             No|     1|  1|          7.0|
| 37|             No|    Middle Income|                       Yes|             No|     0|  2|          5.0|
| 30|             No|    Middle Income|                        No|             No|     0|  3|          4.0|
| 30|             No|       Low Income|                        No|             No|     0|  4|          3.0|
| 27|            Yes|      High Income|                        No|            Yes|     1|  5|          3.0|
| 34|             No|    Mid

In [4]:
df = df.withColumn("temp", F.col("ServicesOpted").cast("int"))
df = df.drop("ServicesOpted").withColumn("ServicesOpted", F.col("temp")).drop("temp")
df.show()

+---+---------------+-----------------+--------------------------+---------------+------+---+-------------+
|Age|FrequentWatcher|AnnualIncomeClass|AccountSyncedToSocialMedia|BookedFoodOrNot|Target| Id|ServicesOpted|
+---+---------------+-----------------+--------------------------+---------------+------+---+-------------+
| 34|             No|    Middle Income|                        No|            Yes|     0|  0|            8|
| 34|            Yes|       Low Income|                       Yes|             No|     1|  1|            7|
| 37|             No|    Middle Income|                       Yes|             No|     0|  2|            5|
| 30|             No|    Middle Income|                        No|             No|     0|  3|            4|
| 30|             No|       Low Income|                        No|             No|     0|  4|            3|
| 27|            Yes|      High Income|                        No|            Yes|     1|  5|            3|
| 34|             No|    Mid

In [5]:
df.columns

['Age',
 'FrequentWatcher',
 'AnnualIncomeClass',
 'AccountSyncedToSocialMedia',
 'BookedFoodOrNot',
 'Target',
 'Id',
 'ServicesOpted']

In [6]:
rows_to_convert = df.columns
rows_to_convert.remove("Id")
rows_to_convert.remove("Target")
rows_to_convert

['Age',
 'FrequentWatcher',
 'AnnualIncomeClass',
 'AccountSyncedToSocialMedia',
 'BookedFoodOrNot',
 'ServicesOpted']

In [7]:
",".join(rows_to_convert)

'Age,FrequentWatcher,AnnualIncomeClass,AccountSyncedToSocialMedia,BookedFoodOrNot,ServicesOpted'

In [8]:
df.groupBy("Target").count().show()

+------+-----+
|Target|count|
+------+-----+
|     0|  730|
|     1|  224|
+------+-----+



In [9]:
df.show()

+---+---------------+-----------------+--------------------------+---------------+------+---+-------------+
|Age|FrequentWatcher|AnnualIncomeClass|AccountSyncedToSocialMedia|BookedFoodOrNot|Target| Id|ServicesOpted|
+---+---------------+-----------------+--------------------------+---------------+------+---+-------------+
| 34|             No|    Middle Income|                        No|            Yes|     0|  0|            8|
| 34|            Yes|       Low Income|                       Yes|             No|     1|  1|            7|
| 37|             No|    Middle Income|                       Yes|             No|     0|  2|            5|
| 30|             No|    Middle Income|                        No|             No|     0|  3|            4|
| 30|             No|       Low Income|                        No|             No|     0|  4|            3|
| 27|            Yes|      High Income|                        No|            Yes|     1|  5|            3|
| 34|             No|    Mid

In [10]:
df.drop("Target").toPandas().to_csv("src/resources/data/movie_master.csv", header=True, index=False)

In [11]:
from src.utils.functions import get_row_as_text, hf_embeddings, get_ars_retrieved_df
train_df = get_row_as_text(df, rows_to_convert)

In [12]:
train_df.select("row_as_text").show(2, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------+
|row_as_text                                                                                                                           |
+--------------------------------------------------------------------------------------------------------------------------------------+
|Age: 34; FrequentWatcher: No; AnnualIncomeClass: Middle Income; AccountSyncedToSocialMedia: No; BookedFoodOrNot: Yes; ServicesOpted: 8|
|Age: 34; FrequentWatcher: Yes; AnnualIncomeClass: Low Income; AccountSyncedToSocialMedia: Yes; BookedFoodOrNot: No; ServicesOpted: 7  |
+--------------------------------------------------------------------------------------------------------------------------------------+
only showing top 2 rows



In [13]:
train_df.printSchema()

root
 |-- Age: string (nullable = true)
 |-- FrequentWatcher: string (nullable = true)
 |-- AnnualIncomeClass: string (nullable = true)
 |-- AccountSyncedToSocialMedia: string (nullable = true)
 |-- BookedFoodOrNot: string (nullable = true)
 |-- Target: string (nullable = true)
 |-- Id: long (nullable = false)
 |-- ServicesOpted: integer (nullable = true)
 |-- row_as_text: string (nullable = false)



In [14]:
step = 1500
k = 4000

texts_list = train_df.rdd.collect()
# texts_list = [x[0] for x in texts_label_list]
# texts_list
texts_list[0]

Row(Age='34', FrequentWatcher='No', AnnualIncomeClass='Middle Income', AccountSyncedToSocialMedia='No', BookedFoodOrNot='Yes', Target='0', Id=0, ServicesOpted=8, row_as_text='Age: 34; FrequentWatcher: No; AnnualIncomeClass: Middle Income; AccountSyncedToSocialMedia: No; BookedFoodOrNot: Yes; ServicesOpted: 8')

In [65]:
import os

# Specify the path of the directory you want to create
db_dir = "src/resources/embeddings/movie"

# Create the directory
os.makedirs(db_dir, exist_ok=True)

In [66]:
from langchain.vectorstores import Chroma
from src.utils.functions import hf_embeddings
import chromadb
import os

client = chromadb.PersistentClient(path=db_dir)

vdb = Chroma(persist_directory=db_dir, embedding_function=hf_embeddings,
                 collection_metadata={"hnsw:space": "cosine"}, client=client)

In [67]:
from langchain.vectorstores import Chroma
vdb = Chroma(persist_directory=db_dir, embedding_function=hf_embeddings, collection_metadata={"hnsw:space": "cosine"})
for i in range(0, len(texts_list), step):
    texts = [x.row_as_text for x in texts_list[i:i+step]]
    metadata = [{"Id": str(x.Id), "Target": str(x.Target)} for x in texts_list[i:i+step]]
    vdb.add_texts(texts, metadata)

vdb.persist()

OperationalError: attempt to write a readonly database

In [None]:
from langchain.vectorstores import Chroma
from src.utils.functions import hf_embeddings
import chromadb

client = chromadb.PersistentClient(path=db_dir)

vdb = Chroma(persist_directory=db_dir, embedding_function=hf_embeddings,
                 collection_metadata={"hnsw:space": "cosine"}, client=client)

In [None]:
vdb._collection.count()

In [24]:
# db_dir

In [25]:
# train_df.filter(F.col("Target") == 1).drop("row_as_text").toPandas().to_csv("src/resources/data/movie_test.csv", header=True, index=False)

In [26]:
# spark.read.option("header", "true").csv("src/resources/data/movie_test.csv").show()

In [None]:
df_input = train_df.drop("row_as_text")
df_input.show()

In [38]:
df_input = df_input.filter(F.col("Target") == 1).limit(10).union(df_input.filter(F.col("Target") == 0).limit(10))

In [39]:
df_input.show()

+---+---------------+-----------------+-------------+--------------------------+---------------+------+---+
|Age|FrequentWatcher|AnnualIncomeClass|ServicesOpted|AccountSyncedToSocialMedia|BookedFoodOrNot|Target| Id|
+---+---------------+-----------------+-------------+--------------------------+---------------+------+---+
| 34|            Yes|       Low Income|            5|                       Yes|             No|     1|  1|
| 27|            Yes|      High Income|            1|                        No|            Yes|     1|  5|
| 34|             No|       Low Income|            2|                       Yes|             No|     1|  7|
| 36|            Yes|      High Income|            1|                        No|             No|     1|  9|
| 28|             No|    Middle Income|            2|                        No|             No|     1| 11|
| 37|             No|       Low Income|            2|                       Yes|             No|     1| 19|
| 31|            Yes|      H

In [40]:
df_input.drop("Target").toPandas().to_csv("src/resources/data/movie_test.csv", header=True, index=False)

In [37]:
df_input.groupBy("Target").count().show()

+------+-----+
|Target|count|
+------+-----+
|     0|  730|
|     1|  224|
+------+-----+

