In [3]:
from pyspark import SparkContext, StorageLevel
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.sql.types import ArrayType, StringType, BooleanType, IntegerType, StructType, StructField, FloatType
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, StopWordsRemover
from transformers import pipeline
import pandas as pd

In [4]:
spark = SparkSession.builder.appName("WILDCHAT-1M") \
    .config('spark.driver.memory', '24g') \
    .config('spark.executor.memory', '12g') \
    .config('spark.sql.debug.maxToStringFields', 1000) \
    .config("spark.default.parallelism", "10") \
    .master('local[8]') \
    .config("spark.driver.maxResultSize", "10g") \
    .getOrCreate()
spark

24/11/24 01:04:44 WARN Utils: Your hostname, Sharans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.30 instead (on interface en0)
24/11/24 01:04:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/24 01:04:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/24 01:04:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [7]:
main_df = spark.read.parquet('/Users/sharan/Desktop/IDMP Data/*.parquet')
main_df.count()

990372

In [None]:
main_df.filter(F.col('country').isNull()).

In [4]:
main_df.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- openai_moderation: array (nu

In [5]:
copy_df = main_df

In [6]:
copy_df = copy_df.drop('openai_moderation', 'detoxify_moderation', 'header')
copy_df.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- toxic: boolean (nullable = t

In [7]:
copy_df.filter(F.col('redacted') == True).select(F.col('redacted')).distinct().collect()

[Row(redacted=True)]

In [8]:
copy_df = copy_df.filter((F.col('language') == "English") & 
               (F.col('toxic') == False) & 
               (F.col('redacted') == False))

In [9]:
copy_df.count()

473265

In [10]:
copy_df.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- toxic: boolean (nullable = t

In [11]:
copy_df = copy_df.drop('toxic', 'redacted')

In [12]:
copy_df = copy_df.withColumn('conversation_explode', F.explode(F.col("conversation"))) \
    .withColumn('prompt', F.col('conversation_explode.content')) \
    .withColumn('turn_identifier', F.col('conversation_explode.turn_identifier')) \
    .drop('conversation_explode')

In [13]:
copy_df.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- state: string (nullable = tr

In [14]:
copy_df.filter(F.col('prompt').isNull()).count() # no prompt is empty, the data is clean

                                                                                

0

In [15]:
process_data = copy_df

In [16]:
#Preprocess 1 : Check for null values

null_counts = process_data.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in process_data.columns])
null_counts.show()

                                                                                

+-----------------+-----+---------+------------+----+--------+------+-------+---------+------+---------------+
|conversation_hash|model|timestamp|conversation|turn|language| state|country|hashed_ip|prompt|turn_identifier|
+-----------------+-----+---------+------------+----+--------+------+-------+---------+------+---------------+
|                0|    0|        0|           0|   0|       0|289644|   1990|        0|     0|              0|
+-----------------+-----+---------+------------+----+--------+------+-------+---------+------+---------------+



In [17]:
#Preprocess 2 : Update Null Values to empty strings
process_data = process_data.fillna({'state' : " ", 'country' : " "})
process_data.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in process_data.columns]).show()



+-----------------+-----+---------+------------+----+--------+-----+-------+---------+------+---------------+
|conversation_hash|model|timestamp|conversation|turn|language|state|country|hashed_ip|prompt|turn_identifier|
+-----------------+-----+---------+------------+----+--------+-----+-------+---------+------+---------------+
|                0|    0|        0|           0|   0|       0|    0|      0|        0|     0|              0|
+-----------------+-----+---------+------------+----+--------+-----+-------+---------+------+---------------+



                                                                                

In [18]:
#Preprocess 3 : Cleaning the prompt to remove all special characters.

import re

@F.udf(StringType())
def cleanText(prompt):
    if prompt:
        #clean_text = re.sub('[^a-zA-Z0-9]', '', prompt)
        #clean_text = re.sub(' \\-*/=:,.&|^%$@!%', r'\1', clean_text)
        # Remove all non-alphanumeric characters except specific ones (.-/=,:,&|^%@!%)
        clean_text = re.sub(r'[^a-zA-Z0-9\.\*/=:,.&|^%@!# ]', '', prompt)
        clean_text = clean_text.lower()
        # Replace multiple consecutive special characters with a single one
        clean_text = re.sub(r'([\-*/=:,.&|^%@!]#)\1+', r' \1 ', clean_text)
        clean_text = re.sub(r'(\d)([a-z])', r'\1 \2', clean_text)
        clean_text = re.sub(r'([.?])', r' \1 ', clean_text)
        clean_text = re.sub(r'\s+', ' ', clean_text).strip() 
        clean_text = re.sub(r'[\n\t]', ' ', clean_text)
        clean_text = clean_text.replace('/', '') 
        return clean_text
    return ''


process_data = process_data.withColumn('clean', cleanText(F.col('prompt')))

In [19]:
#Preprocess 4 : Trim Clean Prompt
process_data = process_data.withColumn('clean', F.trim(F.col('clean')))

In [20]:
process_data.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- state: string (nullable = fa

In [21]:
process_data = process_data.drop('conversation')

In [22]:
process_data.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- state: string (nullable = false)
 |-- country: string (nullable = false)
 |-- hashed_ip: string (nullable = true)
 |-- prompt: string (nullable = true)
 |-- turn_identifier: long (nullable = true)
 |-- clean: string (nullable = true)



In [23]:
#Preprocess 5 : Combining User Prompt and Bot Response
groupCols = [col for col in process_data.columns if col != 'prompt' and col != 'clean']
process_data = process_data.groupBy(groupCols).agg(F.concat_ws(' --botresp-- ', F.collect_list('prompt')).alias('full_interaction'),
                                                   F.concat_ws(' ', F.collect_list('clean')).alias('clean_interaction')
                                                  )

In [24]:
groupCols

['conversation_hash',
 'model',
 'timestamp',
 'turn',
 'language',
 'state',
 'country',
 'hashed_ip',
 'turn_identifier']

In [25]:
process_data.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- state: string (nullable = false)
 |-- country: string (nullable = false)
 |-- hashed_ip: string (nullable = true)
 |-- turn_identifier: long (nullable = true)
 |-- full_interaction: string (nullable = false)
 |-- clean_interaction: string (nullable = false)



In [26]:
#process_data.show()

In [27]:
#process_data.filter(F.col('conversation_hash') == '0000602c079a0926dca0a09f9817e6a9').show()

In [28]:
process_data.filter(F.col('conversation_hash') == '0000602c079a0926dca0a09f9817e6a9').limit(2).show(truncate=False)

+--------------------------------+------------------+-------------------+----+--------+-------+-------------+----------------------------------------------------------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [29]:
#Preprocess 5 :  last step
tk = Tokenizer(inputCol = 'clean_interaction', outputCol = 'tokenized_clean')
custom_stop_words = StopWordsRemover.loadDefaultStopWords("english") + ["\n", "\t", ""]
swr = StopWordsRemover(inputCol = 'tokenized_clean', outputCol = 'swr_clean_tokens', stopWords=custom_stop_words)
cv = CountVectorizer(inputCol = 'swr_clean_tokens', outputCol = 'raw_features', vocabSize=100000000, minDF =2.0)
idf = IDF(inputCol = 'raw_features', outputCol = 'tfidf_features')

In [30]:
pipeline = Pipeline(stages=[tk, swr, cv, idf])

In [31]:
pipeline_model = pipeline.fit(process_data)

24/11/23 13:30:08 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/11/23 13:32:01 WARN DAGScheduler: Broadcasting large task binary with size 17.0 MiB
24/11/23 13:32:41 WARN DAGScheduler: Broadcasting large task binary with size 17.0 MiB
                                                                                

In [32]:
processed_data = pipeline_model.transform(process_data)

In [33]:
processed_data.first()

24/11/23 13:33:18 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
                                                                                

Row(conversation_hash='00015586e531840601fc329d836f7a47', model='gpt-4-0125-preview', timestamp=datetime.datetime(2024, 3, 31, 8, 6, 27), turn=1, language='English', state='Hesse', country='Germany', hashed_ip='51716f1133f2661764ccb4a0c25135796bd3286d22d8fd0e9ece77a33e7fa874', turn_identifier=2598406, full_interaction='1_ Translate the following legal text into colloquial Farsi 2_ Place the Persian and English text side by side in the table 3_ From the beginning to the end of the text, there should be an English sentence on the left side and a Persian sentence on the right side.\n       4- Using legal language for Persian translation\n\n          .1\n\nThe Common law\nIn Anglo-Saxon times there existed three fairly distinct legal systems: The Dane Law, which had been adopted after the invasions and settlement of Danish and Scandinavian warriors in the coastal areas of northern and north- eastern England; Mercian Law, which bore traces of Germanic origin, fol- lowing the Saxon invasions

In [34]:

'''

# Filter TF-IDF features for words that appear more than 3 times
def extract_frequent_terms(tfidf_vector, vocab, threshold=2):
    # Extract indices and values
    indices = tfidf_vector.indices
    values = tfidf_vector.values
    
    # Map indices to terms and filter based on the threshold
    terms = [(vocab[i], val) for i, val in zip(indices, values) if val >= threshold]
    return terms

frequent_terms_udf = F.udf(lambda vec: extract_frequent_terms(vec, vocab), ArrayType(StructType([
    StructField("term", StringType(), True),
    StructField("tfidf", FloatType(), True)
])))

processed_data = processed_data.withColumn("frequent_terms", frequent_terms_udf(F.col("tfidf_features")))'''

'\n\n# Filter TF-IDF features for words that appear more than 3 times\ndef extract_frequent_terms(tfidf_vector, vocab, threshold=2):\n    # Extract indices and values\n    indices = tfidf_vector.indices\n    values = tfidf_vector.values\n    \n    # Map indices to terms and filter based on the threshold\n    terms = [(vocab[i], val) for i, val in zip(indices, values) if val >= threshold]\n    return terms\n\nfrequent_terms_udf = F.udf(lambda vec: extract_frequent_terms(vec, vocab), ArrayType(StructType([\n    StructField("term", StringType(), True),\n    StructField("tfidf", FloatType(), True)\n])))\n\nprocessed_data = processed_data.withColumn("frequent_terms", frequent_terms_udf(F.col("tfidf_features")))'

In [35]:
#Extract vocabulary from CountVectorizer
vocab = pipeline_model.stages[2].vocabulary

# Function to extract frequent terms from the TF-IDF vector
def extract_frequent_terms(tfidf_vector, vocab, threshold=2):
    # Convert the sparse vector to indices and values
    indices = tfidf_vector.indices.tolist()
    values = tfidf_vector.values.tolist()
    
    # Map indices to terms and filter by threshold
    terms = [vocab[i] for i, val in zip(indices, values) if val >= threshold if len(vocab[i]) > 1]
    return list(set(terms))

# Broadcast the vocabulary to avoid repeated serialization
broadcast_vocab = spark.sparkContext.broadcast(vocab)

# Define UDF with a helper function
def frequent_terms_udf(tfidf_vector):
    return extract_frequent_terms(tfidf_vector, broadcast_vocab.value)

# Register the UDF
frequent_terms_udf = F.udf(frequent_terms_udf, ArrayType(StringType()))

# Apply UDF to the DataFrame
processed_data = processed_data.withColumn(
    "frequent_terms",
    frequent_terms_udf(F.col("tfidf_features"))
)

processed_data.show()

24/11/23 13:33:59 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
[Stage 38:>                                                         (0 + 1) / 1]

+--------------------+------------------+-------------------+----+--------+--------------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   conversation_hash|             model|          timestamp|turn|language|               state|        country|           hashed_ip|turn_identifier|    full_interaction|   clean_interaction|     tokenized_clean|    swr_clean_tokens|        raw_features|      tfidf_features|      frequent_terms|
+--------------------+------------------+-------------------+----+--------+--------------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|00015586e53184060...|gpt-4-0125-preview|2024-03-31 08:06:27|   1| English|               Hesse|        Germa

                                                                                

In [36]:
processed_data.select('clean_interaction', 'frequent_terms').first()

24/11/23 13:34:37 WARN DAGScheduler: Broadcasting large task binary with size 35.9 MiB
                                                                                

Row(clean_interaction='as a prompt generator for a generative ai called midjourney , you will create image prompts for the ai to visualize . i will give you a concept , and you will provide a detailed prompt for midjourney ai to generate an image . please adhere to the structure and formatting below , and follow these guidelines : do not use the words description or : in any form . do not place a comma between ar and v . write each prompt in one line without using return . structure : 1 = 2 = a detailed description of 1 with specific imagery details . 3 = a detailed description of the scenes environment . 4 = a detailed description of the compositions . 5 = a detailed description of the scenes mood , feelings , and atmosphere . 6 = a style e . g . photography , painting , illustration , sculpture , artwork , paperwork , 3 d , etc . for 1 . 7 = a detailed description of the scenes mood , feelings , and atmosphere . ar = use ar 16 : 9 for horizontal images , ar 9 : 16 for vertical images

In [37]:
processed_data.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- state: string (nullable = false)
 |-- country: string (nullable = false)
 |-- hashed_ip: string (nullable = true)
 |-- turn_identifier: long (nullable = true)
 |-- full_interaction: string (nullable = false)
 |-- clean_interaction: string (nullable = false)
 |-- tokenized_clean: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- swr_clean_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- raw_features: vector (nullable = true)
 |-- tfidf_features: vector (nullable = true)
 |-- frequent_terms: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [38]:
processed_data = processed_data.drop('tfidf_features', 'raw_features', 'swr_clean_tokens', 'tokenized_clean')

In [39]:
processed_data.show()

24/11/23 13:35:16 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
[Stage 44:>                                                         (0 + 1) / 1]

+--------------------+------------------+-------------------+----+--------+--------------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+
|   conversation_hash|             model|          timestamp|turn|language|               state|        country|           hashed_ip|turn_identifier|    full_interaction|   clean_interaction|      frequent_terms|
+--------------------+------------------+-------------------+----+--------+--------------------+---------------+--------------------+---------------+--------------------+--------------------+--------------------+
|00015586e53184060...|gpt-4-0125-preview|2024-03-31 08:06:27|   1| English|               Hesse|        Germany|51716f1133f266176...|        2598406|1_ Translate the ...|1 translate the f...|[produce, farsi, ...|
|0005e8a06361dea95...|gpt-3.5-turbo-0301|2023-06-19 06:32:12|   7| English|         Alba County|        Romania|35207df0ab6a29f2c...|         948797

                                                                                

In [40]:
processed_data.first()

24/11/23 13:35:54 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
                                                                                

Row(conversation_hash='00015586e531840601fc329d836f7a47', model='gpt-4-0125-preview', timestamp=datetime.datetime(2024, 3, 31, 8, 6, 27), turn=1, language='English', state='Hesse', country='Germany', hashed_ip='51716f1133f2661764ccb4a0c25135796bd3286d22d8fd0e9ece77a33e7fa874', turn_identifier=2598406, full_interaction='1_ Translate the following legal text into colloquial Farsi 2_ Place the Persian and English text side by side in the table 3_ From the beginning to the end of the text, there should be an English sentence on the left side and a Persian sentence on the right side.\n       4- Using legal language for Persian translation\n\n          .1\n\nThe Common law\nIn Anglo-Saxon times there existed three fairly distinct legal systems: The Dane Law, which had been adopted after the invasions and settlement of Danish and Scandinavian warriors in the coastal areas of northern and north- eastern England; Mercian Law, which bore traces of Germanic origin, fol- lowing the Saxon invasions

In [41]:
result_data = processed_data

In [42]:
result_data.persist()

DataFrame[conversation_hash: string, model: string, timestamp: timestamp, turn: bigint, language: string, state: string, country: string, hashed_ip: string, turn_identifier: bigint, full_interaction: string, clean_interaction: string, frequent_terms: array<string>]

In [43]:
result_data.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- state: string (nullable = false)
 |-- country: string (nullable = false)
 |-- hashed_ip: string (nullable = true)
 |-- turn_identifier: long (nullable = true)
 |-- full_interaction: string (nullable = false)
 |-- clean_interaction: string (nullable = false)
 |-- frequent_terms: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [44]:
#Keyword search 
keyword = "Camera".lower()

In [45]:
results = result_data.filter(F.array_contains(F.col('frequent_terms') , keyword), )
results.count()

24/11/23 13:36:32 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
24/11/23 13:39:25 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB
                                                                                

93156

In [46]:
results.first()

24/11/23 13:39:51 WARN DAGScheduler: Broadcasting large task binary with size 36.0 MiB


Row(conversation_hash='005d4afb406688bb0fa599c093f6887d', model='gpt-3.5-turbo-0613', timestamp=datetime.datetime(2023, 9, 1, 17, 34, 58), turn=3, language='English', state='California', country='United States', hashed_ip='92e85fee18c90c90a814f4d4439359098fb83656abfb7f159d894e1e513a80e9', turn_identifier=1474194, full_interaction='Jacob Limacher was born in Luzern, Switzerland on February 20 1858.\nJacob’s Arrival to New York in 1879 . He is now 21 years old. Citizenship in 1905.\n\nMary Imfeld was born in Alpnach,Obwalden, Switzerland on May 19 1856.\nMary Arrival to New York in 1882, She is now 26 years old . Citizenship in 1905.\n\nJacob spent some time in San Francisco , but ended up in Pasadena Calif,\nas did Mary Imfeld. they meet and in 1884 they were married\n\nJacob Limacher was a Rancher and by 1900 we find them on 45 acres of land in \nthe Arroyo Saco in Pasadena Calif. (This piece of land according to the Deed was \npurchased with a ten dollar gold coin). They had built a c

#Testing

In [None]:
#classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device='mps')
#labels = ['code', 'plain text']

In [None]:
'''sc = SparkContext.getOrCreate()
classifier_broadcast = sc.broadcast((classifier, labels))

def textClassifier(prompt):
    classifier, labels = classifier_broadcast.value
    result = classifier(prompt, labels)
    return result["labels"][0]

classify_udf = F.udf(textClassifier, StringType())
process_data.withColumn('prompt-type', classify_udf(F.col('clean'))).distinct().persist().collect() '''
    

In [None]:
'''# Function to classify a batch of prompts
def classify_batch(prompts):
    results = []
    for prompt in prompts:
        try:
            result = classifier(prompt, labels)
            results.append(result['labels'][0])  # Add the top label
        except Exception as e:
            results.append("error")  # Handle any classification errors
    return results'''

In [None]:
#clean_prompts = process_data.select('clean').toPandas()

In [None]:
'''from transformers import pipeline

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device = 'mps')

# Define the input text
input_text = 'Hey there! Are you familiar with reality shifting? So, I’m refining a foolproof method for reality shifting and want to pick a destination. Want to help me? I’m thinking something pretty personalized. There are a few things that are required of my destination. 1. The quest. I have to have a clear overarching goal in my reality, and don’t make it too crazy. It should be more along the lines of “save the president’s daughter” or “escape this weird wacky sinister place” NOT “get an artifact that literally controls reality”. Seriously, don’t make me fetch an artifact, or fetch anything. Instead, make me DO something. 2. Babes. I need pretty girls. 3. The entry. I need to get to lose consciousness in order to begin my journey in my desired reality, preferably by having it knocked out by one of the aforementioned babes. 4. Action. It needs to be cool. 5. Unconsciousness. Myself and the babes need to pass out in this place, preferably by being knocked out in some way or fainting. And it should happen, like, a lot. With these requirements in mind, you got any unique refined ideas? Don’t be vague, be extremely specific. Also, make your response as long and detailed as possible. Be super specific, especially when describing the world. The world should be self-contained and relatively small/understandable. Also, try to be conversational. Describe the world well.'


# Define possible labels for classification
labels = ["code", "plain text"]

# Perform classification
result = classifier(input_text, labels)

# Print the classification result
print(result)
print("Classification:", result["labels"][0])  # The most likely label'''