# 1M WILDCHAT DATASET PREPROCESSING
This Jupyter notebook details the preprocessing of the 1M WildChat dataset to suit the requirements of the project. The preprocessing is performed using PySpark.

In [1]:
import findspark
# Locate the local Spark installation so that `pyspark` can be imported below.
findspark.init()
# Show the Spark location that was found (rendered as the cell output).
findspark.find()

'/opt/anaconda3/lib/python3.12/site-packages/pyspark'

In [3]:
#Importing all essential libraries for the processing
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.context import SparkContext
from pyspark.sql.types import ArrayType, StringType, BooleanType

In [5]:
# Author's note: extra driver memory helps with the chat aggregations, and
# spark.sql.debug.maxToStringFields is raised to accommodate the larger chat
# and response strings.
spark = (
    SparkSession.builder
    .appName('WildCHAT1M')
    .config("spark.driver.memory", "24g")
    .config("spark.executor.memory", "8g")
    .config("spark.sql.debug.maxToStringFields", 1000)
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

24/11/10 00:50:02 WARN Utils: Your hostname, Sharans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.30 instead (on interface en0)
24/11/10 00:50:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/10 00:50:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Display the active SparkSession as the cell output.
spark

In [9]:
# Load every parquet file in the local data directory into one DataFrame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
spark_df = spark.read.parquet("/Users/sharan/Desktop/IDMP Data/*.parquet")

In [11]:
# Preview five rows of the raw dataset (long values are truncated by show()).
spark_df.limit(5).show()

+--------------------+----------+-------------------+--------------------+----+--------+--------------------+--------------------+-----+--------+------------+-------------+--------------------+--------------------+
|   conversation_hash|     model|          timestamp|        conversation|turn|language|   openai_moderation| detoxify_moderation|toxic|redacted|       state|      country|           hashed_ip|              header|
+--------------------+----------+-------------------+--------------------+----+--------+--------------------+--------------------+-----+--------+------------+-------------+--------------------+--------------------+
|698e02bae74e1ca4e...|gpt-4-0314|2023-04-08 20:01:06|[{POUVEZ VOUS ME ...|   1|  French|[{{false, false, ...|[{0.0120244864374...| true|   false|       Dakar|      Senegal|cc4eb1e4234c16afc...|{fr,fr-FR;q=0.9,e...|
|c9ec5b440fbdd2a26...|gpt-4-0314|2023-04-08 20:02:53|[{Hey there! Are ...|   1| English|[{{false, false, ...|[{2.0589135237969...|false|   f

In [13]:
# Keep only records that are in English, contain no redacted information and are non-toxic.
is_clean_english = (
    (F.col('redacted') == False)
    & (F.col('toxic') == False)
    & (F.col('language') == 'English')
)
spark_df = spark_df.filter(is_clean_english)

In [15]:
# Number of rows remaining after the English / non-redacted / non-toxic filter.
spark_df.count()

473265

In [17]:
# Confirm that only English-language interactions remain.
spark_df.select('language').distinct().collect()

[Row(language='English')]

In [19]:
# Confirm that only non-redacted interactions remain.
spark_df.select('redacted').distinct().collect()

[Row(redacted=False)]

In [21]:
# Confirm that only non-toxic interactions remain.
spark_df.select('toxic').distinct().collect()

[Row(toxic=False)]

In [23]:
# Inspect the full schema, including the nested `conversation` struct.
spark_df.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- openai_moderation: array (nu

In [25]:
# Drop the top-level openai_moderation, detoxify_moderation and header columns for a cleaner schema.
# The `header` struct nested inside each `conversation` element is not affected by this drop.
spark_df = spark_df.drop("openai_moderation", "detoxify_moderation", "header")

In [27]:
# Re-check the schema after dropping the moderation and header columns.
spark_df.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- toxic: boolean (nullable = t

In [29]:
# Print only the schema of the nested `conversation` column.
spark_df.select('conversation').printSchema()

root
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)



The conversation field is an array of structs, where each struct holds one message of a user–bot interaction. The fields of each struct are printed in the schema above; the ones used below are:
 * <b> content </b> contains the prompt / response
 * <b> role </b> specifies whether the content was generated by the user or by the bot
 * <b> turn_identifier </b> specifies an identifying value to isolate a specific chat interaction.

Exploding the conversation field to separate the user prompts and bot responses.

In [31]:
# Keep a reference to the filtered DataFrame as it is before `conversation` is exploded.
main_df = spark_df

In [33]:
# Explode `conversation` into one row per message, then lift just the message
# content and turn_identifier out of each struct into top-level columns.
spark_df = (
    spark_df
    .withColumn('exploded_conversation', F.explode(F.col('conversation')))
    .withColumn("content", F.col("exploded_conversation.content"))
    .withColumn("turn_identifier", F.col("exploded_conversation.turn_identifier"))
)


In [35]:
# Preview exploded rows that belong to two-turn conversations.
spark_df.filter(F.col('turn') == 2).limit(4).show()

+--------------------+----------+-------------------+--------------------+----+--------+-----+--------+--------+-----------+--------------------+---------------------+--------------------+---------------+
|   conversation_hash|     model|          timestamp|        conversation|turn|language|toxic|redacted|   state|    country|           hashed_ip|exploded_conversation|             content|turn_identifier|
+--------------------+----------+-------------------+--------------------+----+--------+-----+--------+--------+-----------+--------------------+---------------------+--------------------+---------------+
|bac147235b8a766c1...|gpt-4-0314|2023-04-08 20:36:45|[{Write a ‘Simple...|   2| English|false|   false|Auckland|New Zealand|0ef4c4d5ec3b79928...| {Write a ‘Simple ...|Write a ‘Simple m...|         101104|
|bac147235b8a766c1...|gpt-4-0314|2023-04-08 20:36:45|[{Write a ‘Simple...|   2| English|false|   false|Auckland|New Zealand|0ef4c4d5ec3b79928...| {Arya, I've been ...|Arya, I've be

In [37]:
# The exploded struct is no longer needed now that content and turn_identifier are top-level columns.
spark_df = spark_df.drop("exploded_conversation")

In [39]:
# Combine prompt and response based on turn_identifier.
# Group on every column except `content`, so rows that differ only in their
# message text are merged together.
group_cols = [name for name in spark_df.columns if name != 'content']
group_cols

['conversation_hash',
 'model',
 'timestamp',
 'conversation',
 'turn',
 'language',
 'toxic',
 'redacted',
 'state',
 'country',
 'hashed_ip',
 'turn_identifier']

In [41]:
# Collapse the exploded message rows back to one row per (conversation, turn_identifier),
# gathering that turn's message texts into a list.
# NOTE(review): collect_list does not guarantee element order after a shuffle, while the
# later prompt/response UDFs read position 0 as the prompt and position 1 as the response —
# confirm the order holds, or carry `role` through the explode and select on it.
spark_df = spark_df.groupBy(group_cols).agg(F.collect_list('content').alias('content'))

In [43]:
# Drop turn_identifier from the grouping keys for the later per-conversation grouping.
# The list is rebuilt instead of calling list.remove() so that the cell can be re-run:
# a second execution is a no-op rather than raising ValueError.
group_cols = [name for name in group_cols if name != 'turn_identifier']
group_cols

['conversation_hash',
 'model',
 'timestamp',
 'conversation',
 'turn',
 'language',
 'toxic',
 'redacted',
 'state',
 'country',
 'hashed_ip']

In [45]:
pip install langid

Note: you may need to restart the kernel to use updated packages.


In [47]:
import langid

In [None]:
# To keep only English text, classify each text with langid:
#   import langid
#   langid.classify(text)
# and then apply a Spark filter to drop interactions that are not in English.

In [49]:
@F.udf(BooleanType())
def languageCheck(list):
    """Return True when the first text in ``list`` is classified as English.

    Only element 0 is passed to ``langid.classify``; the confidence score it
    returns is ignored. A null or empty array yields False.
    """
    if not list:
        return False
    detected = langid.classify(list[0])[0]
    return detected == 'en'

In [51]:
# Mark the per-turn DataFrame for caching so the actions below can reuse it.
# The returned DataFrame is displayed as the cell output.
spark_df.persist()

DataFrame[conversation_hash: string, model: string, timestamp: timestamp, conversation: array<struct<content:string,country:string,hashed_ip:string,header:struct<accept-language:string,user-agent:string>,language:string,redacted:boolean,role:string,state:string,timestamp:timestamp,toxic:boolean,turn_identifier:bigint>>, turn: bigint, language: string, toxic: boolean, redacted: boolean, state: string, country: string, hashed_ip: string, turn_identifier: bigint, content: array<string>]

In [53]:
# Count the turns whose first message is classified as English.
# Note: the same languageCheck filter is applied again in the next cell to actually keep those rows.
spark_df.filter(languageCheck(F.col('content')) == True).count()

24/11/10 01:02:50 WARN MemoryStore: Not enough space to cache rdd_46_128 in memory! (computed 68.3 MiB so far)
24/11/10 01:02:50 WARN BlockManager: Persisting block rdd_46_128 to disk instead.
24/11/10 01:02:51 WARN MemoryStore: Not enough space to cache rdd_46_137 in memory! (computed 68.4 MiB so far)
24/11/10 01:02:54 WARN BlockManager: Persisting block rdd_46_137 to disk instead.
24/11/10 01:02:54 WARN MemoryStore: Not enough space to cache rdd_46_138 in memory! (computed 68.3 MiB so far)
24/11/10 01:02:54 WARN BlockManager: Persisting block rdd_46_138 to disk instead.
24/11/10 01:02:55 WARN MemoryStore: Not enough space to cache rdd_46_143 in memory! (computed 68.7 MiB so far)
24/11/10 01:02:55 WARN BlockManager: Persisting block rdd_46_143 to disk instead.
24/11/10 01:02:55 WARN MemoryStore: Not enough space to cache rdd_46_141 in memory! (computed 68.4 MiB so far)
24/11/10 01:02:55 WARN BlockManager: Persisting block rdd_46_141 to disk instead.
24/11/10 01:02:55 WARN MemoryStore:

924611

In [55]:
# Keep only the turns whose first message is classified as English.
spark_df = spark_df.where(languageCheck(F.col('content')))

In [57]:
# Combine all turns of a conversation into a single row.
# group_cols no longer contains turn_identifier, so the per-turn content lists are
# gathered into a list of lists and the turn identifiers into a parallel list.
spark_df = spark_df.groupBy(group_cols).agg(
    F.collect_list('content').alias('content'),
    F.collect_list('turn_identifier').alias('turn_identifier'),
)


In [59]:
# User-defined function that isolates the user prompt of every turn.
@F.udf(ArrayType(StringType()))
def get_user_prompt(list):
    """Return the first element of each inner list of ``list``.

    ``list`` is a list of per-turn lists of message texts. Inner lists that
    are empty contribute nothing; a null or empty outer list yields [].
    NOTE(review): treating position 0 as the user prompt assumes each
    per-turn list is ordered [prompt, response] — confirm upstream.
    """
    if not list:
        return []
    return [message for conversation in list for message in conversation[:1]]

In [61]:
# User-defined function that isolates the bot response of every turn.
@F.udf(ArrayType(StringType()))
def get_bot_response(list):
    """Return the second element of each inner list of ``list``.

    ``list`` is a list of per-turn lists of message texts. Inner lists with
    fewer than two elements contribute nothing, so the result can be shorter
    than the list of prompts; a null or empty outer list yields [].
    NOTE(review): treating position 1 as the bot response assumes each
    per-turn list is ordered [prompt, response] — confirm upstream.
    """
    if not list:
        return []
    return [message for conversation in list for message in conversation[1:2]]

In [63]:
# Split the nested content lists into one array of user prompts and one of bot responses.
spark_df = (
    spark_df
    .withColumn('userprompt', get_user_prompt(F.col("content")))
    .withColumn('botresp', get_bot_response(F.col("content")))
)

In [65]:
# Preview the regrouped rows with the new userprompt / botresp columns.
spark_df.show()

24/11/10 01:11:34 WARN MemoryStore: Not enough space to cache rdd_46_155 in memory! (computed 68.4 MiB so far)
24/11/10 01:11:46 WARN MemoryStore: Not enough space to cache rdd_46_166 in memory! (computed 68.2 MiB so far)
24/11/10 01:12:04 WARN MemoryStore: Not enough space to cache rdd_46_185 in memory! (computed 68.2 MiB so far)

+--------------------+------------------+-------------------+--------------------+----+--------+-----+--------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   conversation_hash|             model|          timestamp|        conversation|turn|language|toxic|redacted|               state|       country|           hashed_ip|             content|     turn_identifier|          userprompt|             botresp|
+--------------------+------------------+-------------------+--------------------+----+--------+-----+--------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|00235f623a3616d09...|gpt-3.5-turbo-0301|2023-06-20 19:33:18|[{preted you are ...|   1| English|false|   false|County of Osijek-...|       Croatia|b2a98e8c1f6355bb7...|[[preted you are ...|            [973016]|[preted you are m...|[I'm sorry

                                                                                

In [67]:
# Number of rows after regrouping the English turns by the conversation-level columns.
spark_df.count()

24/11/10 01:14:21 WARN MemoryStore: Not enough space to cache rdd_46_57 in memory! (computed 68.5 MiB so far)
24/11/10 01:14:27 WARN MemoryStore: Not enough space to cache rdd_46_64 in memory! (computed 68.6 MiB so far)
24/11/10 01:15:08 WARN MemoryStore: Not enough space to cache rdd_46_102 in memory! (computed 68.1 MiB so far)
24/11/10 01:15:23 WARN MemoryStore: Not enough space to cache rdd_46_115 in memory! (computed 68.2 MiB so far)
24/11/10 01:15:29 WARN MemoryStore: Not enough space to cache rdd_46_121 in memory! (computed 68.3 MiB so far)
24/11/10 01:15:35 WARN MemoryStore: Not enough space to cache rdd_46_127 in memory! (computed 68.1 MiB so far)
24/11/10 01:15:47 WARN MemoryStore: Not enough space to cache rdd_46_135 in memory! (computed 68.2 MiB so far)
24/11/10 01:15:55 WARN MemoryStore: Not enough space to cache rdd_46_141 in memory! (computed 68.4 MiB so far)
24/11/10 01:16:26 WARN MemoryStore: Not enough space to cache rdd_46_167 in memory! (computed 68.2 MiB so far)
24/

454931

In [69]:
# Schema after regrouping and adding the userprompt / botresp columns.
spark_df.printSchema()

root
 |-- conversation_hash: string (nullable = true)
 |-- model: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- conversation: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- content: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- hashed_ip: string (nullable = true)
 |    |    |-- header: struct (nullable = true)
 |    |    |    |-- accept-language: string (nullable = true)
 |    |    |    |-- user-agent: string (nullable = true)
 |    |    |-- language: string (nullable = true)
 |    |    |-- redacted: boolean (nullable = true)
 |    |    |-- role: string (nullable = true)
 |    |    |-- state: string (nullable = true)
 |    |    |-- timestamp: timestamp (nullable = true)
 |    |    |-- toxic: boolean (nullable = true)
 |    |    |-- turn_identifier: long (nullable = true)
 |-- turn: long (nullable = true)
 |-- language: string (nullable = true)
 |-- toxic: boolean (nullable = t

In [71]:
#There are 454931 records in which the user prompts are predominantly in English.

In [73]:
# Inspect one complete row, untruncated, to verify the nested structure.
spark_df.first()

24/11/10 01:27:45 WARN MemoryStore: Not enough space to cache rdd_46_34 in memory! (computed 68.2 MiB so far)
24/11/10 01:27:45 WARN MemoryStore: Not enough space to cache rdd_46_35 in memory! (computed 68.2 MiB so far)
24/11/10 01:28:00 WARN MemoryStore: Not enough space to cache rdd_46_46 in memory! (computed 68.1 MiB so far)
24/11/10 01:28:15 WARN MemoryStore: Not enough space to cache rdd_46_61 in memory! (computed 68.9 MiB so far)
24/11/10 01:28:28 WARN MemoryStore: Not enough space to cache rdd_46_71 in memory! (computed 68.3 MiB so far)
24/11/10 01:29:56 WARN MemoryStore: Not enough space to cache rdd_46_141 in memory! (computed 68.4 MiB so far)
                                                                                

Row(conversation_hash='00235f623a3616d09f1c240b717659d5', model='gpt-3.5-turbo-0301', timestamp=datetime.datetime(2023, 6, 20, 19, 33, 18), conversation=[Row(content='preted you are my gf. female 23', country='Croatia', hashed_ip='b2a98e8c1f6355bb73555354e4210fc50f573d0ed7e8dee40195894890d47699', header=Row(accept-language='en-US,en;q=0.9', user-agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'), language='English', redacted=False, role='user', state='County of Osijek-Baranja', timestamp=None, toxic=False, turn_identifier=973016), Row(content="I'm sorry, I cannot pretend to be someone else as I am an artificial intelligence language model. However, I am here to assist you with any language-related tasks or inquiries you may have. Let me know how I can be of help.", country=None, hashed_ip=None, header=None, language='English', redacted=False, role='assistant', state=None, timestamp=datetime.datetime(2023, 6, 20, 19, 33, 18), t

In [75]:
# User-defined function that checks whether every user prompt is in English.
@F.udf(StringType())
def userPromptCheck(list):
    """Summarise the languages langid detects in a list of prompts.

    Returns "en" when every text is classified as English, otherwise the
    language code of the last non-English text in the list. A null or empty
    list yields "na".
    """
    if not list:
        return "na"
    detected = "en"
    for text in list:
        code = langid.classify(text)[0]
        detected = code if code != "en" else detected
    return detected

In [77]:
# List the distinct per-row prompt-language summaries; only 'en' is expected.
spark_df.withColumn('promptLanguage', userPromptCheck(F.col('userprompt'))).select('promptLanguage').distinct().collect()

24/11/10 01:51:24 WARN MemoryStore: Not enough space to cache rdd_46_81 in memory! (computed 68.6 MiB so far)
24/11/10 01:51:43 WARN MemoryStore: Not enough space to cache rdd_46_96 in memory! (computed 68.6 MiB so far)
24/11/10 01:52:16 WARN MemoryStore: Not enough space to cache rdd_46_125 in memory! (computed 68.4 MiB so far)
24/11/10 01:52:51 WARN MemoryStore: Not enough space to cache rdd_46_155 in memory! (computed 68.4 MiB so far)
                                                                                

[Row(promptLanguage='en')]

In [83]:
#The dataset now contains only English prompts, across 454,931 records.

In [81]:
#Now that the dataset is updated to match the requirements, the user prompts and bot responses will be processed further to reduce their 
#length and feed them as input to NLP models.

In [159]:
import nltk
from nltk.corpus import stopwords

In [161]:
from nltk.corpus import stopwords # importing to remove all stop words that are not significant from user prompt and bot responses 
import re   # Import regular expressions 

In [167]:
stop_words  = stopwords.words('english')
# Set copy of the stop words so membership tests inside the UDF are O(1).
_stop_word_set = frozenset(stop_words)

@F.udf(ArrayType(StringType()))
def preprocess(content):
    """Clean every text in ``content`` and return the cleaned texts in the same order.

    Each text is lower-cased, has whitespace runs collapsed to one space, is
    stripped of every character other than letters, digits, whitespace and
    ``$ # @ ? .``, and loses stop words of three characters or fewer (longer
    stop words are kept). A null or empty ``content`` yields [].
    """
    if not content:
        return []
    updated_content = []
    for text in content:
        text = text.lower()  # set the entire text to lower case
        # Collapse whitespace runs to a single space. The previous pattern
        # r'\s{2}+' with an empty replacement deleted whitespace in pairs, which
        # glued neighbouring words together ("a  b" -> "ab"), and it only
        # compiles on Python 3.11+ where '{2}+' is a possessive quantifier.
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters other than the ones listed in the brackets.
        text = re.sub(r'[^A-Za-z0-9\s$#@?.]', '', text)
        # Keep words longer than three characters, and short words that are not stop words.
        filtered_words = [
            word for word in text.split()
            if len(word) > 3 or word not in _stop_word_set
        ]
        updated_content.append(" ".join(filtered_words))
    return updated_content
            
            

In [165]:
# Sanity-check the cleaning logic on plain Python strings.
# `preprocess` is a Spark UDF wrapper that expects columns, so the underlying
# Python function is called through its `.func` attribute.
content = ["I     want to visit the city!     #excited", "This is a short test."]
processed_content = preprocess.func(content)
print(processed_content)

['want visit city #excited', 'this short test.']


In [175]:
# Dry run: apply preprocess to both columns and inspect one row before updating spark_df.
spark_df.withColumn('userprompt_up', preprocess(F.col('userprompt'))).withColumn('botresp_up', preprocess(F.col('botresp'))).first()

24/11/10 03:03:24 WARN MemoryStore: Not enough space to cache rdd_46_34 in memory! (computed 68.2 MiB so far)
24/11/10 03:03:37 WARN MemoryStore: Not enough space to cache rdd_46_44 in memory! (computed 68.3 MiB so far)
24/11/10 03:03:39 WARN MemoryStore: Not enough space to cache rdd_46_47 in memory! (computed 68.8 MiB so far)
24/11/10 03:04:17 WARN MemoryStore: Not enough space to cache rdd_46_81 in memory! (computed 68.6 MiB so far)
24/11/10 03:04:45 WARN MemoryStore: Not enough space to cache rdd_46_103 in memory! (computed 68.4 MiB so far)
24/11/10 03:04:45 WARN MemoryStore: Not enough space to cache rdd_46_104 in memory! (computed 68.5 MiB so far)
24/11/10 03:05:56 WARN MemoryStore: Not enough space to cache rdd_46_167 in memory! (computed 68.2 MiB so far)
24/11/10 03:06:04 WARN MemoryStore: Not enough space to cache rdd_46_176 in memory! (computed 68.5 MiB so far)
24/11/10 03:06:17 WARN MemoryStore: Not enough space to cache rdd_46_187 in memory! (computed 68.1 MiB so far)
     

Row(conversation_hash='00235f623a3616d09f1c240b717659d5', model='gpt-3.5-turbo-0301', timestamp=datetime.datetime(2023, 6, 20, 19, 33, 18), conversation=[Row(content='preted you are my gf. female 23', country='Croatia', hashed_ip='b2a98e8c1f6355bb73555354e4210fc50f573d0ed7e8dee40195894890d47699', header=Row(accept-language='en-US,en;q=0.9', user-agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'), language='English', redacted=False, role='user', state='County of Osijek-Baranja', timestamp=None, toxic=False, turn_identifier=973016), Row(content="I'm sorry, I cannot pretend to be someone else as I am an artificial intelligence language model. However, I am here to assist you with any language-related tasks or inquiries you may have. Let me know how I can be of help.", country=None, hashed_ip=None, header=None, language='English', redacted=False, role='assistant', state=None, timestamp=datetime.datetime(2023, 6, 20, 19, 33, 18), t

In [177]:
# Add the cleaned versions of the user prompts and bot responses as new columns.
spark_df = (
    spark_df
    .withColumn('userprompt_up', preprocess(F.col('userprompt')))
    .withColumn('botresp_up', preprocess(F.col('botresp')))
)

In [179]:
# Preview the rows with the cleaned userprompt_up / botresp_up columns.
spark_df.show()

24/11/10 03:09:54 WARN MemoryStore: Not enough space to cache rdd_46_52 in memory! (computed 68.2 MiB so far)
24/11/10 03:10:21 WARN MemoryStore: Not enough space to cache rdd_46_73 in memory! (computed 68.9 MiB so far)
24/11/10 03:10:45 WARN MemoryStore: Not enough space to cache rdd_46_93 in memory! (computed 68.2 MiB so far)
24/11/10 03:11:10 WARN MemoryStore: Not enough space to cache rdd_46_115 in memory! (computed 68.2 MiB so far)
24/11/10 03:12:26 WARN MemoryStore: Not enough space to cache rdd_46_179 in memory! (computed 68.2 MiB so far)
24/11/10 03:12:37 WARN MemoryStore: Not enough space to cache rdd_46_190 in memory! (computed 68.1 MiB so far)

+--------------------+------------------+-------------------+--------------------+----+--------+-----+--------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   conversation_hash|             model|          timestamp|        conversation|turn|language|toxic|redacted|               state|       country|           hashed_ip|             content|     turn_identifier|          userprompt|             botresp|       userprompt_up|          botresp_up|
+--------------------+------------------+-------------------+--------------------+----+--------+-----+--------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|00235f623a3616d09...|gpt-3.5-turbo-0301|2023-06-20 19:33:18|[{preted you are ...|   1| English|false|   false|Coun

                                                                                

In [181]:
# Row count after adding the cleaned columns.
spark_df.count()

24/11/10 03:13:32 WARN MemoryStore: Not enough space to cache rdd_46_28 in memory! (computed 68.1 MiB so far)
24/11/10 03:13:51 WARN MemoryStore: Not enough space to cache rdd_46_44 in memory! (computed 68.3 MiB so far)
24/11/10 03:13:54 WARN MemoryStore: Not enough space to cache rdd_46_49 in memory! (computed 68.4 MiB so far)
24/11/10 03:14:20 WARN MemoryStore: Not enough space to cache rdd_46_66 in memory! (computed 68.9 MiB so far)
24/11/10 03:14:20 WARN MemoryStore: Not enough space to cache rdd_46_67 in memory! (computed 68.3 MiB so far)
24/11/10 03:14:49 WARN MemoryStore: Not enough space to cache rdd_46_93 in memory! (computed 68.2 MiB so far)
24/11/10 03:15:23 WARN MemoryStore: Not enough space to cache rdd_46_121 in memory! (computed 68.3 MiB so far)
24/11/10 03:15:24 WARN MemoryStore: Not enough space to cache rdd_46_123 in memory! (computed 68.5 MiB so far)
24/11/10 03:15:39 WARN MemoryStore: Not enough space to cache rdd_46_138 in memory! (computed 68.3 MiB so far)
24/11/1

454931

In [183]:
# Destination directory for the processed parquet files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
output_directory = "/Users/sharan/Desktop/EnglishChats1"

In [185]:
# Write the processed dataset as at most 25 parquet files.
# 'overwrite' keeps this step idempotent: with 'append', every re-run of the notebook
# added a second full copy of the data to output_directory.
# Any existing contents of output_directory are replaced.
spark_df.coalesce(25).write.mode('overwrite').parquet(output_directory)

24/11/10 03:17:46 WARN MemoryStore: Not enough space to cache rdd_46_25 in memory! (computed 68.3 MiB so far)
24/11/10 03:17:58 WARN MemoryStore: Not enough space to cache rdd_46_34 in memory! (computed 68.2 MiB so far)
24/11/10 03:18:53 WARN MemoryStore: Not enough space to cache rdd_46_72 in memory! (computed 68.2 MiB so far)
24/11/10 03:18:53 WARN MemoryStore: Not enough space to cache rdd_46_73 in memory! (computed 68.9 MiB so far)
24/11/10 03:18:57 WARN MemoryStore: Not enough space to cache rdd_46_75 in memory! (computed 68.7 MiB so far)
24/11/10 03:19:10 WARN MemoryStore: Not enough space to cache rdd_46_86 in memory! (computed 69.3 MiB so far)
24/11/10 03:19:23 WARN MemoryStore: Not enough space to cache rdd_46_97 in memory! (computed 68.5 MiB so far)
24/11/10 03:19:57 WARN MemoryStore: Not enough space to cache rdd_46_121 in memory! (computed 68.3 MiB so far)
24/11/10 03:19:59 WARN MemoryStore: Not enough space to cache rdd_46_122 in memory! (computed 68.1 MiB so far)
24/11/10

In [189]:
# Show two cleaned prompt arrays in full (no column truncation).
spark_df.select('userprompt_up').limit(2).show(truncate=False)

24/11/10 03:34:47 WARN MemoryStore: Not enough space to cache rdd_46_157 in memory! (computed 68.4 MiB so far)
24/11/10 03:34:58 WARN MemoryStore: Not enough space to cache rdd_46_164 in memory! (computed 68.6 MiB so far)
24/11/10 03:35:10 WARN MemoryStore: Not enough space to cache rdd_46_174 in memory! (computed 68.6 MiB so far)
24/11/10 03:35:32 WARN MemoryStore: Not enough space to cache rdd_46_191 in memory! (computed 68.3 MiB so far)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userprompt_up                                                                    

                                                                                

In [187]:
#This completes Pre-Processing of the Dataset