# Parse Whatsapp Chats

In [113]:
import re

class WhatsappChatParser:
    def __init__(self):
        pass


    def convert_chat_to_json(self,text,names=None):
            if names:
                if any([f"{x}:" in text for x in names]):
                    parts = text.split(':',1)
                    return {
                        'speaker': parts[0],
                        'text':parts[1]
                    }
                else:
                    return {"speaker":"NA",'text':''}
            else:
                parts = text.split(':',1)
                return {
                    'speaker': parts[0],
                    'text':parts[1]
                    }

    def parse_whatsapp_chat_dump(self, whatsapp_chat_text,names = None):
        dt_pattern =  r"\d{2}/\d{2}/\d{4},\s\d{1,2}:\d{2}\s[ap]m\s\-\s"

        whatsapp_texts_list = re.split(dt_pattern, whatsapp_chat_text)[2:]

        whatsapp_texts_list = [re.sub(dt_pattern,'',x) for x in whatsapp_texts_list]

        # names = ["Atharva Choudhary","Shwetal"]

        whatsapp_chat_data = [ self.convert_chat_to_json(x,names = names) for x in whatsapp_texts_list]

        return whatsapp_chat_data
    
    def parse_instruction_fine_tune(self,whatsapp_chat_text,names,to_pick_name, instruction = ''):
        whatsapp_data_list = self.parse_whatsapp_chat_dump(whatsapp_chat_text=whatsapp_chat_text, names = names)[1:]
        chat_dataset = []
        train_format = """<|system|>
{instruction}</s>
<|user|>
{input}</s>
<|assistant|>
{output}"""

        previous_end = 0
        response_start = None
        for i in range(len(whatsapp_data_list)):
            # print(f"Vars:\ni:{i}\tPrevious End:{previous_end}\tResponse Start:{response_start}\tResponse End:{response_end}.\n---------\n")
            text_obj = whatsapp_data_list[i]

            # Found Character text
            if text_obj['speaker'] == to_pick_name:
            
                if i < len(whatsapp_data_list) - 1 and whatsapp_data_list[i+1]['speaker'] == to_pick_name:
                    # If chain not started, set the response_start and current respnose_end
                    if not response_start: 
                        # print(f"Response Started at position {i}")
                        response_start = i
                        response_end = i
                    # If Chain already started, update end and move ahead
                    else:
                        response_end = i


                # If next message is by someone else:
                else:
                    # print(f"Found Someone Else's message at {i+1}")
                    if not response_start:
                        response_start = i
                    response_end = i
                    # Compile sender's texts from previous_end till start.
                    # if previous_end and response_start and response_end:
                    print(f"i- {i} , Saving Data from Other:{previous_end}-{response_start}, Atharva:{response_start}-{response_end}")
                    input_msgs = len(whatsapp_data_list[previous_end+1:response_start]) 
                    output_msgs = len(whatsapp_data_list[response_start: response_end + 1])
                    if input_msgs > 20 or output_msgs > 20:
                        print(f"Long Chain FOund: {input_msgs},{output_msgs}")
                    input_text = "\n".join([f"{x['speaker']}:{x['text']}" for x in whatsapp_data_list[previous_end+1:response_start] if x['speaker'] != to_pick_name])
                    # Compile character's texts from response_start till response_end
                    response = "\n".join([f"{x['speaker']}:{x['text']}" for x in whatsapp_data_list[response_start: response_end + 1] if x['speaker'] == to_pick_name])
                    # Prepare datapoint
                    train_text = train_format.format(instruction = instruction,input=input_text, output=response)
                    if len(train_text) < 100000:
                        chat_dataset.append(
                            {
                                'input':input_text,
                                'output':response,
                                'text':train_text
                            }
                        )
                    
                    else:
                        print("Message too long (error in code)")
                    #  Set i to after response end, refresh response_start and previous_end vars
                    response_start = None
                    previous_end = response_end

            # If message is by someone else, simply move ahead          
            else:
                pass
                    
    
        return chat_dataset 


# Prepare Fine-Tune Data from Whatsapp Dump

In [28]:
whatsapp_chat_path = "/home/atharva/Downloads/whatsapp_chat.txt"

with open(whatsapp_chat_path,'r') as f:
    whatsapp_chat_text = f.read()


[{'speaker': 'Atharva Choudhary', 'text': ' Hey !👋🏻\n'},
 {'speaker': 'Shwetal', 'text': ' Hey\n'},
 {'speaker': 'Shwetal',
  'text': ' location: https://maps.google.com/?q=28.5753593,77.3905211\n'},
 {'speaker': 'Shwetal', 'text': ' 1402 CG 2 SUPERTECH CAPETOWN\n'},
 {'speaker': 'Shwetal', 'text': ' Sector 74\n'}]

In [108]:
whatsapp_chats[0]

{'speaker': 'Shwetal', 'text': ' Hey\n'}

In [112]:
whatsapp_chats[6:10]

[{'speaker': 'Atharva Choudhary',
  'text': " yhn se around 40-50 mins me I'll be there\n"},
 {'speaker': 'Shwetal',
  'text': ' Okay.  You coming to my building first,  right? CG 2\n'},
 {'speaker': 'Atharva Choudhary', 'text': ' Ah okay\n'},
 {'speaker': 'Shwetal', 'text': ' Yes.  Gate see building door h\n'}]

In [114]:
parser = WhatsappChatParser()

whatsapp_chats = parser.parse_whatsapp_chat_dump(whatsapp_chat_text=whatsapp_chat_text, names = ['Atharva Choudhary','Shwetal'])[1:]
print("FOund Chats:",len(whatsapp_chats))
instruction = "You are Atharva Choudhary, a 25 year-old man. You are a brilliant data scientist, musician, and in-touch with yourself. Now answer this message accordingly."
whatsapp_data = parser.parse_instruction_fine_tune(whatsapp_chat_text=whatsapp_chat_text, names = ['Atharva Choudhary','Shwetal'], to_pick_name="Atharva Choudhary", instruction=instruction)[:-2]

FOund Chats: 9271
i- 6 , Saving Data from Other:0-5, Atharva:5-6
i- 8 , Saving Data from Other:6-8, Atharva:8-8
i- 12 , Saving Data from Other:8-11, Atharva:11-12
i- 19 , Saving Data from Other:12-14, Atharva:14-19
i- 23 , Saving Data from Other:19-23, Atharva:23-23
i- 27 , Saving Data from Other:23-26, Atharva:26-27
i- 35 , Saving Data from Other:27-30, Atharva:30-35
i- 37 , Saving Data from Other:35-37, Atharva:37-37
i- 39 , Saving Data from Other:37-39, Atharva:39-39
i- 44 , Saving Data from Other:39-43, Atharva:43-44
i- 47 , Saving Data from Other:44-46, Atharva:46-47
i- 51 , Saving Data from Other:47-50, Atharva:50-51
i- 53 , Saving Data from Other:51-53, Atharva:53-53
i- 58 , Saving Data from Other:53-57, Atharva:57-58
i- 63 , Saving Data from Other:58-62, Atharva:62-63
i- 66 , Saving Data from Other:63-65, Atharva:65-66
i- 70 , Saving Data from Other:66-69, Atharva:69-70
i- 72 , Saving Data from Other:70-72, Atharva:72-72
i- 75 , Saving Data from Other:72-75, Atharva:75-75
i- 77

In [115]:
import json

with open("message_data.json",'w+') as f:
    f.write(json.dumps(whatsapp_data))

# Convert Chats to Data Format

Zephyr LLM Format:
```
<|system|>
You are a pirate chatbot who always responds with Arr!</s>
<|user|>
There's a llama on my lawn, how can I get rid of him?</s>
<|assistant|>
```

In [47]:
len(chat_dataset)

4404

[{'input': ' Hey\n\n location: https://maps.google.com/?q=28.5753593,77.3905211\n\n 1402 CG 2 SUPERTECH CAPETOWN\n\n Sector 74\n\n Mujhe mere ghar ka ETA bata dena\n',
  'output': ' Meeting done haha\n'},
 {'input': '', 'output': " yhn se around 40-50 mins me I'll be there\n"},
 {'input': ' Okay.  You coming to my building first,  right? CG 2\n',
  'output': ' Ah okay\n'},
 {'input': ' Yes.  Gate see building door h\n\n 🙈\n',
  'output': " I'll ask the guy to park right in front of CG2 hehe\n"},
 {'input': '', 'output': ' https://maps.app.goo.gl/pLu8DY4xiTWQU7CZ6\n'},
 {'input': ' Location?\n', 'output': ' My location?\n'},
 {'input': '',
  'output': ' Shri Balaji Residency, Ahinsa Khand 2, Indirapuram\n'},
 {'input': '',
  'output': ' location: https://maps.google.com/?q=28.6427317,77.3825747\n'},
 {'input': '', 'output': ' Eta batana apna zara\n'},
 {'input': '', 'output': ' also coffee ya tea kya legi aake\n'},
 {'input': '', 'output': ' <Media omitted>\n'},
 {'input': ' +9188608313

['Atharva Choudhary', 'Shwetal']

In [25]:
whatsapp_texts_list[:5]

['Atharva Choudhary: Hey !👋🏻\n',
 'Shwetal: Hey\n',
 'Shwetal: location: https://maps.google.com/?q=28.5753593,77.3905211\n',
 'Shwetal: 1402 CG 2 SUPERTECH CAPETOWN\n',
 'Shwetal: Sector 74\n']

In [23]:
whatsapp_data = []
for text in whatsapp_texts_list:
    

9272

In [18]:
len(set(assumed_names))

2

In [11]:
print(whatsapp_chat_text[:5000])

30/06/2023, 12:02 am - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
30/06/2023, 5:35 pm - Atharva Choudhary: Hey !👋🏻
30/06/2023, 5:44 pm - Shwetal: Hey
30/06/2023, 5:45 pm - Shwetal: location: https://maps.google.com/?q=28.5753593,77.3905211
30/06/2023, 5:45 pm - Shwetal: 1402 CG 2 SUPERTECH CAPETOWN
30/06/2023, 5:45 pm - Shwetal: Sector 74
30/06/2023, 5:46 pm - Shwetal: Mujhe mere ghar ka ETA bata dena
30/06/2023, 5:55 pm - Atharva Choudhary: Meeting done haha
30/06/2023, 5:56 pm - Atharva Choudhary: yhn se around 40-50 mins me I'll be there
30/06/2023, 5:57 pm - Shwetal: Okay.  You coming to my building first,  right? CG 2
30/06/2023, 6:00 pm - Atharva Choudhary: Ah okay
30/06/2023, 6:01 pm - Shwetal: Yes.  Gate see building door h
30/06/2023, 6:01 pm - Shwetal: 🙈
30/06/2023, 6:29 pm - Atharva Choudhary: I'll ask the guy to park right in front of CG2 hehe
30/06/2023, 9:43 pm - Atharva Choud