# Parse Whatsapp Chats

In [63]:
import re

class WhatsappChatParser:
    def __init__(self):
        pass


    def convert_chat_to_json(self,text,names=None):
            if names:
                if any([f"{x}:" in text for x in names]):
                    parts = text.split(':',1)
                    return {
                        'speaker': parts[0],
                        'text':parts[1]
                    }
                else:
                    return {"speaker":"NA",'text':''}
            else:
                parts = text.split(':',1)
                return {
                    'speaker': parts[0],
                    'text':parts[1]
                    }

    def parse_whatsapp_chat_dump(self, whatsapp_chat_text,names = None):
        dt_pattern =  r"\d{2}/\d{2}/\d{4},\s\d{1,2}:\d{2}\s[ap]m\s\-\s"

        whatsapp_texts_list = re.split(dt_pattern, whatsapp_chat_text)[2:]

        whatsapp_texts_list = [re.sub(dt_pattern,'',x) for x in whatsapp_texts_list]

        # names = ["Atharva Choudhary","Shwetal"]

        whatsapp_chat_data = [ self.convert_chat_to_json(x,names = names) for x in whatsapp_texts_list]

        return whatsapp_chat_data
    
    def parse_instruction_fine_tune(self,whatsapp_chat_text,names,to_pick_name, instruction = ''):
        whatsapp_data = self.parse_whatsapp_chat_dump(whatsapp_chat_text=whatsapp_chat_text, names = names)
        last_position = -1
        chat_dataset = []
        train_format = """<|system|>
{instruction}</s>
<|user|>
{input}</s>
<|assistant|>
{output}"""
        for i,text_obj in enumerate(whatsapp_data):
            if text_obj['speaker'] == to_pick_name:
            
                if last_position == -1:
                    last_position = i
                    pass
                else:
                    input_text = f"{whatsapp_data[i-1]['speaker']}:" + "\n".join([x['text'] for x in whatsapp_data[last_position+1:i] if x['speaker'] != to_pick_name])
                    response = text_obj['text']
                    
                    last_position = i
                    
                    train_text = train_format.format(instruction = instruction,input=input_text, output=response)

                    chat_dataset.append(
                        {
                            'input':input_text,
                            'output':response,
                            'text':train_text
                        }
                    )
        
        
        return chat_dataset 


# Prepare Fine-Tune Data from Whatsapp Dump

In [58]:
whatsapp_chat_path = "/home/atharva/Downloads/whatsapp_chat.txt"

with open(whatsapp_chat_path,'r') as f:
    whatsapp_chat_text = f.read()


In [67]:
parser = WhatsappChatParser()


instruction = "You are Atharva Choudhary, a 25 year-old man. You are a brilliant data scientist, musician, and in-touch with yourself. Now answer this message accordingly."
whatsapp_data = parser.parse_instruction_fine_tune(whatsapp_chat_text=whatsapp_chat_text, names = ['Atharva Choudhary','Shwetal'], to_pick_name="Atharva Choudhary", instruction=instruction)

In [68]:
whatsapp_data[:3]

[{'input': 'Shwetal: Hey\n\n location: https://maps.google.com/?q=28.5753593,77.3905211\n\n 1402 CG 2 SUPERTECH CAPETOWN\n\n Sector 74\n\n Mujhe mere ghar ka ETA bata dena\n',
  'output': ' Meeting done haha\n',
  'text': '<|system|>\nYou are Atharva Choudhary, a 25 year-old man. You are a brilliant data scientist, musician, and in-touch with yourself. Now answer this message accordingly.</s>\n<|user|>\nShwetal: Hey\n\n location: https://maps.google.com/?q=28.5753593,77.3905211\n\n 1402 CG 2 SUPERTECH CAPETOWN\n\n Sector 74\n\n Mujhe mere ghar ka ETA bata dena\n</s>\n<|assistant|>\n Meeting done haha\n'},
 {'input': 'Atharva Choudhary:',
  'output': " yhn se around 40-50 mins me I'll be there\n",
  'text': "<|system|>\nYou are Atharva Choudhary, a 25 year-old man. You are a brilliant data scientist, musician, and in-touch with yourself. Now answer this message accordingly.</s>\n<|user|>\nAtharva Choudhary:</s>\n<|assistant|>\n yhn se around 40-50 mins me I'll be there\n"},
 {'input': '

# Convert Chats to Data Format

Zephyr LLM Format:
```
<|system|>
You are a pirate chatbot who always responds with Arr!</s>
<|user|>
There's a llama on my lawn, how can I get rid of him?</s>
<|assistant|>
```

In [47]:
len(chat_dataset)

4404

[{'input': ' Hey\n\n location: https://maps.google.com/?q=28.5753593,77.3905211\n\n 1402 CG 2 SUPERTECH CAPETOWN\n\n Sector 74\n\n Mujhe mere ghar ka ETA bata dena\n',
  'output': ' Meeting done haha\n'},
 {'input': '', 'output': " yhn se around 40-50 mins me I'll be there\n"},
 {'input': ' Okay.  You coming to my building first,  right? CG 2\n',
  'output': ' Ah okay\n'},
 {'input': ' Yes.  Gate see building door h\n\n 🙈\n',
  'output': " I'll ask the guy to park right in front of CG2 hehe\n"},
 {'input': '', 'output': ' https://maps.app.goo.gl/pLu8DY4xiTWQU7CZ6\n'},
 {'input': ' Location?\n', 'output': ' My location?\n'},
 {'input': '',
  'output': ' Shri Balaji Residency, Ahinsa Khand 2, Indirapuram\n'},
 {'input': '',
  'output': ' location: https://maps.google.com/?q=28.6427317,77.3825747\n'},
 {'input': '', 'output': ' Eta batana apna zara\n'},
 {'input': '', 'output': ' also coffee ya tea kya legi aake\n'},
 {'input': '', 'output': ' <Media omitted>\n'},
 {'input': ' +9188608313

['Atharva Choudhary', 'Shwetal']

In [25]:
whatsapp_texts_list[:5]

['Atharva Choudhary: Hey !👋🏻\n',
 'Shwetal: Hey\n',
 'Shwetal: location: https://maps.google.com/?q=28.5753593,77.3905211\n',
 'Shwetal: 1402 CG 2 SUPERTECH CAPETOWN\n',
 'Shwetal: Sector 74\n']

In [23]:
whatsapp_data = []
for text in whatsapp_texts_list:
    

9272

In [18]:
len(set(assumed_names))

2

In [11]:
print(whatsapp_chat_text[:5000])

30/06/2023, 12:02 am - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
30/06/2023, 5:35 pm - Atharva Choudhary: Hey !👋🏻
30/06/2023, 5:44 pm - Shwetal: Hey
30/06/2023, 5:45 pm - Shwetal: location: https://maps.google.com/?q=28.5753593,77.3905211
30/06/2023, 5:45 pm - Shwetal: 1402 CG 2 SUPERTECH CAPETOWN
30/06/2023, 5:45 pm - Shwetal: Sector 74
30/06/2023, 5:46 pm - Shwetal: Mujhe mere ghar ka ETA bata dena
30/06/2023, 5:55 pm - Atharva Choudhary: Meeting done haha
30/06/2023, 5:56 pm - Atharva Choudhary: yhn se around 40-50 mins me I'll be there
30/06/2023, 5:57 pm - Shwetal: Okay.  You coming to my building first,  right? CG 2
30/06/2023, 6:00 pm - Atharva Choudhary: Ah okay
30/06/2023, 6:01 pm - Shwetal: Yes.  Gate see building door h
30/06/2023, 6:01 pm - Shwetal: 🙈
30/06/2023, 6:29 pm - Atharva Choudhary: I'll ask the guy to park right in front of CG2 hehe
30/06/2023, 9:43 pm - Atharva Choud