In [12]:
import re
import pandas as pd
from datetime import datetime
from zoneinfo import ZoneInfo


In [13]:
# Step 1: Load WhatsApp chat text file
with open("thane_board_gamers_chat.txt", "r", encoding="utf-8") as file:
    chat_text = file.read()

In [14]:
# Step 2: Regex pattern (handles AM/PM with optional narrow no-break space)
pattern = r'^\[(\d{2}/\d{2}/\d{2}), (\d{1,2}:\d{2}:\d{2})\s?\u202f?(AM|PM)\] (.*?): (.*)'

In [15]:
# Step 3: System keywords to filter
system_keywords = [
    "created this group", "added", "removed", "changed the subject",
    "changed this group's icon", "joined using this group's invite link",
    "Messages and calls are end-to-end encrypted"
]

def is_system_message(text):
    return any(keyword.lower() in text.lower() for keyword in system_keywords)

In [16]:
# Step 4: Extract messages
messages = []

for match in re.finditer(pattern, chat_text, re.MULTILINE):
    date_str, time_str, am_pm, sender, message = match.groups()

    if is_system_message(message):
        continue

    # Combine and parse datetime in IST
    dt_str = f"{date_str} {time_str} {am_pm}"
    try:
        dt_obj = datetime.strptime(dt_str, "%y/%m/%d %I:%M:%S %p")
        dt_obj = dt_obj.replace(tzinfo=ZoneInfo("Asia/Kolkata"))  # add IST timezone
        dt_utc = dt_obj.astimezone(ZoneInfo("UTC"))  # convert to UTC
    except Exception as e:
        print(f"‚ùå Failed to parse datetime: {dt_str} | Error: {e}")
        continue

    messages.append({
        "datetime_ist": dt_obj.isoformat(),
        "datetime_utc": dt_utc.isoformat(),
        "sender": sender,
        "message": message
    })

In [17]:
# Step 5: Create DataFrame
df = pd.DataFrame(messages)

In [None]:
# Step 6: Save to CSV
df.to_csv("whatsapp_messages_normalized.csv", index=False, encoding="utf-8-sig")

In [19]:
# Step 7: Preview
print(df.head())

                datetime_ist               datetime_utc  \
0  2021-02-25T09:17:17+05:30  2021-02-25T03:47:17+00:00   
1  2021-02-25T20:54:40+05:30  2021-02-25T15:24:40+00:00   
2  2021-02-25T21:03:17+05:30  2021-02-25T15:33:17+00:00   
3  2021-02-25T21:07:39+05:30  2021-02-25T15:37:39+00:00   
4  2022-02-25T02:07:04+05:30  2022-02-24T20:37:04+00:00   

                    sender                                            message  
0       ~‚ÄØAadiitya Agarwal  https://www.instagram.com/reel/DC1yS8yRwYf/?ig...  
1      ~‚ÄØMustafa Anandwala  https://youtube.com/shorts/xVbU-6TUEmU?si=SP0_...  
2          ~‚ÄØPrithvi Kumar                Title made me clear browser history  
3  Sanket Reddit Badminton                                                  üòÇ  
4        ‚Ä™+91¬†93255¬†50855‚Ä¨  Some one posted that game list...can you pleas...  
