<a href="https://colab.research.google.com/github/rabiyulfahimhasim786/whatsapp-text2csv/blob/main/whatsapptext2csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Whatsapptext to csv**

1- The format of the dates don't always have two digits for days and months, but it always has two digits for years. I adjusted the regex to reflect it:

r'^(\d+/\d+/\d\d.*?)(?=^^\d+/\d+/\d\d,*?)'

2- The end of the datatime field has either AM or PM in capital letters:

s = re.search('M - (.*?):', row).group(1)

3 - The datetime format is actually month/day/year:

df['timestamp'] = pd.to_datetime(df.timestamp, format='%m/%d/%y, %I:%M %p')

In [1]:

import pandas as pd
import re

def parse_file(FULL_PATH):
    '''Convert WhatsApp chat log text file to a Pandas dataframe.'''

    # some regex to account for messages taking up multiple lines
    pat = re.compile(r'^(\d+\/\d+\/\d\d.*?)(?=^^\d+\/\d+\/\d\d\,\*?)', re.S | re.M)
    with open(FULL_PATH, encoding = 'utf8') as raw:
        data = [m.group(1).strip().replace('\n', ' ') for m in pat.finditer(raw.read())]
    
    sender = []; message = []; datetime = []
    for row in data:

        # timestamp is before the first dash
        datetime.append(row.split(' - ')[0])

        # sender is between am/pm, dash and colon
        try:
            s = re.search('M - (.*?):', row).group(1)
            sender.append(s)
        except:
            sender.append('')

        # message content is after the first colon
        try:
            message.append(row.split(': ', 1)[1])
        except:
            message.append('')

    df = pd.DataFrame(zip(datetime, sender, message), columns=['timestamp', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df.timestamp, format='%m/%d/%y, %I:%M %p')

    # remove events not associated with a sender
    df = df[df.sender != ''].reset_index(drop=True)

    return df

df = parse_file('whatsapp.txt')

In [4]:
df.head(5)

Unnamed: 0,timestamp,sender,message
0,2023-02-15 01:05:00,+91 79930 21302,Given below 7 watsapp group kindly joined 👍 ...
1,2023-02-26 02:53:00,+91 79930 21302,Given below 7 watsapp group kindly joined 👍 ...
2,2023-03-07 01:19:00,+91 79930 21302,"𝐈𝐓 𝐄𝐱𝐩𝐞𝐫𝐢𝐞𝐧𝐜𝐞 𝐃𝐨𝐜𝐮𝐦𝐞𝐧𝐭𝐬, *FORM-16,* 𝐏𝐅 & *Al..."
3,2023-03-20 14:42:00,+91 79930 21302,"𝐈𝐓 𝐄𝐱𝐩𝐞𝐫𝐢𝐞𝐧𝐜𝐞 𝐃𝐨𝐜𝐮𝐦𝐞𝐧𝐭𝐬, *FORM-16,* 𝐏𝐅 & *Al..."
4,2023-03-21 01:59:00,+91 79930 21302,<Media omitted>


In [5]:
# To clean and convert a whatsapp txt file export to a CSV file

import pandas as pd

# read file by lines
file_path = "whatsapp.txt"
f = open(file_path, 'r')
data = f.readlines()
f.close()

# sanity stats
print('num lines: %s' %(len(data)))

# parse text and create list of lists structure
# remove first whatsapp info message
dataset = data[1:]
cleaned_data = []
for line in dataset:
	# grab the info and cut it out
	date = line.split(",")[0]
	line2 = line[len(date):]
	time = line2.split("-")[0][2:]
	line3 = line2[len(time):]
	name = line3.split(":")[0][4:]
	line4 = line3[len(name):]
	message = line4[6:-1] # strip newline charactor

	#print(date, time, name, message)
	cleaned_data.append([date, time, name, message])

  
# Create the DataFrame 
df = pd.DataFrame(cleaned_data, columns = ['Date', 'Time', 'Name', 'Message']) 

# check formatting 
if 0:
	print(df.head())
	print(df.tail())

# Save it!
df.to_csv('converted_messages.csv', index=False)



num lines: 225


In [6]:
df.head(5)

Unnamed: 0,Date,Time,Name,Message
0,2/15/23,1:04 AM,+91 79810 42322 joined using this group's invi...,
1,2/15/23,1:05 AM,+91 79930 21302,Given below 7 watsapp group kindly joined 👍
2,\n,,,
3,\n,,,
4,Oracle & SAP Instance access available below...,,,
