In [6]:
import pandas as pd
import datetime
import numpy
import re

In [7]:
# Prefix that opens every new message in an Android WhatsApp export,
# e.g. "2/12/20, 6:32 PM -".  Written as a raw string so the regex contains no
# invalid Python escape sequences (the old '\/' triggers a SyntaxWarning on
# Python 3.12+).  [0-9] is kept instead of \d so only ASCII digits match.
_MESSAGE_START = re.compile(
    r'^([0-9]+)/([0-9]+)/([0-9][0-9]), ([0-9]+):([0-9][0-9]) (AM|PM) -'
)


def startsWithDateAndTime(s):
    """Return True if ``s`` begins with a date/time message prefix.

    The expected prefix is ``<num>/<num>/<2-digit year>, <h>:<mm> <AM|PM> -``.
    This is the Android export layout; per the original author's note the iOS
    export format is different and is not handled here.

    Parameters
    ----------
    s : str
        One line of the exported chat.

    Returns
    -------
    bool
        True when the line starts a new message, False otherwise (including
        for the empty string and for continuation lines of a multi-line
        message).
    """
    return bool(_MESSAGE_START.match(s))
  
# Recognised "author:" prefixes.  Raw strings keep \w, \s and \d as regex
# escapes instead of invalid Python string escapes (SyntaxWarning on 3.12+);
# the re module itself understands \uXXXX / \UXXXXXXXX inside the character
# class of the last entry.
_AUTHOR_PREFIXES = [
    r'([\w]+):',                        # First Name
    r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
    r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
    r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
    r'([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
    r'([\w]+)[\u263a-\U0001f999]+:',    # Name and Emoji
]
# The alternation is wrapped in a non-capturing group so the '^' anchor
# applies to every alternative, not only the first one.
_AUTHOR_PREFIX = re.compile('^(?:' + '|'.join(_AUTHOR_PREFIXES) + ')')


def FindAuthor(s):
    """Report whether ``s`` starts with a recognised ``author:`` prefix.

    Despite its name, this function does not return the author; it only
    tells the caller whether one of the known name / phone-number layouts,
    followed by a colon, appears at the start of the text.

    Parameters
    ----------
    s : str
        Message text with the leading date/time part already removed.

    Returns
    -------
    bool
        True if an author prefix is present, False otherwise (for example
        for messages such as "swapnil sir added you" that carry no
        ``name:`` prefix).
    """
    return bool(_AUTHOR_PREFIX.match(s))
  
def getDataPoint(line):
    """Split one message-starting line into its four fields.

    The line is expected to look like ``<date>, <time> - <author>: <text>``
    or, when no author prefix is present, ``<date>, <time> - <text>``.

    Only the FIRST ``' - '`` and the FIRST ``': '`` are treated as
    separators.  Earlier code split on every occurrence and re-joined with a
    single space, which silently removed any later ``' - '`` or ``': '``
    from the message body.

    Parameters
    ----------
    line : str
        A line for which ``startsWithDateAndTime`` is true.

    Returns
    -------
    tuple
        ``(date, time, author, message)``; ``author`` is ``None`` when
        ``FindAuthor`` does not recognise an author prefix.

    Raises
    ------
    ValueError
        If the part before the first ``' - '`` does not contain exactly one
        ``', '`` separating date from time.
    """
    # partition() cuts at the first separator only, so the message text keeps
    # any further ' - ' it contains.  With no separator the whole line lands
    # in dateTime and message is '', matching the previous behaviour.
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')
    if FindAuthor(message):
        # Same idea for the author prefix: keep later ': ' in the body intact.
        author, _, message = message.partition(': ')
    else:
        author = None
    return date, time, author, message

parsedData = []  # Rows of [date, time, author, message] fed to the DataFrame below
# Point this at your exported chat file.
conversationPath = 'fychat.txt'  # chat file
with open(conversationPath, encoding="utf-8") as fp:
    # The first line of the export is skipped; per the original note it holds
    # the end-to-end encryption notice rather than a chat message.
    fp.readline()
    messageBuffer = []  # text pieces of the message currently being assembled
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if startsWithDateAndTime(line):
            # A new message begins: store the previous one (if any) first.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # Continuation line of a multi-line message.
            messageBuffer.append(line)
    # Flush the message still sitting in the buffer at end of file; without
    # this the last message of the chat was silently dropped.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])

df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message'])  # Initialising a pandas DataFrame.
# No explicit format is given, so pandas infers how to read the date strings.
df["Date"] = pd.to_datetime(df["Date"])

In [8]:
# Preview the first five parsed rows to sanity-check the four columns.
df.head(n=5)

Unnamed: 0,Date,Time,Author,Message
0,2020-02-12,6:32 PM,,"swapnil sir created group ""Freshers Cytron """
1,2020-02-12,6:32 PM,,swapnil sir added you
2,2020-02-12,6:33 PM,,swapnil sir changed this group's icon
3,2020-02-12,6:35 PM,swapnil sir,This group is for first year members. I'm prov...
4,2020-02-12,6:36 PM,swapnil sir,Arpit add everyone from first year except pce ...


In [9]:
# Display only the rows without missing values (rows whose Author is None are
# hidden).  The result is not assigned, so `df` itself is left unchanged.
df.dropna(how="any")

Unnamed: 0,Date,Time,Author,Message
3,2020-02-12,6:35 PM,swapnil sir,This group is for first year members. I'm prov...
4,2020-02-12,6:36 PM,swapnil sir,Arpit add everyone from first year except pce ...
21,2020-02-12,8:25 PM,+91 90799 06440,Welcome to Cytron Club Hello guys.. Kashish th...
22,2020-02-12,8:28 PM,Bhavya,"Bhavya here from 2nd year.. Congratulations, ..."
23,2020-02-18,1:31 PM,swapnil sir,https://instagram.com/stories/we.cytron/224640...
25,2020-03-03,3:42 PM,Bhavya,Hello guys!! Swapnil has left the group bcoz o...
27,2020-03-09,3:03 PM,Bhavya,Re-discover Peace and Happiness. Celebrate. ht...
30,2020-03-11,11:27 PM,Shashank Tak,Announcing our website upgrade take a tour and...
31,2020-03-11,11:32 PM,Shashank Tak,Prefer laptop for better experience😅
32,2020-03-11,11:39 PM,+91 86192 39406,Sir I am from E section (pgi) There is no E se...


In [10]:
# List every distinct value in the Author column (None marks rows with no author).
df["Author"].unique()

array([None, 'swapnil sir', '+91 90799 06440', 'Bhavya', 'Shashank Tak',
       '+91 86192 39406', '+91 95880 57797', '+91 73219 48250',
       '+91 89490 90660', '+91 820 985 1728', '+91 74248 98333',
       '+91 95115 17886', '+91 76270 26366', '+91 95218 44845'],
      dtype=object)