In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re
import matplotlib
import matplotlib.pyplot as plt

In [2]:
def startsWithDate(s, year='19'):
    """Return True if ``s`` begins with a chat-export timestamp.

    The accepted prefix is ``D/M/YY H:MM -``: day 1-31 and month 1-12 written
    without leading zeros, a one- or two-digit hour and two-digit minutes,
    followed by a space and a dash, e.g. ``'7/3/19 15:03 - Coto: ...'``.

    Parameters
    ----------
    s : str
        Line to test.
    year : str or None, optional
        Two-digit year the timestamp must carry. Defaults to ``'19'``, the
        value that was previously hard-coded. Pass ``None`` to accept any
        two-digit year.

    Returns
    -------
    bool
        True when the line starts with such a timestamp, False otherwise.
    """
    year_pattern = r'\d{2}' if year is None else re.escape(str(year))
    # Raw strings: '\/' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    pattern = (
        r'^([1-9]|[1-2][0-9]|(3)[0-1])(\/)([1-9]|(1)[0-2])(\/)'
        + year_pattern
        + r' ([0-9][0-9]|[0-9]):([0-9][0-9]) -'
    )
    return re.match(pattern, s) is not None

In [3]:
def startsWithAuthor(s):
    """Return True if ``s`` begins with an author prefix.

    An author prefix is either one, two or three whitespace-separated words
    (``\\w`` characters) immediately followed by ``:``, or a phone number
    shaped like ``+NN NNNN NNNNNNN`` (the phone form does not require a
    trailing colon).

    Parameters
    ----------
    s : str
        Message text with the timestamp already removed.

    Returns
    -------
    bool
    """
    # Raw strings: '\w', '\s' and '\d' are invalid escapes in normal string
    # literals and trigger a warning on recent Python versions.
    patterns = [
        r'([\w]+):',                        # First Name
        r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{4} \d{7})'           # Mobile Number (Europe)
    ]
    # Group the alternatives so the '^' anchor visibly applies to all of them
    # (re.match already anchors at the start, so matching is unchanged).
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None

In [4]:
def getDataPoint(line):
    """Split one dated chat line into ``(date, time, author, message)``.

    Parameters
    ----------
    line : str
        A line such as ``'18/6/19 22:47 - Daniki: Avui se surt'``.

    Returns
    -------
    tuple
        ``(date, time, author, message)``. ``author`` is ``None`` when the
        text after the timestamp does not start with an author prefix
        according to ``startsWithAuthor``.

    Raises
    ------
    ValueError
        If the part before the first ``' - '`` does not consist of exactly
        two whitespace-separated tokens.
    """
    # line = '18/6/19 22:47 - Daniki: Avui se surt'

    # Split on the FIRST ' - ' only, so a message that itself contains ' - '
    # is kept verbatim instead of having the dash removed.
    dateTime, _, message = line.partition(' - ')  # dateTime = '18/6/19 22:47'; message = 'Daniki: Avui se surt'

    date, time = dateTime.split()  # date = '18/6/19'; time = '22:47'

    if startsWithAuthor(message):  # True
        # Split on the FIRST ': ' only, so later colons in the text survive.
        splitMessage = message.split(': ', 1)  # splitMessage = ['Daniki', 'Avui se surt']
        author = splitMessage[0]  # author = 'Daniki'
        # No ': ' found (e.g. a bare phone-number prefix): the text is empty.
        message = splitMessage[1] if len(splitMessage) > 1 else ''  # message = 'Avui se surt'
    else:
        author = None
    return date, time, author, message

In [6]:
parsedData = []  # Rows of [date, time, author, message]; later turned into a Pandas dataframe
conversationPath = 'Chat_Qehace.txt'
with open(conversationPath, encoding="utf-8") as fp:
    # Skip the three leading lines of the file (the first one usually contains
    # information about end-to-end encryption).
    # NOTE(review): confirm lines two and three of the export are not chat messages.
    fp.readline()
    fp.readline()
    fp.readline()
    messageBuffer = []  # Pieces of the message currently being assembled (multi-line messages)
    date, time, author = None, None, None  # Fields of the message currently being assembled

    for line in fp:
        line = line.strip()  # Guard against stray leading and trailing whitespace
        if startsWithDate(line):
            # A timestamp marks the start of a new message: store the previous one first.
            if messageBuffer:
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            # No timestamp: this line continues the current multi-line message.
            messageBuffer.append(line)

    # Flush the last message; without this the final message of the chat
    # would never be stored because no later timestamp triggers the append.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])

In [8]:
# Tabulate the parsed rows, one named column per extracted field.
df = pd.DataFrame(
    data=parsedData,
    columns=['Date', 'Time', 'Author', 'Message'],
)

In [9]:
# Rows for which no author was extracted.
null_authors_df = df.loc[df['Author'].isna()]
# Rows whose whole message is the export's placeholder for omitted media.
media_messages_df = df.loc[df['Message'].eq('<Multimedia omitido>')]

In [11]:
# Keep only rows that have an author and are not media placeholders.
# errors='ignore' on the second drop: a media row that also lacks an author
# has already been removed by the first drop, and dropping a missing label
# would otherwise raise KeyError.
messages_df = (
    df
    .drop(null_authors_df.index)
    .drop(media_messages_df.index, errors='ignore')
)

In [13]:
# Preview the first five rows of the cleaned messages.
messages_df.head(n=5)

Unnamed: 0,Date,Time,Author,Message
0,7/3/19,15:03,Coto,Lokis ara he arribat a Lleida
1,7/3/19,15:08,Knuts,https://www.youtube.com/watch?v=SvXu4-WLJT8
2,7/3/19,15:09,Coto,Jajaja ja marxo
3,7/3/19,15:44,Daniki,Estic currant a la csa colonies jo
4,7/3/19,15:45,Daniki,Fins div d la setman vinent re
