In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
import datetime as dt
import re

%matplotlib inline

In [None]:
# Pattern for a bracketed timestamp: the run of digits, dots, commas, spaces
# and colons that sits between "[" and "] ". The brackets themselves are only
# asserted via lookbehind/lookahead and are not part of the match.
dateregex = re.compile(
  r"(?<=\[)[0-9., :]*(?=\] )"
)

In [None]:
# Read the whole chat export into memory; the file handle is only needed for
# the read itself.
with open('_chat.txt', mode='r', encoding='utf-8') as chatfile:
  rawtext = chatfile.read()

# Drop the invisible left-to-right marks (U+200E) scattered through the text.
chat = rawtext.replace('\u200e', '')

# One entry per line of the export, with surrounding whitespace trimmed.
chatlines = [rawline.strip() for rawline in chat.split('\n')]

In [None]:
# Parse the chat lines into three parallel lists (timestamp, sender, message)
# and finally into the `messages` DataFrame.
#
# A line opens a new message when it contains a bracketed timestamp that
# really parses as "%d.%m.%y, %H:%M:%S" and is followed by "sender: text".
# Any other line is treated as the continuation of the previous message.
timestamps = []
senders = []
messages = []
groupname = None

for line in chatlines:
  datematch = dateregex.search(line)

  # A regex hit alone is not enough: the pattern also accepts things like
  # "[1, 2] " or "[] " inside ordinary message text, so only a timestamp that
  # strptime accepts counts as the start of a new message.
  date = None
  if datematch is not None:
    try:
      date = dt.datetime.strptime(datematch.group(), "%d.%m.%y, %H:%M:%S")
    except ValueError:
      date = None

  if date is None:
    # Continuation of a multi-line message. Lines that appear before the
    # first message have nothing to attach to and are dropped.
    if messages:
      messages[-1] += "\n" + line
    continue

  # The match ends right before "] " (guaranteed by the regex lookahead), so
  # skipping those two characters yields the "sender: text" remainder.
  sender, separator, message = line[datematch.end() + 2:].partition(": ")
  if not separator:
    # Timestamped line without a "sender: " prefix (e.g. a system notice);
    # it is not a message written by anyone, so it is skipped.
    continue

  # The line announcing that you joined is recorded under the group's own
  # name, which is how the group name is discovered.
  if message.startswith("You joined using this group's invite link"):
    groupname = sender

  timestamps.append(date)
  senders.append(sender)
  messages.append(message)

# remove messages from group name
# (when no group name was found, groupname is None and nothing is removed)
kept = [
  (timestamp, sender, message)
  for timestamp, sender, message in zip(timestamps, senders, messages)
  if sender != groupname
]
timestamps = [timestamp for timestamp, _, _ in kept]
senders = [sender for _, sender, _ in kept]
messages = [message for _, _, message in kept]

messages = pd.DataFrame({"timestamp": timestamps, "sender": senders, "message": messages})

In [None]:
# Aggregate data

# Number of messages per sender, largest count first
sendercounts = messages['sender'].value_counts()
messagecount = sendercounts.sort_values(ascending=False)

# Sender names in the same order as messagecount
senders = messagecount.index.tolist()

# Number of messages per (calendar day, sender) pair, flattened into a table
# with the columns 'timestamp' (the day), 'sender' and 'messagecount'
messageday = messages['timestamp'].dt.date
messagedaycount = (
  messages
  .groupby([messageday, 'sender'])
  .size()
  .reset_index(name='messagecount')
)

In [None]:
# Allocate a color to each sender: sample the rainbow colormap at evenly
# spaced positions running from 1 down to 0, one position per sender.
colorpositions = np.linspace(1, 0, num=len(senders))
colors = plt.cm.rainbow(colorpositions)

In [None]:
# Pie Chart: Message count by sender
totalmessages = messagecount.values.sum()


def _slice_label(percent):
  """Convert a wedge's percentage share back into its message count."""
  return '{:.0f}'.format(percent*totalmessages/100)


fig, ax = plt.subplots(figsize=(6, 6))

ax.set_title("Message count by sender")
ax.pie(messagecount.values, labels=messagecount.index, colors=colors, autopct=_slice_label)
# Equal aspect ratio so the pie is drawn as a circle
ax.axis('equal')

plt.show()

In [None]:
# Line Chart: Message count by sender by day
plt.figure(figsize=(12, 6))

plt.title("Message count by sender by day")
plt.xlabel("Date")
plt.ylabel("Message count")

# messagedaycount only has rows for days on which a sender actually wrote
# something. The Gaussian filter assumes evenly spaced samples, so each
# sender's series is first expanded to every calendar day between the first
# and last day in the table, with 0 for the silent days.
daycounts = messagedaycount.assign(timestamp=pd.to_datetime(messagedaycount['timestamp']))
if daycounts.empty:
  alldays = pd.DatetimeIndex([])
else:
  alldays = pd.date_range(daycounts['timestamp'].min(), daycounts['timestamp'].max(), freq='D')

# Plot in reverse sender order so the first sender in `senders` is drawn last
# (on top); each sender keeps the color at its own position in `colors`.
for sender, color in zip(senders[::-1], colors[::-1]):
  sender_data = daycounts[daycounts['sender'] == sender]
  daily = sender_data.set_index('timestamp')['messagecount'].reindex(alldays, fill_value=0)
  # Smooth on floats: gaussian_filter1d returns the input's dtype by default,
  # so integer counts would be truncated to whole numbers.
  smoothed = gaussian_filter1d(daily.to_numpy(dtype=float), sigma=1)
  plt.plot(alldays, smoothed, color=color, label=sender)

# Legend: undo the reversed plotting order so entries follow `senders`
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1])

plt.show()