# Analyse Telegram Chat

Analyse and visualize the messages exported from `Telegram Desktop`.

-----

### Telegram Chat Export Tool

For documentation on how to export chats, head over to the guide **with video** on the [Telegram blog](https://www.telegram.org/blog/export-and-more).

-----


## Parameter

In [None]:
# Directory that holds the Telegram Desktop chat export; every `.html` file
# directly inside it is read and parsed by the loading cell further down.
EXPORT_FOLDER = "~/Downloads/Telegram Desktop/ChatExport_04_01_2020/"


## Parse Telegram Data

In [None]:
"""
Author: Jan-Eike Golenia  @jagoleni
Source: https://github.com/jagoleni/tele-data/blob/master/tele-data.py
"""
import os

from lxml import etree

import pandas as pd
import matplotlib.pyplot as plt

def _node_text(node):
    """Return all text nested inside *node*, rendering ``<br>`` as a newline."""
    parts = [node.text or ""]
    for sub in node:
        if sub.tag == "br":
            parts.append("\n")
        elif isinstance(sub.tag, str):
            # Inline markup (links, bold, ...) carries part of the message text.
            parts.append(_node_text(sub))
        # Comments have a non-string tag: their content is skipped, but the
        # text following them (the tail) still belongs to the message.
        parts.append(sub.tail or "")
    return "".join(parts)


def _read_body(body, message):
    """Copy sender, timestamp, text and type from a ``body`` node into *message*."""
    for part in body:
        css_class = part.get("class")
        if css_class == "from_name":
            # Only the leading text is the sender; nested nodes are ignored.
            message["name"] = (part.text or "").strip()
        elif css_class == "pull_right date details":
            message["timestamp"] = part.get("title")
        elif css_class == "text":
            message["text"] = _node_text(part).strip()
            message["type"] = "text"
        elif css_class == "forwarded body":
            message["type"] = "forwarded_message"
        elif css_class == "media_wrap clearfix":
            # The media kind is the last class token of the wrapper's first child.
            tokens = part[0].get("class", "").split() if len(part) else []
            if tokens:
                message["type"] = tokens[-1]


def parse_file(html_string):
    """Extract one dict per message from an exported Telegram HTML page.

    Every element carrying an ``id`` attribute is treated as a message.

    Args:
        html_string: Content of one exported ``.html`` file.

    Returns:
        A list of dicts in document order. ``message_id`` is always present;
        ``name``, ``timestamp``, ``text``, ``type`` and ``joined_message`` are
        set only when the corresponding markup is found. ``name`` is ``None``
        for a "joined" message that appears before any sender name in the file.
        The list is empty when the parser yields no document.
    """
    messages = []
    root = etree.HTML(html_string)
    if root is None:
        return messages

    # Sender of the most recent message that named one; "joined" messages
    # omit the sender and inherit it from here.
    last_name = None
    for element in root.iter():
        if "id" not in element.attrib:
            continue
        # .get() instead of attrib[...] so nodes without a class are skipped
        # rather than raising KeyError.
        element_class = element.get("class")
        message = {"message_id": element.attrib["id"]}
        for child in element:
            child_class = child.get("class")
            if element_class == "message service" and child_class == "body details":
                message["text"] = _node_text(child).strip()
                message["type"] = "service_message"
            if child_class == "body":
                _read_body(child, message)
        if "name" in message:
            last_name = message["name"]
        if element_class == "message default clearfix joined":
            message["joined_message"] = True
            message["name"] = last_name
        if element_class == "message default clearfix":
            message["joined_message"] = False
        messages.append(message)
    return messages

# Parse every exported HTML page in the export folder into one DataFrame.

# os.listdir() does not expand "~", so resolve it to the home directory first.
export_dir = os.path.expanduser(EXPORT_FOLDER)

data = []
filecount = 0
# os.listdir() returns entries in arbitrary order; sort for a reproducible row order.
for fname in sorted(os.listdir(export_dir)):
    fpath = os.path.join(export_dir, fname)
    if os.path.isfile(fpath) and os.path.splitext(fpath)[-1] == ".html":
        with open(fpath, encoding='utf8') as f:
            html = f.read()
        data += parse_file(html)
        filecount += 1

df = pd.DataFrame(data)
# The export stores timestamps as day.month.year hour:minute:second strings.
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%d.%m.%Y %H:%M:%S")

print("Queried ", len(df.index), "raw messages from ", filecount, "files.")


In [None]:
# Keep only rows parsed as plain text; service, forwarded and media rows are dropped
is_text = df['type'] == "text"
messages = df.loc[is_text]

In [None]:
# Display the text-only messages as this cell's output
messages

-----

# Visualize

To set up Plotly with JupyterLab, follow the instructions at [plotly > Getting Started](https://plot.ly/python/getting-started/) - especially the [JupyterLab instructions](https://plot.ly/python/getting-started/#jupyterlab-support-python-35).

In [None]:
import plotly.graph_objects as go

# Messages by date

In [None]:
# Number of text messages per calendar day, shown as a bar chart

dates = pd.DataFrame( messages['timestamp'].dt.date )
# value_counts() names its result "count" on pandas >= 2.0 and leaves it unnamed
# on older versions; to_frame(0) pins the column label so the lookup below
# works on both. `dates` has a single column, so counting it directly is
# equivalent to stacking first.
date_counts = dates.iloc[:, 0].value_counts(sort=False).to_frame(0)

data = [go.Bar(x=date_counts.index, y=date_counts[0])]

fig = go.Figure(data)
fig.show()

# Messages by Time


In [None]:
# Number of text messages per hour of day, shown as a bar chart

hours = pd.DataFrame( pd.DatetimeIndex(messages['timestamp']).hour )

# value_counts() names its result "count" on pandas >= 2.0 and leaves it unnamed
# on older versions; to_frame(0) pins the column label so `time_counts[0]`
# works on both. `hours` has a single column, so counting it directly is
# equivalent to stacking first.
time_counts = hours.iloc[:, 0].value_counts(sort=False).to_frame(0)

data = [go.Bar(x=time_counts.index, y=time_counts[0])]

fig = go.Figure(data)
fig.show()

In [None]:
# Share of messages per hour of day, shown as a donut chart
lbls = list( time_counts.index.astype(str) )
# NOTE(review): `night_colors` is not defined anywhere in the visible notebook -
# define the colour palette before running this cell.
# iloc selects the counts by position, so it works whatever label the column has.
data = [go.Pie(labels=lbls, values=time_counts.iloc[:, 0], hole=.3, marker_colors=night_colors)]

# Create the pie figure before styling it; otherwise update_traces() would be
# applied to the bar chart still bound to `fig` from the previous cell.
fig = go.Figure(data=data)
fig.update_traces(textposition='inside', textinfo='label')
fig.show()

# Word count


In [None]:
# TODO ...