# Analyse Telegram Chat

Analyse and visualize the messages exported from `Telegram Desktop`.

-----

### Telegram Chat Export Tool

For documentation on how to export chats, see the guide **with video** on the [Telegram blog](https://www.telegram.org/blog/export-and-more).

-----


## Parameters

In [None]:
# The directory containing the data export
# EDIT the date to match your export!!!

from pathlib import Path
# Home directory of the current user, kept as a plain string (Path.home() needs Python 3.5+)
HOME_PATH = str(Path.home())
# Folder written by the Telegram Desktop export; the trailing "/" is part of the value
EXPORT_FOLDER = "".join((HOME_PATH, "/Downloads/Telegram Desktop/ChatExport_20_01_2020/"))


In [None]:
# Store the generated charts to this folder
OUTPUT_DIR = "CHARTS/" # Note the "/" at the end!

# Create the folder up front: the chart cells write their image files into
# OUTPUT_DIR but never create it, so a missing folder would make them fail.
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Parse Telegram Data

In [None]:
"""
Author: Jan-Eike Golenia  @jagoleni
Source: https://github.com/jagoleni/tele-data/blob/master/tele-data.py
"""
import os

from lxml import etree

import pandas as pd

def _leading_text(node):
    """Return the text at the very start of *node*, stripped ('' if there is none).

    NOTE(review): only ``node.text`` is read, i.e. the text before the first
    child element; text following nested tags is not collected.
    """
    return (node.text or "").strip()


def _media_type(media_wrap):
    """Return the last CSS class of the wrapper's first child, or None if unavailable."""
    for inner in media_wrap:
        css_classes = (inner.get("class") or "").split()
        return css_classes[-1] if css_classes else None
    return None


def _parse_body(body, message):
    """Fill *message* from the children of a "body" node.

    Returns the sender name found in the body, or None when it has none.
    """
    sender = None
    for part in body:
        part_class = part.get("class")
        if part_class == "from_name":
            sender = _leading_text(part)
            message["name"] = sender
        elif part_class == "pull_right date details":
            timestamp = part.get("title")
            if timestamp is not None:
                message["timestamp"] = timestamp
        elif part_class == "text":
            message["text"] = _leading_text(part)
            message["type"] = "text"
        elif part_class == "forwarded body":
            message["type"] = "forwarded_message"
        elif part_class == "media_wrap clearfix":
            media_type = _media_type(part)
            if media_type is not None:
                message["type"] = media_type
    return sender


def parse_file(html_string):
    """Extract one dict per element carrying an ``id`` attribute from an export page.

    Args:
        html_string: HTML source of one exported chat page.

    Returns:
        A list of dicts. Every dict has ``message_id``; depending on what the
        element contains it may also have ``text``, ``type``, ``name``,
        ``timestamp`` and ``joined_message``. Empty or whitespace-only input
        yields an empty list.

    Nodes that lack a ``class`` or ``title`` attribute, or that have no
    leading text, are skipped or read as '' instead of raising.
    """
    if not html_string or not html_string.strip():
        return []
    root = etree.HTML(html_string)
    if root is None:
        return []

    data = []
    # Sender of the most recent message that carried a name; "joined"
    # messages have no name of their own and inherit this one.
    last_name = None
    for element in root.iter():
        message_id = element.get("id")
        if message_id is None:
            continue
        element_class = element.get("class")
        message = {"message_id": message_id}
        # Iterate the element directly; getchildren() is deprecated in lxml.
        for child in element:
            child_class = child.get("class")
            if element_class == "message service" and child_class == "body details":
                message["text"] = _leading_text(child)
                message["type"] = "service_message"
            if child_class == "body":
                sender = _parse_body(child, message)
                if sender is not None:
                    last_name = sender
        if element_class == "message default clearfix joined":
            message["joined_message"] = True
            # A joined message at the very start of a file has no known sender.
            if last_name is not None:
                message["name"] = last_name
        if element_class == "message default clearfix":
            message["joined_message"] = False
        data.append(message)
    return data

# Parse every .html page found directly inside EXPORT_FOLDER and collect
# the resulting message dicts in one list.
data = []
filecount = 0
for fname in os.listdir(EXPORT_FOLDER):
    fpath = os.path.join(EXPORT_FOLDER, fname)
    # Skip sub-folders and anything that is not an HTML page
    if not os.path.isfile(fpath) or os.path.splitext(fpath)[1] != ".html":
        continue
    with open(fpath, encoding='utf8') as f:
        data.extend(parse_file(f.read()))
    filecount += 1

# One row per parsed message; the timestamp strings become datetime values
df = pd.DataFrame(data)
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%d.%m.%Y %H:%M:%S")

print("Queried ", len(df.index), "raw messages from ", filecount, "files.")


In [None]:
# Keep only the rows whose type is "text"; every other type is dropped
messages = df[df['type'] == "text"]

In [None]:
# Display parsed messages (the rows of df whose type is "text")
messages

-----

# Visualize

To set up Plotly with JupyterLab, follow the instructions at [plotly > Getting Started](https://plot.ly/python/getting-started/), especially the [JupyterLab instructions](https://plot.ly/python/getting-started/#jupyterlab-support-python-35).

In [None]:
# Install wordcloud
#!conda install -c conda-forge wordcloud -y

In [None]:
import plotly.graph_objects as go

# Total Messages

In [None]:
# Row count of the filtered text messages
print("Total text messages: ", len(messages))

# Messages by date

In [None]:
# Date

# Calendar date of every text message
dates = pd.DataFrame( messages['timestamp'].dt.date )
# Number of messages per date, as a one-column DataFrame indexed by date
date_counts = pd.DataFrame( dates.stack().value_counts(sort=False) )

# Select the count column by position: its label is 0 on older pandas but
# "count" since pandas 2.0, so date_counts[0] is not portable.
data = [go.Bar(x=date_counts.index, y=date_counts.iloc[:, 0])]

# Plot chart
fig = go.Figure(data)
fig.show()
# Save to file
filename = "Telegram_by_date.png"
fig.write_image(OUTPUT_DIR + filename)

In [None]:
# Year

# Year of every text message
years = pd.DataFrame( messages['timestamp'].dt.year )
# Number of messages per year, as a one-column DataFrame indexed by year
year_counts = pd.DataFrame( years.stack().value_counts(sort=False) )

# Select the count column by position: its label is 0 on older pandas but
# "count" since pandas 2.0, so year_counts[0] is not portable.
data = [go.Bar(x=year_counts.index, y=year_counts.iloc[:, 0])]

# Plot chart
fig = go.Figure(data)
fig.show()
# Save to file
filename = "Telegram_by_year.png"
fig.write_image(OUTPUT_DIR + filename)

# Messages by Time


In [None]:
# Time

# Hour of day of every text message
hours = pd.DataFrame( pd.DatetimeIndex(messages['timestamp']).hour )

# Number of messages per hour, as a one-column DataFrame indexed by hour
time_counts = pd.DataFrame( hours.stack().value_counts(sort=False) )

# Select the count column by position: its label differs between pandas
# versions ("count" since pandas 2.0), so time_counts[0] is not portable.
hour_totals = time_counts.iloc[:, 0]

data = [go.Bar(x=time_counts.index, y=hour_totals)]

# Plot chart
fig = go.Figure(data)
fig.show()
# Save to file
filename = "Telegram_by_time.png"
fig.write_image(OUTPUT_DIR + filename)

# Same distribution as a donut chart, one slice per hour
lbls = list( time_counts.index.astype(str) )
data = [go.Pie(labels=lbls, values=hour_totals, hole=.3)] #, marker_colors=night_colors)]

# Create the pie figure first, then style its traces; styling before the
# figure was created would be applied to the bar figure from above instead.
fig = go.Figure(data=data)
fig.update_traces(textposition='inside', textinfo='label')
fig.show()

# Word count


In [None]:
# Text column of the filtered messages
message_texts = messages['text']

In [None]:
# Split every message on whitespace and gather all tokens in one list
words = []
for txt in message_texts:
    words.extend(txt.split())

# All tokens again as a single space-separated string
message_text_flat = " ".join(words)

print("Total words: ", len(words))
print("Words per message: ", len(words) / len(messages))

In [None]:
# Based on: https://amueller.github.io/word_cloud/auto_examples/single_word.html#sphx-glr-auto-examples-single-word-py

import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud


# Open coordinate grids covering a 1000 x 1000 pixel canvas
# (smaller variant from the referenced example: 300 x 300)
#x, y = np.ogrid[:300, :300]
x, y = np.ogrid[:1000, :1000]

# True for every pixel outside the circle of radius 400 around (500, 500)
#mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = (x - 500) ** 2 + (y - 500) ** 2 > 400 ** 2
# Turn the boolean mask into integers: 255 outside the circle, 0 inside
# NOTE(review): presumably 255 marks the area WordCloud must leave empty - confirm in the wordcloud docs
mask = 255 * mask.astype(int)


# Build the cloud from the flattened message text, using the circular mask
wc = WordCloud(background_color="white", repeat=True, mask=mask)
wc.generate(message_text_flat)

# store to file
wc.to_file(OUTPUT_DIR + "Telegram_cloud.png")

# show (axes are switched off before the image is drawn)
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.show()

### Emoji Count

In [None]:
# extended Count emojis
# Also counts "Emoji words" (multiple Emojis in a row)

import emoji

# Sample string; not used by the code in this cell
TEST_STR = "Get Emoji — All Emojis to ️ Copy and 📋 Paste 👌"

# Pick the emoji test once. Newer releases of the emoji package provide
# is_emoji(); otherwise fall back to the UNICODE_EMOJI mapping.
if hasattr(emoji, "is_emoji"):
    _is_emoji = emoji.is_emoji
else:
    _is_emoji = emoji.unicode_codes.UNICODE_EMOJI.__contains__

emoji_total_count = 0       # number of single emoji characters found
emoji_word_total_count = 0  # number of emoji words (runs of 2+ emojis) found
emoji_counts = {}           # emoji character -> occurrences
emoji_word_counts = {}      # emoji word -> occurrences
emoji_word = ""             # run of emojis currently being collected
for ch in message_text_flat:
    if _is_emoji(ch):
        # Character is an Emoji: count it and extend the current run
        emoji_total_count += 1
        emoji_word += ch
        emoji_counts[ch] = emoji_counts.get(ch, 0) + 1
        continue
    if ch == " ":
        # Ignore spaces between emojis
        continue
    # Any other character terminates the current run
    if len(emoji_word) > 1:
        emoji_word_total_count += 1
        emoji_word_counts[emoji_word] = emoji_word_counts.get(emoji_word, 0) + 1
    # Always start over, so a lone emoji is not glued to a later, unrelated one
    emoji_word = ""

# Record a run that reaches the end of the text
if len(emoji_word) > 1:
    emoji_word_total_count += 1
    emoji_word_counts[emoji_word] = emoji_word_counts.get(emoji_word, 0) + 1
emoji_word = ""

print("Found", emoji_total_count, "emojis.")
print("Found", emoji_word_total_count, "emoji words.")
# emoji_counts
# emoji_counts

### Emojis

In [None]:
# Visualize Emojis

# Column 0: emoji, column 1: count. Naming the columns explicitly keeps both
# present when no emoji was found, so the sort below does not raise KeyError.
emoji_counts_df = pd.DataFrame( list(emoji_counts.items()), columns=[0, 1] )
# Sort by frequency / count
emoji_counts_df.sort_values(1, ascending=False, inplace=True) # Sort by col

data = [go.Bar(x=emoji_counts_df[0], y=emoji_counts_df[1])]

# Plot chart
fig = go.Figure(data)
fig.show()
# Save to file
filename = "Telegram_emojis.png"
fig.write_image(OUTPUT_DIR + filename)

### Emoji Words

In [None]:
# Visualize Emoji words

# Column 0: emoji word, column 1: count. Naming the columns explicitly keeps
# both present when no emoji word was found, so the lookups below still work.
emoji_word_counts_df = pd.DataFrame( list(emoji_word_counts.items()), columns=[0, 1] )
# Sort by length of Emoji word
emoji_word_counts_df['length'] = emoji_word_counts_df[0].str.len()
emoji_word_counts_df.sort_values('length', ascending=False, inplace=True)

data = [go.Bar(x=emoji_word_counts_df[0], y=emoji_word_counts_df[1])]

# Plot chart
fig = go.Figure(data)
fig.show()
# Save to file
filename = "Telegram_emoji_words.png"
fig.write_image(OUTPUT_DIR + filename)