# Telegram Export file Analyzer

In [1]:
# используем улучшенный print
from pprint import pprint

## Анализируем статистику по ключам в сообщениях

In [4]:
import json

# Read the Telegram export file and decode it into plain Python objects.
with open('result.json', encoding='utf-8') as export_file:
    export_data = json.load(export_file)

In [6]:
# Key statistics over every message of every chat in the export.
# After this cell:
#   all_keys       - keys that are present in every single message
#   message_types  - distinct values found under the 'type' key
#   keys_count     - key -> percentage of messages containing that key,
#                    ordered from most to least frequent, rounded to 3 digits
#   total_messages - number of messages seen

message_types = set()
keys_count = {}
total_messages = 0
shared_keys = None  # stays None until the first message has been seen

for chat in export_data['chats']['list']:
    for message in chat['messages']:
        total_messages += 1
        message_types.add(message['type'])

        # Iterate the message itself (not a set) so that keys enter
        # keys_count in first-seen order; ties keep that order when sorted.
        for name in message:
            if name in keys_count:
                keys_count[name] += 1
            else:
                keys_count[name] = 1

        present = set(message)
        shared_keys = present if shared_keys is None else shared_keys & present

all_keys = set() if shared_keys is None else shared_keys

# turn raw counts into percentages of all messages
percentages = {
    name: hits / total_messages * 100 for name, hits in keys_count.items()
}

# order by percentage (highest first), then round for display
ranked = sorted(percentages.items(), key=lambda pair: pair[1], reverse=True)
keys_count = {name: round(share, 3) for name, share in ranked}

pprint(keys_count, sort_dicts=False)


{'id': 100.0,
 'type': 100.0,
 'date': 100.0,
 'date_unixtime': 100.0,
 'text': 100.0,
 'text_entities': 100.0,
 'from': 99.725,
 'from_id': 99.725,
 'reply_to_message_id': 20.526,
 'edited': 18.395,
 'edited_unixtime': 18.395,
 'width': 14.349,
 'height': 14.349,
 'file': 11.818,
 'media_type': 10.669,
 'thumbnail': 8.027,
 'mime_type': 7.903,
 'duration_seconds': 7.748,
 'photo': 6.399,
 'forwarded_from': 5.629,
 'sticker_emoji': 3.88,
 'saved_from': 0.848,
 'action': 0.278,
 'actor': 0.275,
 'actor_id': 0.275,
 'discard_reason': 0.234,
 'via_bot': 0.06,
 'location_information': 0.035,
 'poll': 0.028,
 'message_id': 0.027,
 'live_location_period_seconds': 0.025,
 'title': 0.015,
 'performer': 0.013,
 'emoticon': 0.009,
 'contact_information': 0.003,
 'to': 0.003,
 'to_id': 0.003,
 'distance': 0.003,
 'game_title': 0.002,
 'game_description': 0.002,
 'game_link': 0.002,
 'contact_vcard': 0.002,
 'game_message_id': 0.001,
 'score': 0.001,
 'self_destruct_period_seconds': 0.001}


In [2]:
# keys that are present in every message
main_keys = {'id', 'type', 'date', 'date_unixtime', 'text', 'text_entities'}

# how often each key occurs in messages, in percent of all messages
# (pasted from the output of the statistics cell)
most_pop_keys = {
    'id': 100.0,
    'type': 100.0,
    'date': 100.0,
    'date_unixtime': 100.0,
    'text': 100.0,
    'text_entities': 100.0,
    'from': 99.725,
    'from_id': 99.725,
    'reply_to_message_id': 20.526,
    'edited': 18.395,
    'edited_unixtime': 18.395,
    'width': 14.349,
    'height': 14.349,
    'file': 11.818,
    'media_type': 10.669,
    'thumbnail': 8.027,
    'mime_type': 7.903,
    'duration_seconds': 7.748,
    'photo': 6.399,
    'forwarded_from': 5.629,
    'sticker_emoji': 3.88,
    'saved_from': 0.848,
    'action': 0.278,
    'actor': 0.275,
    'actor_id': 0.275,
    'discard_reason': 0.234,
    'via_bot': 0.06,
    'location_information': 0.035,
    'poll': 0.028,
    'message_id': 0.027,
    'live_location_period_seconds': 0.025,
    'title': 0.015,
    'performer': 0.013,
    'emoticon': 0.009,
    'contact_information': 0.003,
    'to': 0.003,
    'to_id': 0.003,
    'distance': 0.003,
    'game_title': 0.002,
    'game_description': 0.002,
    'game_link': 0.002,
    'contact_vcard': 0.002,
    'game_message_id': 0.001,
    'score': 0.001,
    'self_destruct_period_seconds': 0.001,
}

# print every key that occurs in at least 1% of messages but not in all of them
optional_but_common = (
    name for name, share in most_pop_keys.items() if 1 <= share < 100
)
for name in optional_but_common:
    print(name)


from
from_id
reply_to_message_id
edited
edited_unixtime
width
height
file
media_type
thumbnail
mime_type
duration_seconds
photo
forwarded_from
sticker_emoji


In [None]:
# Debug aid: pretty-print the first 10 messages of the first chat.
# `depth` and `width` are pprint() options; the builtin print() rejects them
# with a TypeError, so pprint() (imported in the first cell) is used here.
pprint(export_data['chats']['list'][0]['messages'][:10], depth=4, width=120)
# pprint(export_data['contacts']['list'][:10], depth=2, width=120)

In [None]:
# Print the type and value of one key for a handful of messages, to get a
# feel for which types and values that key can take.

key = 'sticker_emoji'
max_samples = 10  # stop after this many matching messages
count = 0  # number of samples printed; stays 0 when nothing matches

# Flatten chats -> messages lazily so that a single `break` ends the whole
# scan. With two nested loops the `break` only left the inner loop, so every
# following chat still printed one more message after the limit was reached.
matching_messages = (
    message
    for chat in export_data['chats']['list']
    for message in chat['messages']
    if key in message
)

for count, message in enumerate(matching_messages, start=1):
    print(type(message[key]))
    print(message[key])
    print(message)
    print('------------------')
    # `>=` so that exactly max_samples messages are shown (the previous
    # `count > 10` check let an 11th one through).
    if count >= max_samples:
        break

## Pydantic модельки

In [120]:
from typing import List, Dict, Generic, TypeVar
from pydantic import BaseModel, Field, NoneStr
from pydantic.generics import GenericModel
from datetime import datetime
from enum import Enum

# Optional integer; used below for numeric message fields that may be absent.
NoneInt = int | None


class ChatType(str, Enum):
    """Accepted values of a chat's ``type`` key (see ``TelegramChat.type_``).

    Only the two values listed here are accepted by the model.
    """

    saved_messages = 'saved_messages'
    personal_chat = 'personal_chat'


class ContactCategory(str, Enum):
    """Accepted values of ``FrequentContact.category``."""

    people = 'people'
    # member name differs from the value: the JSON value is 'inline_bots'
    bots = 'inline_bots'
    calls = 'calls'


class MessageType(str, Enum):
    """Accepted values of a message's ``type`` key (see ``Message.type``)."""

    message = 'message'
    service = 'service'


class MediaType(str, Enum):
    """Accepted values of a message's ``media_type`` key (see ``Message.media_type``)."""

    animation = "animation"
    video_file = "video_file"
    video_message = "video_message"
    voice_message = "voice_message"
    audio_file = "audio_file"
    sticker = "sticker"


class Message(BaseModel):
    """A single message of a chat in the Telegram export.

    The first five fields are required. The remaining fields are declared
    as optional and cover the keys that the statistics cell above found in
    more than 1% of messages. Keys that are not declared here (for example
    ``date_unixtime``) are not modelled.
    """

    date: datetime
    # populated from the JSON key 'id'
    id_: int = Field(..., alias='id')
    text_entities: List[Dict]
    type: MessageType
    # either a plain string or a list mixing strings and dict parts
    text: str | List[Dict | str]
    # all other fields are optional

    # fields with > 1% count:
    # 'from' is a reserved word in Python, hence the alias
    from_: NoneStr = Field(None, alias='from')
    from_id: NoneStr
    reply_to_message_id: NoneInt
    edited: datetime | None
    file: NoneStr
    thumbnail: NoneStr
    media_type: MediaType | None
    mime_type: NoneStr
    photo: NoneStr
    width: NoneInt
    height: NoneInt
    duration_seconds: NoneInt
    forwarded_from: NoneStr
    sticker_emoji: NoneStr

    def getPlainText(self) -> str:
        """Return the message text as one plain string.

        Returns ``''`` when ``text`` is empty, ``text`` itself when it is
        already a string, and otherwise the concatenation of the ``'text'``
        value of every item in ``text_entities``.
        """
        if not self.text:
            return ''
        if isinstance(self.text, str):
            return self.text
        return ''.join(e['text'] for e in self.text_entities)
        # note (from the original author): if the message contains a link, only
        # its display text is kept, not the link target itself (href)


class Contact(BaseModel):
    """One item of the export's ``contacts`` list (see ``TelegramExport.contacts``)."""

    # kept as a plain string, not parsed into a datetime
    date: str
    date_unixtime: int
    first_name: str
    last_name: str
    phone_number: str


class FrequentContact(BaseModel):
    """One item of the export's ``frequent_contacts`` list (see ``TelegramExport``)."""

    # populated from the JSON key 'id'
    id_: int = Field(..., alias='id')
    name: str
    rating: float
    category: ContactCategory


# type of the items held by AboutWrapper.list
TData = TypeVar('TData')


class AboutWrapper(GenericModel, Generic[TData]):
    """Generic container: an ``about`` string plus a ``list`` of ``TData`` items.

    Used by ``TelegramExport`` for its ``chats``, ``contacts`` and
    ``frequent_contacts`` sections.
    """

    about: str
    # field name matches the JSON key 'list' (shadows the builtin inside this class)
    list: List[TData]


# the two main models

class TelegramChat(BaseModel):
    """A chat together with all of its messages.

    Used both as an item of ``TelegramExport.chats`` and on its own for a
    single-chat export file.
    """

    # populated from the JSON key 'id'
    id_: int = Field(..., alias='id')
    messages: List[Message]
    name: NoneStr
    # populated from the JSON key 'type'
    type_: ChatType = Field(..., alias='type')


class TelegramExport(BaseModel):
    """Top-level model of a full export file: chats, contacts and frequent contacts."""

    chats: AboutWrapper[TelegramChat]
    contacts: AboutWrapper[Contact]
    frequent_contacts: AboutWrapper[FrequentContact]


In [99]:
# Load and validate the whole export file into the TelegramExport model.
parsed_data = TelegramExport.parse_file('result.json')  # export of all chats

In [19]:
# Load and validate a single-chat export file into the TelegramChat model.
parsed_chat = TelegramChat.parse_file('sergey/result.json')  # export of a single chat

In [None]:
# Find the first chat that has at most 1000 messages, i.e. one that is small
# enough to print in full. `i` is its index in parsed_data.chats.list, or
# None when there is no such chat. The previous unbounded `while` loop ran
# past the end of the list (IndexError) in that case, and also when the list
# was empty.
i = next(
    (
        index
        for index, chat in enumerate(parsed_data.chats.list)
        if len(chat.messages) <= 1000
    ),
    None,
)

if i is None:
    print('no chat with at most 1000 messages found')
else:
    # print all messages of that chat
    print(parsed_data.chats.list[i].messages)

In [279]:
from random import choice

# pick a random chat
random_chat = choice(parsed_data.chats.list)

print(f"chat name: {random_chat.name}")
print(f"chat id: {random_chat.id_}\n")

# pick a random message from that chat
random_message = choice(random_chat.messages)

pprint(random_message.dict(), depth=4, width=120)

print('\nMessage text:')
print(random_message.getPlainText())

chat name: Сергей
chat id: 254968821

{'date': datetime.datetime(2022, 9, 9, 17, 24, 32),
 'duration_seconds': None,
 'edited': None,
 'file': None,
 'forwarded_from': None,
 'from_': 'Сергей Холодильник',
 'from_id': 'user254968821',
 'height': None,
 'id_': 281548,
 'media_type': None,
 'mime_type': None,
 'photo': None,
 'reply_to_message_id': None,
 'sticker_emoji': None,
 'text': 'Ну типа',
 'text_entities': [{'text': 'Ну типа', 'type': 'plain'}],
 'thumbnail': None,
 'type': <MessageType.message: 'message'>,
 'width': None}

Message text:
Ну типа


In [None]:
# fixed ids, used for manual testing

chat_id = 254968821
message_id = 281548

# locate the chat with the given id (StopIteration if there is none)
chat = next(filter(lambda candidate: candidate.id_ == chat_id,
                   parsed_data.chats.list))

# locate the message with the given id inside that chat
message = next(filter(lambda candidate: candidate.id_ == message_id,
                      chat.messages))

pprint(message.dict(), depth=4, width=120)

print('\nMessage text:')
print(message.getPlainText())