In [1]:
from telegram_models import TelegramExport
from pprint import pprint

In [2]:
chats = TelegramExport.parse_file('result.json').chats

In [3]:
chat = next(chat for chat in chats if chat.name == 'Сергей')

pprint(chat.dict(), indent=2, depth=1)

{ 'id_': 254968821,
  'messages': [...],
  'name': 'Сергей',
  'type_': <ChatType.Chat: 'personal_chat'>}


In [6]:
chunks = []

# Below a certain time gap we assume all messages belong to the same sub-dialogue;
# above a certain gap we assume the messages belong to different sub-dialogues.
# Between those two gaps the decision is delegated to a neural network that computes the specificity of a reply to a message,
# and that network is trusted only to a certain degree:
# (4 hours -> we assume the messages belong to one sub-dialogue, but if the network strongly disagrees, to different ones)
# (8 hours -> the opposite: we lean towards the messages being from different dialogues, but if the network convinces us otherwise, from one)

chunks.append(chat.messages[0])

def is_relevant(time_delta):
    """Map a time gap onto a value between 0.05 and 0.95 with a half-cosine ramp.

    Despite the name, the result is a float, not a boolean.

    Args:
        time_delta: Gap between two messages. The cells in this notebook pass
            the gap expressed in hours.

    Returns:
        0.05 for gaps below 3, 0.95 for gaps above 9, and a smooth
        monotonically increasing value in between.
    """
    from math import cos, pi

    ramp_start, ramp_end = 3, 9
    low, high = 0.05, 0.95

    # Outside the ramp the value is pinned to one of the bounds.
    if time_delta < ramp_start:
        return low
    if time_delta > ramp_end:
        return high

    # cos goes from 1 to -1 across the ramp, so (1 - cos) / 2 goes from 0 to 1.
    span = high - low
    wave = cos(pi * (time_delta - ramp_start) / (ramp_end - ramp_start))
    return span * (1 - wave) / 2 + low
    

# Preview on the first few messages only; the full range would be len(chat.messages).
for i in range(1, 10):  # len(chat.messages)):
    earlier, later = chat.messages[i - 1], chat.messages[i]
    delta = later.date - earlier.date
    # is_relevant() is fed hours here, so convert from seconds.
    delta_hours = delta.total_seconds() / 3600
    print(round(delta_hours, 2), is_relevant(delta_hours))


0.17 0.05
0.9 0.05
2.6 0.05
4.76 0.22740935865127454
0.0 0.05
0.18 0.05
20.99 0.95
0.0 0.05
0.46 0.05


In [6]:
import altair as alt
import pandas as pd

# Sample is_relevant() on a grid from 0 to 12 (exclusive) in steps of 0.05.
xs = [step / 20 for step in range(240)]
df = pd.DataFrame({'x': xs, 'y': [is_relevant(x) for x in xs]})

# add tooltip to graph
alt.Chart(df).mark_line().encode(x='x', y='y', tooltip=['x', 'y'])

## Some Cool visualizations of the data

In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier handed to both from_pretrained() calls below.
model_name = 'tinkoff-ai/response-quality-classifier-large'

# globals for classification
# get_relevance_specificity() reads `tokenizer` and `model` as module-level names,
# so this cell has to run before that function is called.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [74]:
def get_relevance_specificity(msg_list):
    """Score the last message of ``msg_list`` as a reply to the preceding ones.

    The messages are joined into one string of the form
    ``[CLS]m1[SEP]m2...[RESPONSE_TOKEN]reply`` and passed through the
    module-level ``tokenizer`` and ``model``.

    Args:
        msg_list: Sequence of 2 to 4 message texts; the last one is the reply.

    Returns:
        ``(None, None)`` when ``msg_list`` has fewer than 2 or more than 4
        items. Otherwise a NumPy array holding the sigmoid of the first row of
        the model logits; callers in this notebook unpack it as
        ``relevance, specificity``.

    NOTE(review): the tokenizer is called without truncation, so very long
    messages may exceed what the model accepts - confirm against the model's
    maximum sequence length.
    """
    n_messages = len(msg_list)
    if not 2 <= n_messages <= 4:
        return None, None

    # One [SEP] slot per context message beyond the first.
    template = ''.join(
        ['[CLS]{}', '[SEP]{}' * (n_messages - 2), '[RESPONSE_TOKEN]{}']
    )
    prompt = template.format(*msg_list)
    # The special tokens are already spelled out in the prompt itself.
    encoded = tokenizer(prompt, add_special_tokens=False, return_tensors='pt')

    with torch.inference_mode():
        raw_scores = model(**encoded).logits
        probas = torch.sigmoid(raw_scores)[0].cpu().detach().numpy()
    return probas

# TODO: better function
def get_slice_stats():
    """Placeholder that is not implemented yet; currently returns ``None``.

    Planned return values: relevance, specificity, delta,
    relevance_model_weight.
    """
    return None


In [58]:
get_relevance_specificity([msg.text for msg in chat.messages[:4]])

array([0.19573678, 0.26578358], dtype=float32)

In [222]:
temp = list(range(10))

# Print, for every index from 1 on, the element together with up to three
# predecessors. The loop variable is called `window` rather than `slice` so the
# built-in `slice` type stays usable for the rest of the kernel session.
for i in range(1, len(temp)):
    window = temp[max(i - 3, 0): i + 1]
    print(window)

[0, 1]
[0, 1, 2]
[0, 1, 2, 3]
[1, 2, 3, 4]
[2, 3, 4, 5]
[3, 4, 5, 6]
[4, 5, 6, 7]
[5, 6, 7, 8]
[6, 7, 8, 9]


In [228]:
def get_message_with_context(msg_list, i, context_size=3):
    """Return element ``i`` of ``msg_list`` preceded by its context.

    Args:
        msg_list: Any sliceable sequence.
        i: Index of the target element.
        context_size: Maximum number of preceding elements to include.

    Returns:
        The slice ending at (and including) index ``i`` and starting up to
        ``context_size`` positions earlier, clipped at the start of the
        sequence.
    """
    first = i - context_size
    if first < 0:
        # Not enough history yet: start from the beginning instead.
        first = 0
    return msg_list[first:i + 1]


temp = list(range(10))
# Show the window produced for every index. The loop variable is `window`
# rather than `slice` so the built-in `slice` type is not shadowed in the kernel.
for i in range(0, len(temp)):
    window = get_message_with_context(temp, i, 3)
    print(window)


[0]
[0, 1]
[0, 1, 2]
[0, 1, 2, 3]
[1, 2, 3, 4]
[2, 3, 4, 5]
[3, 4, 5, 6]
[4, 5, 6, 7]
[5, 6, 7, 8]
[6, 7, 8, 9]


In [None]:
from tqdm import trange, tqdm

relevance_stats = []

# TODO: optimize to run in Colab
for i in trange(10):
    # Message i plus up to three preceding messages. Separate names are used
    # for the message objects and their texts, and neither shadows the
    # built-in `slice`.
    context_messages = get_message_with_context(chat.messages, i)
    context_texts = [msg.text for msg in context_messages]
    # For i == 0 there is only one message, so both scores come back as None.
    relevance, specificity = get_relevance_specificity(context_texts)
    relevance_stats.append(
        {
            'i': i,
            'id': chat.messages[i].id_,
            'relevance': relevance,
            'specificity': specificity
        }
    )

Relevance and specificity were computed in Google Colab (about 10x faster)  
and dumped to disk for later use in this notebook:

```python
import pickle

# Serialize the object using dumps()
serialized_object = pickle.dumps(relevance_stats)

# Write the serialized object to a file using dump()
with open('/content/gdrive/MyDrive/result.pickle', 'wb') as handle:
    pickle.dump(serialized_object, handle)
```

In [4]:
import pickle

# Read the serialized object from a file using load()
# The export pickled the output of pickle.dumps() a second time, so this first
# load yields a bytes object rather than the stats list itself.
# NOTE(review): pickle.load can execute arbitrary code - only open files you
# produced yourself.
with open('result.pickle', 'rb') as handle:
    serialized_object = pickle.load(handle)

# Deserialize the object using loads()
# Second decoding pass: turns the bytes back into the list of per-message dicts.
relevance_stats = pickle.loads(serialized_object)
len(relevance_stats)

45931

In [5]:
# Add a placeholder row for the very first message (index 0) with zero scores.
# The guard keeps this cell idempotent: without it, every re-run would prepend
# another row and shift all positions, which the following cell relies on to
# line the rows up with chat.messages.
if not relevance_stats or relevance_stats[0]['i'] != 0:
    relevance_stats = [
        {
            'i': 0,
            'id': chat.messages[0].id_,
            'from': chat.messages[0].from_,
            'relevance': 0,
            'specificity': 0
        }
    ] + relevance_stats

In [6]:
# The first message has no predecessor, so its gap is defined as zero.
relevance_stats[0]['delta'] = 0

# Rows are matched to messages by list position, not by the stored 'i' field.
for i in range(1, len(relevance_stats)):
    row = relevance_stats[i]
    current, previous = chat.messages[i], chat.messages[i - 1]
    row['from'] = current.from_
    # Gap to the previous message, in seconds.
    row['delta'] = (current.date - previous.date).total_seconds()

In [7]:
relevance_stats[:3]

[{'i': 0,
  'id': 75149,
  'from': 'Сергей Холодильник',
  'relevance': 0,
  'specificity': 0,
  'delta': 0},
 {'i': 1,
  'id': 75151,
  'relevance': 0.6474697,
  'specificity': 0.7512167,
  'from': 'Сергей Холодильник',
  'delta': 609.0},
 {'i': 2,
  'id': 75153,
  'relevance': 0.26715553,
  'specificity': 0.2812941,
  'from': 'Сергей Холодильник',
  'delta': 3236.0}]

In [29]:
relevance_stats[0]

{'i': 0,
 'id': 75149,
 'from': 'Сергей Холодильник',
 'relevance': 0,
 'specificity': 0,
 'delta': 0}

In [8]:
# make a dataframe out of relevance stats
import pandas as pd
df = pd.DataFrame(relevance_stats)
df

Unnamed: 0,i,id,from,relevance,specificity,delta
0,0,75149,Сергей Холодильник,0.000000,0.000000,0.0
1,1,75151,Сергей Холодильник,0.647470,0.751217,609.0
2,2,75153,Сергей Холодильник,0.267156,0.281294,3236.0
3,3,75180,Cyber Potato,0.195737,0.265784,9355.0
4,4,75326,Cyber Potato,0.195737,0.265784,17126.0
...,...,...,...,...,...,...
45927,45927,342269,Cyber Potato,0.483616,0.430553,14.0
45928,45928,342270,Cyber Potato,0.184772,0.133466,2.0
45929,45929,342271,Сергей Холодильник,0.229009,0.043367,2.0
45930,45930,342272,Сергей Холодильник,0.175168,0.080261,66.0


In [9]:
import altair as alt

# pick random 10% of rows in new dataframe
graph_data = df.sample(frac=0.1, random_state=42)
# Keep rows that have a relevance score and a strictly positive gap (the log
# scale below cannot show zero). The comparison must be parenthesised because
# `&` binds tighter than `>`; without the parentheses the expression is
# evaluated as `(notna & delta) > 0`.
graph_data = graph_data[graph_data['relevance'].notna() & (graph_data['delta'] > 0)]

# `delta` holds total_seconds(), so the axis is labelled in seconds.
alt.Chart(graph_data).mark_circle(size=60, opacity=0.1).encode(
    x=alt.X('delta', scale=alt.Scale(type='log', base=2), axis=alt.Axis(title='Time delta (seconds)', tickCount=5)),
    y='relevance',
    tooltip=['i', 'delta', 'relevance', 'specificity']
).interactive()


In [None]:
%pip uninstall altair -y
%pip install altair==4.1.0

In [37]:
# 2D histogram of relevance against the base-10 logarithm of the time gap.
# `delta` is stored in seconds, so the x axis is labelled accordingly.
alt.Chart(graph_data).transform_calculate(
    log_delta = 'log(datum.delta)/log(10)'
).mark_rect().encode(
    alt.X("log_delta:Q", bin=alt.Bin(maxbins=50), title="log10 of time delta (seconds)"),
    alt.Y("relevance:Q", bin=alt.Bin(maxbins=50), title="Relevance"),
    alt.Color("count()", scale=alt.Scale(scheme="greenblue", type='log', base=2), title="Count"),
    tooltip=['average(relevance)', 'average(specificity)']
).properties(
    title="2D Histogram Heatmap",
    height=400,
    width=600,
)


In [240]:
# Histogram of relevance; bar colour encodes the mean specificity per bin.
alt.Chart(graph_data).mark_bar().encode(
    x=alt.X("relevance:Q", bin=True),
    y=alt.Y('count()'),
    color=alt.Color('average(specificity)'),
    tooltip=['count()', 'average(relevance)', 'average(specificity)'],
)

## Main Algorithm

```python
threshold = relevance_model_weight / 2

# is the message relevant to the last N messages?
if (relevance_from_model >= (1 - threshold)):
    return True
elif (relevance_from_model < threshold):
    return False
else:
    return relevance_model_weight >= 0.5

In [38]:
from telegram_models import Message
from math import cos, pi


def relevance_model_weight(time_delta):
    """Map a time gap onto a weight between 0.05 and 0.95 with a half-cosine ramp.

    Args:
        time_delta: Gap between two messages; ``decide_relevance`` passes it
            in seconds.

    Returns:
        0.05 for gaps below 240, 0.95 for gaps above 32400, and a smooth
        monotonically increasing value in between.

    NOTE(review): the lower edge is 4 * 60 (four minutes when the input is in
    seconds) while the upper edge is nine hours - confirm that 4 * 60 * 60 was
    not intended.
    """
    ramp_start = 4 * 60
    ramp_end = 9 * 60 * 60
    floor, ceiling = 0.05, 0.95

    # Outside the ramp the weight is pinned to one of the bounds.
    if time_delta < ramp_start:
        return floor
    if time_delta > ramp_end:
        return ceiling

    # cos goes from 1 to -1 across the ramp, so (1 - cos) / 2 goes from 0 to 1.
    phase = cos(pi * (time_delta - ramp_start) / (ramp_end - ramp_start))
    return (ceiling - floor) * (1 - phase) / 2 + floor


def decide_relevance(chunk: list[Message]):
    """Decide whether the last message of ``chunk`` is relevant to the ones before it.

    The gap between the last two messages is turned into a weight by
    ``relevance_model_weight``. Half of that weight is used as a threshold:
    a model relevance of at least ``1 - threshold`` gives ``True``, a model
    relevance of at most ``threshold`` gives ``False``, and anything in
    between falls back to ``model_weight >= 0.5``.

    Args:
        chunk: Messages in chronological order, the candidate message last.
            Only the four most recent messages are scored, because
            ``get_relevance_specificity`` returns ``(None, None)`` for longer
            inputs.

    Returns:
        ``True`` if the last message is judged relevant, ``False`` otherwise.

    Raises:
        IndexError: If ``chunk`` holds fewer than two messages.

    NOTE(review): short gaps produce a small weight, so the fallback branch
    returns ``False`` for messages sent close together - confirm that this is
    the intended direction.
    """
    # Longer chunks used to end in a None comparison (TypeError); trimming to
    # the scorer's maximum window keeps them usable.
    window = chunk[-4:]

    delta = window[-1].date - window[-2].date
    model_weight = relevance_model_weight(delta.total_seconds())
    relevance, _ = get_relevance_specificity([msg.text for msg in window])
    threshold = model_weight / 2

    if relevance >= (1 - threshold):
        return True
    if relevance <= threshold:
        return False
    # The model is not confident enough either way: decide from the gap alone.
    return model_weight >= 0.5
    
# TODO: add tests for decide_relevance