In [1]:
import pandas as pd
import numpy as np
from telethon.sync import TelegramClient
from telethon.tl.types import PeerUser, PeerChat, PeerChannel, InputMessagesFilterPhotoVideo
from telethon.tl.functions.channels import GetFullChannelRequest
import json
import re
from telethon.errors import ChannelPrivateError
import pickle
import tqdm
import glob

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Read the Telegram API credentials, stored as a pickled 4-item sequence.
with open('../credentials/credentials.pickle','rb') as file:
    credentials = pickle.load(file)
api_id, api_hash, phone, username = credentials

In [3]:
# Load the first-level edge list; later cells read its 'forward_from' and 'forward_to' columns.
first_edgelist = pd.read_csv('../first_edgelist/first_edgelist.csv')

Let's start building the dictionaries id_to_names and id_to_desc.
We start from the 'forward_to' column of the first edgelist (it contains only Azione Incel and FDB).

In [4]:
# Load the id -> name and id -> description dictionaries from disk.
with open('../dictionaires/id_to_names.json', 'r') as names_file:
    id_to_names = json.load(names_file)

with open('../dictionaires/id_to_desc.json', 'r') as desc_file:
    id_to_desc = json.load(desc_file)

In [5]:
# JSON object keys are loaded as strings; turn them back into integer channel ids.
id_to_names = {int(key): value for key, value in id_to_names.items()}
id_to_desc = {int(key): value for key, value in id_to_desc.items()}

In [6]:
# Keep only the ids found in 'forward_from' that are NOT in 'forward_to':
# every 'forward_to' channel has already been processed.
# np.setdiff1d returns the sorted, de-duplicated difference.
unique_values = np.setdiff1d(first_edgelist['forward_from'], first_edgelist['forward_to'])

In [7]:
# Inspect the channel ids selected for scraping.
unique_values

array([1049507817, 1049610340, 1114912074, 1187620806, 1190986829,
       1215292658, 1223843057, 1235541026, 1301711481, 1301962344,
       1309409559, 1317214607, 1323992627, 1332218554, 1366782055,
       1378694795, 1392759909, 1400689598, 1422594647, 1422646676,
       1550242102, 1564012790, 1572171702, 1603259782, 1606254367,
       1606625424, 1614269632, 1646446376, 1655979710, 1668678283,
       1700352017, 1724328030, 1731141767, 1768292617, 1772238161,
       1783986700, 1804905770, 1850631337, 1862724187, 1910155003,
       1958703743, 1969251461, 1970256187, 1973090987, 1997883763,
       2083183611, 2085605636])

In [14]:
# Inspect the current id -> description dictionary.
id_to_desc

{1832870539: '',
 1259484976: 'Canale di informazione antifascista - Combatti la paura, distruggi il fascismo!\nPotete mandarci segnalazioni o contributi a: \n✨https://t.me/AzioneAntifaRmEst_Bot'}

In [15]:
import datetime

In [16]:
# Date limit handed to get_data_channel for every channel in the scraping loop.
offset_date = datetime.datetime(2023,10,1)

In [17]:
def get_channels_forward(lista_messaggi, receving_id):
    """
    Extract the channel-to-channel forward edges from a list of messages.

    lista_messaggi: iterable of message objects exposing ``to_dict()``
    receving_id: id of the scraped group, i.e. the "forwarded_to" side of every edge

    Returns a list of ``(forwarded_from, forwarded_to)`` tuples, one per message
    whose forward header points at a channel; ``forwarded_to`` is always
    ``receving_id``. Messages without a forward header, or forwarded from
    anything other than a channel, are ignored.
    """
    edges = []
    for message in lista_messaggi:
        header = message.to_dict().get('fwd_from')
        if header is None:
            # Not a forwarded message (key missing or explicitly None).
            continue

        origin = header['from_id']
        # Only a dict tagged 'PeerChannel' identifies a source channel.
        if type(origin) is not dict or origin['_'] != 'PeerChannel':
            continue

        edges.append((origin['channel_id'], receving_id))

    return edges
        

In [18]:
async def get_data_channel(channel_id, offset_date, names_dict, desc_dict):
    """
    Download the messages of one channel that were sent after ``offset_date``.

    A Telegram session is opened with the module-level ``username``, ``api_id``
    and ``api_hash``. The channel title and bio are written into the two
    dictionaries passed in (both are modified in place).

    args:

    channel_id: id of the channel to scrape (must be a plain integer)
    offset_date: date limit; only messages newer than it are requested
    names_dict: dict id -> channel title, updated with this channel
    desc_dict: dict id -> channel bio, updated with this channel

    returns the list of messages produced by ``client.get_messages``
    """
    async with TelegramClient(username, api_id, api_hash) as client:
        channel = await client.get_entity(PeerChannel(channel_id))
        channel_full = await client(GetFullChannelRequest(channel=channel))

        # Title and bio are recorded only once both lookups have succeeded.
        names_dict[channel_id] = channel.title
        desc_dict[channel_id] = channel_full.full_chat.about

        # reverse=True walks from oldest to newest, starting after offset_date;
        # limit=None removes the cap on the number of messages.
        messages = await client.get_messages(
            channel, reverse=True, offset_date=offset_date, limit=None
        )

    return messages

In [19]:
# Resume from a previous partial run when checkpoint pickles exist, otherwise start fresh.
# NOTE(review): pickle.load can execute arbitrary code -- only load checkpoints
# written by this project.
# NOTE(review): the edge-list checkpoint file is named "third" although this notebook
# builds the second edge list -- confirm this is the intended file.
if glob.glob('../temporary_data/*.pickle'):
    with open('../temporary_data/temporary_third_edgelist.pickle','rb') as file_edgelist,\
        open('../temporary_data/temporary_id_to_names.pickle','rb') as file_id_names,\
        open('../temporary_data/temporary_id_to_desc.pickle','rb') as file_id_desc,\
        open('../temporary_data/temporary_index.pickle','rb') as index_file:

            # Bind the restored edges to second_edgelist_list: that is the name the
            # scraping loop extends and the later cells read. Loading into any other
            # name leaves it undefined after a resume.
            second_edgelist_list = pickle.load(file_edgelist)
            id_to_names = pickle.load(file_id_names)
            id_to_desc = pickle.load(file_id_desc)
            index = pickle.load(index_file)

else:
    # No checkpoint: empty edge list, start from the first channel.
    second_edgelist_list = []
    index = 0

In [20]:
# Show the position in unique_values from which the scraping loop will start.
index

0

In [22]:
start_value = index
final_value = len(unique_values)
for index in tqdm.tqdm(range(start_value,final_value)):
    codice = unique_values[index]
    try:
        list_messages = await get_data_channel(int(codice), offset_date, id_to_names, id_to_desc)
        list_tuples = get_channels_forward(list_messages, codice)
        second_edgelist_list.extend(list_tuples)
    except(ChannelPrivateError, ValueError) as e:
        print(e)

100%|██████████| 47/47 [08:51<00:00, 11.32s/it]


In [23]:
# Split the (forward_from, forward_to) tuples into two parallel lists.
forward_from = [edge[0] for edge in second_edgelist_list]
forward_to = [edge[1] for edge in second_edgelist_list]

In [24]:
# Raw edge table: one row per collected (forward_from, forward_to) tuple, before aggregation.
second_edgelist_df_raw = pd.DataFrame(
    {'forward_from': forward_from, 'forward_to': forward_to}
)

In [27]:
# Collapse duplicate edges into one row per (forward_from, forward_to) pair with its
# number of occurrences, then save the weighted edge list.
edge_counts = second_edgelist_df_raw.value_counts()
second_edgelist = edge_counts.reset_index()
second_edgelist.to_csv('../second_edgelist/second_edgelist.csv', index=False)

In [28]:
# JSON serialisation needs plain int keys (not numpy int64), so cast them before dumping.
id_to_names_int = {int(key): value for key, value in id_to_names.items()}
id_to_desc_int = {int(key): value for key, value in id_to_desc.items()}

In [29]:
# Write both dictionaries back to disk, overwriting the previous JSON files.
with open('../dictionaires/id_to_names.json','w') as names_out, \
    open('../dictionaires/id_to_desc.json','w') as desc_out:
    json.dump(id_to_desc_int, desc_out)
    json.dump(id_to_names_int, names_out)