In [81]:
import pandas as pd
import numpy as np
from telethon.sync import TelegramClient
from telethon.tl.types import PeerUser, PeerChat, PeerChannel, InputMessagesFilterPhotoVideo
from telethon.tl.functions.channels import GetFullChannelRequest
import json
import re
from telethon.errors import ChannelPrivateError
import pickle

In [82]:
# Load the Telegram API credentials from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only use it on a file you created yourself.
with open('../credentials/credentials.pickle', 'rb') as credentials_file:
    credentials = pickle.load(credentials_file)
api_id, api_hash, phone, username = credentials

In [83]:
# Load the first-level edgelist; its 'forward_from' and 'forward_to' columns are used in the cells below.
first_edgelist = pd.read_csv('../first_edgelist/first_edgelist_final.csv')

Let's start building the dictionaries `id_to_names` and `id_to_description`.
We start from the 'forward_to' column of the first edgelist (it contains only Azione Incel and FDB).

In [84]:
# Lookup tables keyed by Telegram channel id: channel title and channel description.
id_to_names, id_to_description = {}, {}

In [85]:
#check on group codes
for channel_id in first_edgelist['forward_to'].unique():
    async with TelegramClient(username, api_id, api_hash) as client:
                    entity = await client.get_entity(PeerChannel(channel_id))
                    full_entity = await client(GetFullChannelRequest(channel=entity))
                    id_to_names[channel_id] = entity.title
                    id_to_description[channel_id] = full_entity.full_chat.about


In [86]:
# Display the channel titles collected so far.
id_to_names

{1767430683: 'Ex Azione Incel - 2° gruppo', 2084686607: 'F.D.B.'}

In [87]:
# Display the channel descriptions collected so far.
id_to_description

{1767430683: 'Chat ufficiale di Azione Incel, movimento socioculturale fondato il 19/07/2022\n\nIncel e redpill di tutta la nazione, unitevi per la rivoluzione! xd\n\n*NO PEDO E NO VIRUS\n*No porno e gore\n*No NP(donne)\n*No insulti,spam,flood\n*No messaggi troppo imbarazzanti',
 2084686607: 'ARCHIVIO: https://ilforumdeibrutti.forumfree.it/m/\nSITO: https://ilforumdeibrutti.is/'}

# Second edgelist!

In [88]:
# Channels to scrape next: every id that appears in 'forward_from' but NOT in 'forward_to'.
# Channels in 'forward_to' are left out because their messages have already been scraped.
unique_values = np.setdiff1d(
    first_edgelist['forward_from'],
    first_edgelist['forward_to'],
)

In [89]:
import datetime

In [91]:
# Cut-off date handed to get_data_channel: only messages sent after it are scraped.
offset_date = datetime.datetime(year=2023, month=10, day=1)

In [99]:
def get_channels_forward(lista_messaggi, receving_id):
    """
    Build the forward edges for the messages scraped from one group.

    lista_messaggi: iterable of Telethon messages (any object exposing ``to_dict()``)
    receving_id: id of the scraped group, i.e. the ``forwarded_to`` id

    Returns a list of tuples ``(forwarded_from, forwarded_to)`` where
    ``forwarded_to`` is always ``receving_id``. Only messages whose forward
    header has a ``from_id`` of kind ``PeerChannel`` produce an edge; messages
    that are not forwards, or whose origin is missing or not a channel, are
    skipped.
    """
    lista_fwd = []
    for mex in lista_messaggi:
        fwd_header = mex.to_dict().get('fwd_from')
        if not isinstance(fwd_header, dict):
            # Not a forwarded message (key absent or None).
            continue

        # .get(): a forward header without 'from_id' must not abort the whole scan.
        fwd_info = fwd_header.get('from_id')
        if isinstance(fwd_info, dict) and fwd_info.get('_') == 'PeerChannel':
            lista_fwd.append((fwd_info['channel_id'], receving_id))

    return lista_fwd
        

In [94]:
# Display both lookup tables (titles and descriptions).
id_to_names, id_to_description

({1767430683: 'Ex Azione Incel - 2° gruppo', 2084686607: 'F.D.B.'},
 {1767430683: 'Chat ufficiale di Azione Incel, movimento socioculturale fondato il 19/07/2022\n\nIncel e redpill di tutta la nazione, unitevi per la rivoluzione! xd\n\n*NO PEDO E NO VIRUS\n*No porno e gore\n*No NP(donne)\n*No insulti,spam,flood\n*No messaggi troppo imbarazzanti',
  2084686607: 'ARCHIVIO: https://ilforumdeibrutti.forumfree.it/m/\nSITO: https://ilforumdeibrutti.is/'})

In [95]:
async def get_data_channel(channel_id, offset_date, names_dict, desc_dict):
    """
    Download all messages of a channel that are NEWER than ``offset_date``
    (i.e. sent after it) and record the channel's title and bio.

    args:

    channel_id: id of the channel to scrape (has to be an integer!)
    offset_date: date limit; only messages sent after it are requested
    names_dict: id-to-names dict, updated in place with this channel's title
    desc_dict: id-to-bio dict, updated in place with this channel's description

    returns the list of messages fetched with ``client.get_messages``
    """
    # A dedicated client session is opened for this call and closed on exit.
    async with TelegramClient(username, api_id, api_hash) as client:
        peer = PeerChannel(channel_id)
        entity = await client.get_entity(peer)
        full_entity = await client(GetFullChannelRequest(channel=entity))

        # Record group name and group bio before downloading the history.
        names_dict[channel_id] = entity.title
        desc_dict[channel_id] = full_entity.full_chat.about

        # reverse=True flips the meaning of offset_date: messages from that
        # date onwards are returned instead of the ones before it.
        # limit=None removes the cap on the number of messages.
        lista_mex = await client.get_messages(
            entity,
            reverse=True,
            offset_date=offset_date,
            limit=None,
        )

    return lista_mex

In [None]:
second_edgelist_list = []
for codice in unique_values:
    try:
        list_messages = await get_data_channel(int(codice), offset_date, id_to_names, id_to_description)
        list_tuples = get_channels_forward(list_messages, codice)
        second_edgelist_list.extend(list_tuples)
    except(ChannelPrivateError, ValueError) as e:
        print(e)

    

In [102]:
# Split the (forward_from, forward_to) tuples into two parallel lists.
forward_from = [edge[0] for edge in second_edgelist_list]
forward_to = [edge[1] for edge in second_edgelist_list]

In [119]:
# Raw edgelist: one row per collected (forward_from, forward_to) pair.
second_edgelist_df_raw = pd.DataFrame(
    {
        'forward_from': forward_from,
        'forward_to': forward_to,
    }
)

In [118]:
# Count how many times each (forward_from, forward_to) pair occurs,
# then save the resulting edgelist.
edge_counts = second_edgelist_df_raw.value_counts()
second_edgelist = edge_counts.reset_index()
second_edgelist.to_csv('../second_edgelist/second_edgelist.csv', index=False)

In [132]:
# Keys must be built-in int (not NumPy int64) for the dicts to be dumped as JSON.
id_to_names_int = {int(channel_id): name for channel_id, name in id_to_names.items()}
id_to_desc_int = {int(channel_id): bio for channel_id, bio in id_to_description.items()}

In [135]:
# Save both lookup tables as JSON, one file each.
with open('../dictionaires/id_to_names.json', 'w') as names_out:
    with open('../dictionaires/id_to_desc.json', 'w') as desc_out:
        json.dump(id_to_names_int, names_out)
        json.dump(id_to_desc_int, desc_out)