In [1]:
import pandas as pd
import numpy as np
from telethon.sync import TelegramClient
from telethon.tl.types import PeerUser, PeerChat, PeerChannel, InputMessagesFilterPhotoVideo
from telethon.tl.functions.channels import GetFullChannelRequest
import json
import re
from telethon.errors import ChannelPrivateError
import tqdm
import glob
import pickle

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the Telegram API credentials from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this file must
# only ever come from a trusted source.
# `username` is later passed as the first argument to TelegramClient; `phone` is unpacked
# but not used in the visible part of this notebook.
with open('../credentials/credentials.pickle','rb') as file:
    api_id, api_hash, phone, username = pickle.load(file)

In [3]:
# Edge list read from disk; presumably the output of the first scraping iteration — confirm.
first_edgelist = pd.read_csv('../first_edgelist/first_edgelist.csv')

In [4]:
# Edge list read from disk; its 'forward_from' column is used to pick the channels scraped
# in this notebook. Presumably the output of the second scraping iteration — confirm.
second_edgelist = pd.read_csv('../second_edgelist/second_edgelist.csv')

In [5]:
# Stack both previous edge lists into a single frame. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating the row labels of the two inputs, so label-based
# lookups on edgelist_total cannot silently match two rows.
edgelist_total = pd.concat([first_edgelist, second_edgelist], ignore_index=True)

In [6]:
# Channels to scrape in this iteration: ids that appear as a forward source in the second
# edge list but never as a 'forward_to' in either previous edge list, because every
# 'forward_to' channel has already been scraped.
# np.setdiff1d returns sorted, de-duplicated values, so each channel id appears once.
unique_values = np.setdiff1d(second_edgelist['forward_from'], edgelist_total['forward_to'])

In [7]:
import datetime

In [8]:
# Date limit handed to get_data_channel as `offset_date` in the scraping loop.
# NOTE(review): this is a naive datetime (no tzinfo) — confirm which timezone the API assumes.
offset_date = datetime.datetime(2023,10,1)

In [9]:
def get_channels_forward(lista_messaggi, receving_id):
    """
    Extract channel-to-channel forward edges from a list of messages.

    lista_messaggi: iterable of message objects exposing a ``to_dict()`` method
        (e.g. Telethon messages)
    receving_id: id of the scraped channel, i.e. the ``forwarded_to`` side of every edge

    returns a list of tuples ``(forwarded_from, forwarded_to)`` where ``forwarded_to`` is
    always ``receving_id``. Only messages whose forward header points to a channel
    (``from_id['_'] == 'PeerChannel'``) produce an edge; messages that are not forwards,
    or that were forwarded from users/hidden senders, are skipped.
    """
    lista_fwd = []
    for mex in lista_messaggi:
        # Service messages have no 'fwd_from' key and regular messages carry None there;
        # .get() plus the isinstance check covers both without raising.
        fwd_header = mex.to_dict().get('fwd_from')
        if not isinstance(fwd_header, dict):
            continue

        # 'from_id' is None (or absent) when the original sender is hidden.
        fwd_info = fwd_header.get('from_id')
        if isinstance(fwd_info, dict) and fwd_info.get('_') == 'PeerChannel':
            lista_fwd.append((fwd_info['channel_id'], receving_id))

    return lista_fwd
        

Load the existing id-to-name and id-to-description dictionaries so we can update them

In [10]:
# Read the channel-id -> title mapping saved by earlier runs.
with open('../dictionaires/id_to_names.json', 'r') as names_fh:
    id_to_names = json.loads(names_fh.read())

# Read the channel-id -> description mapping saved by earlier runs.
with open('../dictionaires/id_to_desc.json', 'r') as desc_fh:
    id_to_desc = json.loads(desc_fh.read())

In [12]:
# JSON object keys are always strings, so turn them back into integer channel ids.
id_to_names = {int(channel_key): title for channel_key, title in id_to_names.items()}
id_to_desc = {int(channel_key): about for channel_key, about in id_to_desc.items()}

In [15]:
async def get_data_channel(channel_id, offset_date, names_dict, desc_dict):
    """
    Fetch the messages of one channel that are NEWER than ``offset_date``
    (i.e. sent after it) and record the channel's title and bio.

    A new TelegramClient session is opened for the call, using the module-level
    ``username``, ``api_id`` and ``api_hash``, and closed again before returning.

    args:

    channel_id: id of the channel to scrape (has to be an integer!)
    offset_date: date limit
    names_dict: id-to-name dict; updated in place with this channel's title
    desc_dict: id-to-bio dict; updated in place with this channel's description

    returns the list of patched messages
    """
    async with TelegramClient(username, api_id, api_hash) as tg_client:
        channel = await tg_client.get_entity(PeerChannel(channel_id))
        channel_full = await tg_client(GetFullChannelRequest(channel=channel))

        # Metadata is stored before the messages are downloaded.
        names_dict[channel_id] = channel.title  # channel name
        desc_dict[channel_id] = channel_full.full_chat.about  # channel bio

        # reverse=True together with offset_date walks forward in time from the date limit.
        return await tg_client.get_messages(channel, reverse=True, offset_date=offset_date, limit=None)

# Check whether there are temporary files left over from an interrupted run:

In [16]:
# Resume from a previous, interrupted run when checkpoint pickles are present;
# otherwise start from scratch.
if not glob.glob('../temporary_data/*.pickle'):
    # No checkpoint: the dictionaries loaded in the earlier cells stay as they are.
    third_edgelist_list = []
    index = 0
else:
    # NOTE(review): pickle.load must only be used on files written by this notebook.
    with open('../temporary_data/temporary_third_edgelist.pickle','rb') as edges_fh,\
        open('../temporary_data/temporary_id_to_names.pickle','rb') as names_fh,\
        open('../temporary_data/temporary_id_to_desc.pickle','rb') as desc_fh,\
        open('../temporary_data/temporary_index.pickle','rb') as index_fh:

        third_edgelist_list = pickle.load(edges_fh)  # edges collected so far
        id_to_names = pickle.load(names_fh)          # replaces the JSON-loaded dict
        id_to_desc = pickle.load(desc_fh)            # replaces the JSON-loaded dict
        index = pickle.load(index_fh)                # position in unique_values to restart from


In [18]:
start_value = index
final_value = len(unique_values)
for index in tqdm.tqdm(range(start_value, final_value)):
    codice = unique_values[index]
    try:
        list_messages = await get_data_channel(int(codice), offset_date, id_to_names, id_to_desc)
        list_tuples = get_channels_forward(list_messages, codice)
        third_edgelist_list.extend(list_tuples)
    except(ChannelPrivateError, ValueError) as e:
        print(e)

 19%|█▊        | 45/243 [28:02<40:44, 12.35s/it]   

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 40%|███▉      | 96/243 [44:11<28:11, 11.51s/it]  

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 42%|████▏     | 101/243 [45:12<16:28,  6.96s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 59%|█████▉    | 144/243 [56:47<18:28, 11.20s/it]  

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 65%|██████▌   | 158/243 [1:00:27<18:40, 13.18s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 70%|██████▉   | 169/243 [1:01:54<09:28,  7.69s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 75%|███████▌  | 183/243 [1:10:23<34:47, 34.79s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 76%|███████▌  | 184/243 [1:10:24<24:00, 24.42s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 88%|████████▊ | 213/243 [1:20:11<09:46, 19.56s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 91%|█████████ | 221/243 [1:21:48<04:33, 12.45s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


 96%|█████████▌| 233/243 [1:22:47<00:34,  3.41s/it]

The channel specified is private and you lack permission to access it. Another reason may be that you were banned from it (caused by GetChannelsRequest)


100%|██████████| 243/243 [1:24:41<00:00, 20.91s/it]


<details> 
<summary>code to execute if we interrupt the iteration</summary>


``` python
with open('../temporary_data/temporary_third_edgelist.pickle','wb') as file_edgelist,\
    open('../temporary_data/temporary_id_to_names.pickle','wb') as file_id_names,\
        open('../temporary_data/temporary_id_to_desc.pickle','wb') as file_id_desc,\
        open('../temporary_data/temporary_index.pickle','wb') as index_file:

        pickle.dump(third_edgelist_list, file_edgelist)
        pickle.dump(id_to_names, file_id_names)
        pickle.dump(id_to_desc, file_id_desc)
        pickle.dump(index, index_file)
```

</details>


In [86]:
# Write the checkpoint: collected edges, both dictionaries and the current loop position,
# each into its own pickle file under ../temporary_data/.
with open('../temporary_data/temporary_third_edgelist.pickle','wb') as edges_fh,\
    open('../temporary_data/temporary_id_to_names.pickle','wb') as names_fh,\
    open('../temporary_data/temporary_id_to_desc.pickle','wb') as desc_fh,\
    open('../temporary_data/temporary_index.pickle','wb') as index_fh:

    checkpoint_parts = (
        (third_edgelist_list, edges_fh),
        (id_to_names, names_fh),
        (id_to_desc, desc_fh),
        (index, index_fh),
    )
    for payload, target_fh in checkpoint_parts:
        pickle.dump(payload, target_fh)

In [19]:
# Split the (forward_from, forward_to) tuples into two parallel lists.
forward_from = [edge[0] for edge in third_edgelist_list]
forward_to = [edge[1] for edge in third_edgelist_list]

In [20]:
# One row per forwarded message: source channel and receiving channel.
third_edgelist_df_raw = pd.DataFrame(
    dict(forward_from=forward_from, forward_to=forward_to)
)

In [21]:
# Count how many times each (forward_from, forward_to) pair occurs; the count is the edge weight.
# Naming the counts column explicitly keeps it 'count' on every pandas version
# (before pandas 2.0 the value_counts result is unnamed and the column would be called 0).
third_edgelist = third_edgelist_df_raw.value_counts().reset_index(name='count')

In [22]:
# Display the aggregated edge list (the rendered output elides the middle rows).
third_edgelist

Unnamed: 0,forward_from,forward_to,count
0,1235779695,1235779695,2098
1,1840238787,1235779695,1116
2,1293223450,1006487902,890
3,1320598051,1235779695,880
4,1220318149,1300269873,877
...,...,...,...
8595,1472802395,1458480018,1
8596,1472802395,1283479526,1
8597,1472802395,1229525984,1
8598,1472802395,1150849367,1


In [23]:
# Persist the aggregated edge list; index=False keeps the row index out of the CSV.
third_edgelist.to_csv('../third_edgelist/third_edgelist.csv', index=False)

In [24]:
# Build copies of both dictionaries whose keys are guaranteed to be plain integers.
id_to_names_int = {int(channel_key): title for channel_key, title in id_to_names.items()}
id_to_desc_int = {int(channel_key): about for channel_key, about in id_to_desc.items()}

In [25]:
# Overwrite the JSON dictionaries on disk with the updated mappings.
with open('../dictionaires/id_to_names.json','w') as names_out, \
    open('../dictionaires/id_to_desc.json','w') as desc_out:
    for mapping, sink in ((id_to_names_int, names_out), (id_to_desc_int, desc_out)):
        json.dump(mapping, sink)