# Load, Sample and Add english translation of messaged

In [2]:
import pandas as pd
import numpy as np
import json
# from deep_translator import GoogleTranslator

SAMPLE_SIZE = 100

# def translate(x):
#     try:
#         tranlsation = GoogleTranslator(source='auto', target='en').translate(x)
#     except:
#         tranlsation = ""
#     return tranlsation

def total_interactions(x):
    reactions_total = 0
    for _, value in json.loads(x.replace("'", '"')).items():
        reactions_total += value
    return reactions_total

def weighted_popularity_score(row):
    return np.mean([row.views, row.forwards, row.reaction_nb])

def min_max_normalization(col, df):
    return (df[col]-df[col].min())/(df[col].max()-df[col].min())


df = pd.read_csv('../../data/telegram/telegram.csv')
df['reaction_nb'] = df.reactions.apply(lambda x: total_interactions(x))

df["views_norm"] = min_max_normalization("views", df)
df["forwards_norm"] = min_max_normalization("forwards", df)
df["reaction_nb_norm"] = min_max_normalization("reaction_nb", df)

df['popularity_score'] = df.apply(lambda x: weighted_popularity_score(x), axis=1)
df = df.sort_values('popularity_score', ascending=False)

sampled = df.iloc[:SAMPLE_SIZE]
# sampled['englishMessageText'] = sampled.messageText.apply(lambda x: translate(x))
# sampled.to_csv('../../data/telegram/sample_with_translations.csv', index=False)

In [None]:
df.messageDatetime.min()

In [None]:
sampled[sampled.messageText.str.contains('Какой')]

In [None]:
sampled.chat.value_counts()

In [None]:
sampled.shape

In [None]:
sampled[['chat', 'messageText']].to_dict()

# Request propaganda detection for sample messages

In [None]:
import asyncio
import websockets
import json
import time
from tqdm import tqdm

async def connect_to_websocket(dataframe, client_id, message, progress_bar, semaphore):
    # Use a semaphore to limit the number of parallel connections
    async with semaphore:
        try:
            async with websockets.connect(
                "ws://13.48.71.178:8000/ws/analyze_propaganda", 
                ping_interval=None  # No timeout on ping
            ) as websocket:
                # Define the request data
                request_data = {
                    "model_name": "gpt-4o",  # Example model
                    "text": message,
                    "contextualize": "True"
                }

                # Send the request data
                await websocket.send(json.dumps(request_data))
                print(f"Client {client_id}: Request sent.")

                # Keep receiving messages until the server closes the connection
                while True:
                    try:
                        response = await websocket.recv()  # Wait for the next message from the server
                        print(f"Client {client_id}: Response received:\n{response}")
                        dataframe.at[client_id, "response"] = response  # Save response in dataframe
                    except websockets.ConnectionClosedOK:
                        # Server closed the connection gracefully
                        print(f"Client {client_id}: Connection closed OK.")
                        break
                    except websockets.ConnectionClosedError:
                        # Server closed with an error
                        print(f"Client {client_id}: Connection closed with an error.")
                        break
                    except Exception as e:
                        print(f"Client {client_id}: An error occurred: {e}")
                        break

        except Exception as e:
            print(f"Client {client_id}: An error occurred: {e}")
        finally:
            # Update the progress bar when a client request is complete
            progress_bar.update(1)

async def simulate_multiple_clients(dataframe, parallel_connections):
    tasks = []
    
    # Create a semaphore to limit parallel connections to 'parallel_connections'
    semaphore = asyncio.Semaphore(parallel_connections)
    
    # Create a tqdm progress bar for tracking finished tasks
    with tqdm(total=len(dataframe), desc="Finished Requests") as progress_bar:
        for i, row in enumerate(dataframe.itertuples()):
            tasks.append(connect_to_websocket(dataframe, i, row.messageText, progress_bar, semaphore))
        
        # Wait for all tasks to complete (i.e., wait for WebSocket closure for each client)
        await asyncio.gather(*tasks)

# Example dataframe
# Replace 'sampled' with your actual dataframe
parallel_connections = 5  # You can change this value to control how many requests run in parallel
await simulate_multiple_clients(sampled, parallel_connections)

# Save the dataframe after processing
sampled.to_csv('../../data/telegram/best_100_detection.csv', index=False)


In [None]:
df

In [None]:
def unfold_results(x):
    obj = json.loads(x['response'])
    for key, value in obj['data'].items():
        x[key] = 1

    return x

new = sampled.apply(lambda x: unfold_results(x), axis=1)
new.fillna(0, inplace=True)
new['is_propaganda'] = new[['Appeal_to_Authority', 'Appeal_to_fear-prejudice',
       'Bandwagon, Reductio_ad_hitlerum', 'Black-and-White_Fallacy',
       'Causal_Oversimplification', 'Doubt', 'Exaggeration, Minimization',
       'Flag-Waving', 'Loaded_Language', 'Name_Calling, Labeling',
       'Repetition', 'Slogans', 'Whataboutism, Straw_Men, Red_Herring']].sum(axis=1)
new['is_propaganda'] = new['is_propaganda']>0
print(new['is_propaganda'].value_counts())
new.to_csv('../../data/telegram/unfolded.csv')

# Telethon

In [None]:
# Import libraries
import os
from dotenv import load_dotenv
from telethon import TelegramClient
from telethon.errors import SessionPasswordNeededError
from tqdm.notebook import tqdm
import getpass
from IPython.display import clear_output
import asyncio
from telethon.tl.types import Message
import inspect

# Option 1: Load environment variables from .env file
load_dotenv()
TELEGRAM_API_ID = os.getenv("TELEGRAM_API_ID")
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH")

# Option 2: Directly set API credentials (Uncomment if not using .env)
# TELEGRAM_API_ID = 'your_api_id'
# TELEGRAM_API_HASH = 'your_api_hash'

# Initialize the Telegram client
client = TelegramClient('simple_session', TELEGRAM_API_ID, TELEGRAM_API_HASH)

# Authentication function
async def authenticate_client():
    await client.connect()
    
    if not await client.is_user_authorized():
        phone = input("Enter your phone number (with country code, e.g., +123456789): ")
        await client.send_code_request(phone)
        code = input("Enter the code you received: ")
        try:
            await client.sign_in(phone, code)
        except SessionPasswordNeededError:
            password = getpass.getpass("Two-step verification enabled. Enter your password: ")
            await client.sign_in(password=password)
        clear_output()
        print("Authentication successful!")
    else:
        print("Client is already authorized.")

# Run authentication
await authenticate_client()

# Function to print message attributes
def print_message_attributes(chat, limit=10):
    """
    Fetches and prints all attributes of the first `limit` messages from a specified chat.

    :param chat: The username or ID of the Telegram chat to scrape.
    :param limit: Number of messages to retrieve.
    """
    async def fetch_messages():
        print(f'\nScraping the first {limit} messages from chat: {chat}\n')
        try:
            # Fetch messages asynchronously
            messages = await client.iter_messages(chat, limit=limit).to_list()
            
            for idx, message in enumerate(messages, start=1):
                print(f"--- Message {idx} ---")
                # Get all attributes of the message
                attributes = {attr: getattr(message, attr) for attr in dir(message) 
                              if not attr.startswith('_') and not inspect.ismethod(getattr(message, attr))}
                for key, value in attributes.items():
                    print(f"{key}: {value}")
                print("\n")
        except Exception as e:
            print(f"An error occurred: {e}")

    # Run the asynchronous fetch
    asyncio.run(fetch_messages())

# Specify the chat to scrape
chat_to_scrape = 'https://t.me/opersvodki'  # Replace with your target chat https://t.me/medvedev_telegram

# Fetch and print message attributes
print_message_attributes(chat_to_scrape, limit=10)


In [None]:
from telethon.sync import TelegramClient
import os

# Your API ID and API Hash from my.telegram.org
TELEGRAM_API_ID = os.getenv("TELEGRAM_API_ID")
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH")
# Peer ID of the Telegram channel
peer_id = '1315735637'

# Connect to the Telegram client
client = TelegramClient('session_name', TELEGRAM_API_ID, TELEGRAM_API_HASH)

entity = client.get_entity(peer_id)

    # Print the channel name
print('Channel Name:', entity.title)


In [None]:
import os
import asyncio
from telethon import TelegramClient
from telethon.tl.types import PeerChannel

# Fetch API credentials from environment variables and convert API_ID to integer
TELEGRAM_API_ID = int(os.getenv("TELEGRAM_API_ID"))
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH")

# Peer ID of the Telegram channel (ensure it's an integer)
peer_id = 1315735637  # Remove quotes to make it an integer

async def main():
    # Initialize the Telegram client with a unique session name
    async with TelegramClient('unique_session_name', TELEGRAM_API_ID, TELEGRAM_API_HASH) as client:
        try:
            # Retrieve the entity using the peer ID
            entity = await client.get_entity(PeerChannel(peer_id))
            
            # Print the channel name
            print('Channel Name:', entity.title)
        except ValueError:
            print('Invalid peer ID or entity not found')
        except Exception as e:
            print(f'An error occurred: {e}')

# Run the asynchronous main function
if __name__ == "__main__":
    asyncio.run(main())


In [None]:
import os
import asyncio
import logging
from telethon import TelegramClient
from telethon.tl.types import PeerChannel

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Set to DEBUG for more detailed logs
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Fetch API credentials from environment variables and convert API_ID to integer
TELEGRAM_API_ID = os.getenv("TELEGRAM_API_ID")
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH")

# Validate API credentials
if not TELEGRAM_API_ID or not TELEGRAM_API_HASH:
    logger.error("TELEGRAM_API_ID and TELEGRAM_API_HASH must be set as environment variables.")
    exit(1)

try:
    TELEGRAM_API_ID = int(TELEGRAM_API_ID)
except ValueError:
    logger.error("TELEGRAM_API_ID must be an integer.")
    exit(1)

# Peer ID of the Telegram channel (ensure it's an integer)
peer_id = 1315735637  # Ensure this is the correct integer ID

async def main():
    # Initialize the Telegram client with a unique session name
    try:
        async with TelegramClient('unique_session_name', TELEGRAM_API_ID, TELEGRAM_API_HASH) as client:
            logger.info("Client started successfully.")

            try:
                # Retrieve the entity using the peer ID
                entity = await client.get_entity(PeerChannel(peer_id))
                
                # Print the channel name
                print('Channel Name:', entity.title)
                logger.info(f'Channel Name: {entity.title}')
            except ValueError:
                logger.error('Invalid peer ID or entity not found.')
            except Exception as e:
                logger.exception(f'An unexpected error occurred: {e}')

    except Exception as e:
        logger.exception(f'Failed to initialize TelegramClient: {e}')

# Run the asynchronous main function
if __name__ == "__main__":
    asyncio.run(main())


In [None]:
async with TelegramClient('SessionName', TELEGRAM_API_ID, TELEGRAM_API_HASH) as client:
    # Fetch the entity using the peer ID
    entity = await client.get_entity(PeerChannel(1391419522))
    
    # Print the channel name
    print('Channel Name:', entity.title)

In [None]:
PeerChannel(1315735637)

# Analyse network between channels

In [3]:
import pandas as pd

df = pd.read_csv('../../data/telegram/messages_scraped.csv')
print("size df total", df.shape)
# Convert 'messageDate' column to datetime format
df['messageDate'] = pd.to_datetime(df['messageDate'])

# Filter rows with 'messageDate' after 2022
df = df[df['messageDate'] > '2022-01-01']
df['reaction_nb'] = df.reactions.apply(lambda x: total_interactions(x))

df["views_norm"] = min_max_normalization("views", df)
df["forwards_norm"] = min_max_normalization("forwards", df)
df["reaction_nb_norm"] = min_max_normalization("reaction_nb", df)

df['popularity_score'] = df.apply(lambda x: weighted_popularity_score(x), axis=1)
# df = df.sort_values('popularity_score', ascending=False)
print("size df after 2022", df.shape)

  df = pd.read_csv('../../data/telegram/messages_scraped.csv')


size df total (1623145, 39)
size df after 2022 (1275438, 44)


In [21]:
peer_id_to_chat = df[['peer_id', 'chat']].drop_duplicates().set_index('peer_id')['chat'].to_dict()
df['peer_id_name'] = df['peer_id'].map(peer_id_to_chat).fillna('unknown')
df['peer_id_name_fwd_from'] = df['fwd_from'].map(peer_id_to_chat).fillna('unknown')
df_fwd_known = df[df.peer_id_name_fwd_from != 'unknown']
df_fwd_known.sort_values('popularity_score', ascending=False, inplace=True)
print("size df with fwd_from known", df_fwd_known.shape)


size df with fwd_from known (23503, 46)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fwd_known.sort_values('popularity_score', ascending=False, inplace=True)


In [23]:
fwd_25 = df_fwd_known.messageText[:2].values

In [25]:
df_fwd_known[df_fwd_known.messageText.isin(fwd_25)]

2

In [29]:
df_25[["fwd_from", "messageText", "peer_id_name_fwd_from", "peer_id_name"]]

Unnamed: 0,fwd_from,messageText,peer_id_name_fwd_from,peer_id_name
10,,Об американских стратегах\n \n1. У Америки ...,unknown,https://t.me/medvedev_telegram
411306,1572749000.0,Об американских стратегах\n \n1. У Америки ...,https://t.me/medvedev_telegram,https://t.me/OpenUkraine
640045,1572749000.0,Об американских стратегах\n \n1. У Америки ...,https://t.me/medvedev_telegram,https://t.me/vzglyad_ru
902760,1355541000.0,‼️💥Чудовищный взрыв в ЛНР: Взрывную волну ощут...,https://t.me/RVvoenkor,https://t.me/SolovievLive
902761,1572749000.0,Об американских стратегах\n \n1. У Америки ...,https://t.me/medvedev_telegram,https://t.me/SolovievLive


In [24]:
# filter df based on the top 25 messages
df_25 = df[df.messageText.isin(fwd_25)]
print("size df_25", df_25.shape)

size df_25 (5, 46)


In [None]:
import pandas as pd
import networkx as nx
from pyvis.network import Network
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Define the number of messages to observe
MESSAGE_OBSERVED = 500

# Step 3: Filter the DataFrame to include only the top MESSAGE_OBSERVED messages by 'forwards'
df_max_forwards = df_25.sort_values('forwards', ascending=False).iloc[:MESSAGE_OBSERVED]

# Step 1: Create the peer_id_to_chat dictionary
peer_id_to_chat = df[['peer_id', 'chat']].drop_duplicates().set_index('peer_id')['chat'].to_dict()

# Step 2: Assign unique colors to each unique chat
unique_chats = set(peer_id_to_chat.values())
# Do not discard "unknown name" to handle it separately

# Choose a colormap
cmap = plt.get_cmap('tab20')  # 'tab20' has 20 distinct colors
num_colors = len(unique_chats)

# If number of unique chats exceeds the colormap, use another colormap or extend the existing one
if num_colors > cmap.N:
    cmap = plt.get_cmap('hsv')  # 'hsv' can provide a wide range of colors
    colors = [mcolors.rgb2hex(cmap(i / num_colors)) for i in range(num_colors)]
else:
    colors = [mcolors.rgb2hex(cmap(i)) for i in range(num_colors)]

# Create a dictionary mapping chat names to colors
chat_to_color = {chat: colors[i] for i, chat in enumerate(unique_chats)}

# Assign a default color for nodes with unknown names
default_node_color = '#C0C0C0'  # Silver



# Step 4: Further filter to include only forwarded messages
forwarded_messages = df_max_forwards[df_max_forwards['fwd_from'].notna()]

# Step 5: Initialize a directed graph
G = nx.DiGraph()

# Step 6: Iterate over each forwarded message to add edges to the graph
for index, row in forwarded_messages.iterrows():
    source_peer_id = row['fwd_from']
    target_peer_id = row['peer_id']
    message_id = row['id']
    message_text = row['messageText'] if pd.notna(row['messageText']) else ''
    message_date = row['messageDate']
    
    # Get chat names or use 'unknown name'
    source_chat = peer_id_to_chat.get(source_peer_id, "unknown name")
    target_chat = peer_id_to_chat.get(target_peer_id, "unknown name")
    
    # Use peer_id as unique identifier (converted to string)
    source_node = str(source_peer_id)
    target_node = str(target_peer_id)
    
    # Determine node colors based on chat names
    source_color = chat_to_color.get(source_chat, default_node_color)
    target_color = chat_to_color.get(target_chat, default_node_color)
    
    # Add nodes with label as chat names and assigned colors
    if not G.has_node(source_node):
        G.add_node(source_node, label=source_chat, color=source_color)
    if not G.has_node(target_node):
        G.add_node(target_node, label=target_chat, color=target_color)
    
    # Add an edge from source to target with message details as edge attributes
    G.add_edge(
        source_node,
        target_node,
        message_id=message_id,
        text=message_text,
        date=message_date
    )

# Step 7: Create a PyVis Network
net = Network(height='750px', width='100%', notebook=True, directed=True)

# Customize the physics layout (optional for better visualization)
net.force_atlas_2based()

# Add nodes with labels and colors
for node, data in G.nodes(data=True):
    label = data.get('label', 'unknown name')
    color = data.get('color', default_node_color)
    net.add_node(
        node,
        label=label,
        title=label,  # Tooltip on hover
        color=color,  # Node color based on chat
        size=15
    )

# Add edges with tooltips
for source, target, data in G.edges(data=True):
    message_id = data.get('message_id', '')
    text = data.get('text', '')
    date = data.get('date', '')
    tooltip = f"<b>Message ID:</b> {message_id}<br><b>Date:</b> {date}<br><b>Text:</b> {text}"
    
    net.add_edge(
        source,
        target,
        title=tooltip  # Tooltip on hover
        # No color assigned for edges
    )

# Optionally, enable additional features like showing physics controls
net.show_buttons(filter_=['physics'])

# Generate and save the interactive graph to an HTML file
net.show(f'interactive_graph.html')
