In [15]:
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

class TrieNode:
    """Single trie node: lazily created children plus an end-of-sequence flag."""

    def __init__(self):
        # True once some inserted sequence terminates exactly at this node.
        self.is_end = False
        # Subscripting a missing key creates (and stores) a fresh child node.
        self.children = defaultdict(TrieNode)

class Trie:
    """Prefix tree whose paths are sequences of tweet ids."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, conversation):
        """Record *conversation*, creating any missing nodes along its path."""
        cursor = self.root
        for tweet_id in conversation:
            # Subscripting the children mapping creates the child on demand.
            cursor = cursor.children[tweet_id]
        cursor.is_end = True

    def is_subset(self, conversation):
        """Return True only if exactly this sequence was inserted before.

        The walk never creates nodes. A sequence that is merely a prefix of
        an inserted one yields False unless it was inserted itself.
        """
        cursor = self.root
        for tweet_id in conversation:
            # .get() avoids the auto-creation that subscripting would trigger.
            following = cursor.children.get(tweet_id)
            if following is None:
                return False
            cursor = following
        return cursor.is_end

def trace_conversation(start_tweet_id, tweet_dict):
    """Follow the reply chain upward from *start_tweet_id*.

    Args:
        start_tweet_id: Id of the tweet at which tracing begins.
        tweet_dict: Mapping of tweet id to a record providing at least the
            keys ``'user_id'`` and ``'replied_tweet_id'``.

    Returns:
        The visited tweet ids ordered oldest-first (the start tweet comes
        last) when exactly two distinct users wrote them, otherwise ``None``.
        ``None`` is returned as soon as a third user shows up in the chain.
    """
    chain = []
    participants = set()
    visited = set()  # guards against reply cycles
    current = start_tweet_id

    # The walk ends when the parent id is unknown or was already visited.
    # Dictionary membership is used instead of the truthiness of the id:
    # a missing parent (None, NaN, pandas.NA) is simply not a key, whereas
    # bool(pandas.NA) raises TypeError and a legitimate tweet id of 0 is falsy.
    while current in tweet_dict and current not in visited:
        record = tweet_dict[current]
        chain.append(current)
        visited.add(current)
        participants.add(record['user_id'])
        if len(participants) > 2:
            return None  # More than two users, not an exclusive conversation
        current = record['replied_tweet_id']

    if len(participants) != 2:
        return None
    chain.reverse()
    return chain

def extract_and_filter_conversations(df):
    """Return the maximal two-user reply chains contained in *df*.

    Args:
        df: DataFrame indexed by tweet id with at least the columns
            ``'user_id'`` and ``'replied_tweet_id'``. The frame is not
            modified; the dtype normalisation happens on a copy.

    Returns:
        A list of conversations, each a list of tweet ids as produced by
        ``trace_conversation``. Conversations whose tweet ids are all
        contained in another conversation are dropped. The list is sorted
        longest first.

    Raises:
        ValueError: Propagated from ``DataFrame.to_dict('index')`` when the
            tweet-id index is not unique.
    """
    # Normalise dtypes on a copy so the caller's frame is left untouched.
    frame = df.assign(replied_tweet_id=df['replied_tweet_id'].astype('Int64'))
    frame.index = frame.index.astype(int)  # Ensure tweet_id is treated as an integer

    tweet_dict = frame.to_dict('index')
    # The nullable Int64 column yields pandas.NA for root tweets. Its truth
    # value is undefined, so expose "no parent" as a plain None instead.
    for tweet_info in tweet_dict.values():
        if pd.isna(tweet_info['replied_tweet_id']):
            tweet_info['replied_tweet_id'] = None

    # Each entry pairs a conversation with the set of its tweet ids, so the
    # set is built once rather than once per comparison.
    kept = []

    # Start tracing conversations from tweets that are replies
    reply_ids = frame[frame['replied_tweet_id'].notnull()].index
    for tweet_id in tqdm(reply_ids, desc="Extracting all conversations"):
        conversation = trace_conversation(tweet_id, tweet_dict)
        if not conversation:
            continue
        members = set(conversation)
        # Already covered by a conversation at least as large: skip it.
        if any(members <= other_members for other_members, _ in kept):
            continue
        # Otherwise it supersedes every kept conversation it fully contains.
        kept = [
            (other_members, other)
            for other_members, other in kept
            if not other_members <= members
        ]
        kept.append((members, conversation))

    longest_conversations = [conversation for _, conversation in kept]
    longest_conversations.sort(key=len, reverse=True)

    return longest_conversations

# Define the test data
# Each case carries:
#   'name'     - label used in the pass/fail messages,
#   'data'     - columns used to build a DataFrame (indexed by 'tweet_id'),
#   'expected' - value compared for equality with the result of
#                extract_and_filter_conversations.
test_cases = [
    {
        'name': 'Simple Conversation Between User and Airline',
        'data': {
            'tweet_id': [1, 2, 3, 4],
            'user_id': ['airline', 'user_a', 'airline', 'user_a'],
            'replied_tweet_id': [None, 1, 2, 3]
        },
        'expected': [[1, 2, 3, 4]]
    },
    {
        'name': 'User-Initiated Conversation',
        'data': {
            'tweet_id': [1, 2, 3],
            'user_id': ['user_a', 'airline', 'user_a'],
            'replied_tweet_id': [None, 1, 2]
        },
        'expected': [[1, 2, 3]]
    },
    {
        # NOTE(review): the recorded run fails on this case with
        # "got [[1, 2]]". trace_conversation returns None once a third user
        # appears, so [2, 3, 4] is never produced; the result is also sorted
        # longest first. Confirm which behaviour is intended.
        'name': 'More Than Two Users Involved',
        'data': {
            'tweet_id': [1, 2, 3, 4],
            'user_id': ['user_a', 'airline', 'user_b', 'airline'],
            'replied_tweet_id': [None, 1, 2, 3]
        },
        'expected': [[1, 2], [2, 3, 4]]
    },
    {
        # NOTE(review): tweets 1-4 form a chain with only two users, which
        # the extractor appears to return as [[1, 2, 3, 4]]; the empty
        # expectation likely fails - confirm the intended branch handling.
        'name': 'Conversation Branches Out',
        'data': {
            'tweet_id': [1, 2, 3, 4, 5],
            'user_id': ['airline', 'user_a', 'airline', 'user_a', 'user_b'],
            'replied_tweet_id': [None, 1, 2, 3, 2]
        },
        'expected': []
    },
    {
        'name': 'Non-Reply Initial Tweet by Airline',
        'data': {
            'tweet_id': [1, 2],
            'user_id': ['airline', 'user_a'],
            'replied_tweet_id': [None, 1]
        },
        'expected': [[1, 2]]
    },
    {
        'name': 'Non-Reply Initial Tweet by User',
        'data': {
            'tweet_id': [1, 2],
            'user_id': ['user_a', 'airline'],
            'replied_tweet_id': [None, 1]
        },
        'expected': [[1, 2]]
    },
    {
        # NOTE(review): tweet_id 2 occurs twice, so the index is not unique;
        # to_dict('index') presumably rejects that with a ValueError -
        # TODO confirm before relying on this case.
        'name': 'Conversation with Duplicate Tweets',
        'data': {
            'tweet_id': [1, 2, 3, 2],
            'user_id': ['airline', 'user_a', 'airline', 'user_a'],
            'replied_tweet_id': [None, 1, 2, 3]
        },
        'expected': [[1, 2, 3, 2]]
    }
]

# Execute every scenario in order; the first mismatch aborts via AssertionError
for test_case in test_cases:
    # Build the frame in two steps: raw columns first, then index by tweet id.
    df = pd.DataFrame(test_case['data'])
    df = df.set_index('tweet_id')
    result = extract_and_filter_conversations(df)
    case_name, expected = test_case['name'], test_case['expected']
    assert result == expected, f"Test case '{case_name}' failed: expected {expected}, got {result}"
    print(f"Test case '{case_name}' passed.")


Extracting all conversations: 100%|██████████| 3/3 [00:00<?, ?it/s]


Test case 'Simple Conversation Between User and Airline' passed.


Extracting all conversations: 100%|██████████| 2/2 [00:00<?, ?it/s]


Test case 'User-Initiated Conversation' passed.


Extracting all conversations: 100%|██████████| 3/3 [00:00<?, ?it/s]


AssertionError: Test case 'More Than Two Users Involved' failed: expected [[1, 2], [2, 3, 4]], got [[1, 2]]