In [None]:
import pandas as pd

# Function to count words in a string
def count_words(string):
    """Return the number of whitespace-separated words in ``string``.

    Words are whatever ``str.split()`` yields, so runs of spaces, tabs and
    newlines all act as a single separator and an empty string has 0 words.

    Non-string input (for example ``None``, or the float NaN that pandas
    stores for an empty CSV cell) is counted as 0 words rather than raising
    ``AttributeError``.
    """
    if not isinstance(string, str):
        return 0
    return len(string.split())

# Load data
# Reads 'CMV_July_2022.csv' and uses its 'id' column as the row index, so the
# code below can test membership with `id in data.index` and fetch a row's
# fields with `data.loc[id, ...]`. The columns read later in this cell are
# 'parent_id', 'body' and 'is_submitter'.
data = pd.read_csv('CMV_July_2022.csv', index_col='id')

# Function to find the top-level comment of an entry
def find_top_level_comment(id):
    """Return the id at the head of the reply chain that contains ``id``.

    Starting from ``id``, follows ``parent_id`` links in the module-level
    ``data`` frame. The walk stops, and the current id is returned, as soon as
    either:

    * the current id is not present in ``data.index``, or
    * the current row's ``parent_id`` starts with ``'t3'`` (such a row is
      treated as a top-level comment).

    The walk is iterative so that very deep reply chains cannot exceed
    Python's recursion limit.

    Raises:
        ValueError: if the ``parent_id`` links loop back onto an id that was
            already visited, which would otherwise never terminate.
    """
    visited = set()
    while id in data.index:
        if id in visited:
            raise ValueError(f"cycle detected in parent_id chain at id {id!r}")
        visited.add(id)

        parent_id = data.loc[id, 'parent_id']
        if parent_id.startswith('t3'):
            # This row is itself the top-level comment.
            break

        # Otherwise climb one level and test the parent.
        id = parent_id
    return id

# Tag every row with the id of the top-level comment that heads its thread
data['top_level_comment_id'] = list(map(find_top_level_comment, data.index))

# Thread lookup: top-level comment id -> index labels of every row in that thread
top_level_comment_to_all_descendants = data.groupby('top_level_comment_id').groups

# Threads that contain at least one row flagged 'is_submitter'
top_level_comments = set(
    data.loc[data['is_submitter'] == True, 'top_level_comment_id']
)

# Ids of every row belonging to one of those threads
submitter_threads_full = {
    row_id
    for thread_id in top_level_comments
    for row_id in top_level_comment_to_all_descendants[thread_id]
}

# Keep only the rows of those threads
filtered_data_full = data.loc[data.index.isin(submitter_threads_full)]

# Drop top-level rows (parent_id starts with 't3') whose body has fewer than
# 50 words. Rows that are not top-level always pass this check, so replies to
# a dropped top-level comment remain in the result.
keep_row = filtered_data_full.apply(
    lambda row: not row['parent_id'].startswith('t3') or count_words(row['body']) >= 50,
    axis=1,
)
filtered_data_full = filtered_data_full[keep_row]


# Write the result out as CSV
filtered_data_full.to_csv('filtered_data_pre_text_merge.csv')

In [None]:
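# Updated version: keeps only whole threads in which the submitter commented
# AND whose top-level comment has at least 50 words.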


import pandas as pd

# Function to count words in a string
def count_words(string):
    """Count the whitespace-separated words in ``string``.

    Splitting uses ``str.split()`` with no arguments, so any run of
    whitespace separates words and leading/trailing whitespace is ignored.

    Values that are not ``str`` (such as ``None`` or the float NaN pandas
    uses for a missing CSV cell) yield 0 instead of raising
    ``AttributeError``.
    """
    if isinstance(string, str):
        return len(string.split())
    return 0

# Load data
# Reads 'CMV_July_2022.csv' with the 'id' column as the row index; the code
# below relies on that index for `id in data.index` checks and
# `data.loc[id, 'parent_id']` lookups, and also reads the 'body' and
# 'is_submitter' columns.
data = pd.read_csv('CMV_July_2022.csv', index_col='id')

# Function to find the top-level comment of an entry using the "find_top_level_comment_v5" approach
def find_top_level_comment_v5(id):
    """Walk up the ``parent_id`` chain from ``id`` and return where it stops.

    The walk reads the module-level ``data`` frame and ends at the first id
    that is either absent from ``data.index`` or whose row has a
    ``parent_id`` starting with ``'t3'``; that id is returned.
    """
    current = id
    while current in data.index:
        parent = data.loc[current, 'parent_id']
        if parent.startswith('t3'):
            # The current row is the top of its thread.
            break
        current = parent
    return current

# Label each row with the id of the top-level comment that starts its thread
data['top_level_comment_id'] = list(map(find_top_level_comment_v5, data.index))

# Thread lookup: top-level comment id -> index labels of every row in that thread
top_level_comment_to_all_descendants = data.groupby('top_level_comment_id').groups

# Threads that contain at least one row flagged 'is_submitter'
top_level_comments_with_submitter = set(
    data.loc[data['is_submitter'] == True, 'top_level_comment_id']
)

# Ids of top-level rows (parent_id starts with 't3') whose body has at least 50 words
is_long_top_level = data.apply(
    lambda row: row['parent_id'].startswith('t3') and count_words(row['body']) >= 50,
    axis=1,
)
top_level_comments_50_words = set(data[is_long_top_level].index)

# Threads satisfying both conditions
valid_threads = top_level_comments_with_submitter & top_level_comments_50_words

# Ids of every row that belongs to one of the valid threads
valid_comments = {
    comment_id
    for thread_id in valid_threads
    for comment_id in top_level_comment_to_all_descendants[thread_id]
}

# Keep only those rows
filtered_data_full = data.loc[data.index.isin(valid_comments)]

# Write the result out as CSV
filtered_data_full.to_csv('filtered_data.csv')
