In [5]:
#Create Comment Trees
import pandas as pd

def process_comments(data):
    """
    Process the comments data to assign top-level IDs to each comment.
    
    Parameters:
        - data (DataFrame): The comments data.
        
    Returns:
        - DataFrame: The processed data with top-level IDs.
    """
    # Create a dictionary mapping each comment's id to its parent_id
    id_to_parent = dict(zip(data['id'], data['parent_id']))
    
    def find_top_level_id_fast(comment_id):
        """
        Find the top-level ID for a given comment ID.
        
        Parameters:
            - comment_id (str): The comment ID.
            
        Returns:
            - str: The top-level comment ID.
        """
        current_id = comment_id
        current_parent_id = id_to_parent.get(current_id, None)
        
        # Keep iterating until we find a top-level comment
        while current_parent_id and not current_parent_id.startswith("t3"):
            current_id = current_parent_id[3:]
            current_parent_id = id_to_parent.get(current_id, None)
        
        return current_id
    
    # Initialize the top_level_id column with NaN values
    data['top_level_id'] = None

    # For each entry, if it's a top-level comment, assign its own id to top_level_id
    data.loc[data['parent_id'].str.startswith("t3"), 'top_level_id'] = data['id']

    # Apply the function to find top_level_id for each non-top-level entry
    non_top_level_rows = data[data['top_level_id'].isnull()]
    data.loc[non_top_level_rows.index, 'top_level_id'] = non_top_level_rows['id'].apply(find_top_level_id_fast)
    
    return data

if __name__ == "__main__":
    # Load the data (replace with your file path)
    data = pd.read_csv('../Data/CMV_July_2022.csv')
    
    # Process the data
    processed_data = process_comments(data)
    
    # Display a sample
    print(processed_data[['id', 'parent_id', 'top_level_id']].sample(10))


             id   parent_id top_level_id
3635    ieky4fz  t1_iekp6ov      iegy0zj
106259  ih6adjx  t1_ih2di5f      ih2di5f
115543  ihhbbfn  t1_ihgzhdi      ihgzhdi
75382   igeb4z0   t3_w0eiqa      igeb4z0
62680   ifzkyh0  t1_ifzk23b      ifzk23b
95954   igwdkm6   t3_w3flnd      igwdkm6
107226  ih7hdtx  t1_ih776yp      ih776yp
24648   if2eomw   t3_vsn3ys      if2eomw
75459   igeed07  t1_ige92fn      ige92fn
131941  ihvy0i0  t1_ihu03xg      ihtzxpv


In [6]:
#Filter to only include where OP responded

def filter_by_submitter(data):
    """
    Filters the data to include only those comment trees where at least one 
    of the entries has "is_submitter" set to true.
    
    Parameters:
        - data (DataFrame): The comments data.
        
    Returns:
        - DataFrame: The filtered data.
    """
    # Identify top-level IDs where at least one of the entries has "is_submitter" as true
    top_level_ids_with_submitter = data[data['is_submitter'] == True]['top_level_id'].unique()
    
    # Filter the dataset to include only these comment trees
    filtered_data = data[data['top_level_id'].isin(top_level_ids_with_submitter)]
    
    return filtered_data

# Apply the filter
data_filtered_by_submitter = filter_by_submitter(data)

# Display a sample
data_filtered_by_submitter.sample(5)


Unnamed: 0.1,Unnamed: 0,id,link_id,parent_id,CreatedAt,author,ups,downs,score,edited,...,subreddit_id,stickied,is_submitter,body,error,subreddit,Month,Year,Day,top_level_id
122145,122145,ihmlqmz,t3_w7u25q,t1_ihmkvni,2022-07-25 13:50:22,PsychDoctorate,,,1,False,...,t5_2w2s8,False,False,You were able to get puberty blockers or hormo...,,changemyview,7,2022,25,ihlt2qw
4604,4604,iele2t2,t3_vpsuwv,t1_ield7hf,2022-07-02 09:37:14,[deleted],,,1,False,...,t5_2w2s8,False,False,[deleted],,changemyview,7,2022,2,iel48z7
65882,65882,ig2u87i,t3_vyiy2d,t1_ig2tz6v,2022-07-13 19:43:11,Charlie-Wilbury,,,1,False,...,t5_2w2s8,False,False,&gt;But yeah an adult has much more freedom bc...,,changemyview,7,2022,13,ig2hmrm
111224,111224,ihcuzfs,t3_w4f2yh,t1_ihcrfgn,2022-07-23 12:18:19,21CenturyNephilim,,,1,False,...,t5_2w2s8,False,False,You're ticking off all the reddit midwit sperg...,,changemyview,7,2022,23,ih1hxg9
40164,40164,iffcrzy,t3_vu14be,t1_iffcop4,2022-07-08 19:25:07,beeberweeber,,,2,False,...,t5_2w2s8,False,False,Socialist Marxist Leninism Keynesian Milton fr...,,changemyview,7,2022,8,ifaxxdv


In [7]:
#Filter to only include posts with at least 30 words in top-level comment

def filter_by_word_count(data, threshold=30):
    """
    Filters the data to include only those comment trees where the top-level 
    comment contains at least a specified threshold of words in the "body" field.
    
    Parameters:
        - data (DataFrame): The comments data.
        - threshold (int): The minimum word count for top-level comments.
        
    Returns:
        - DataFrame: The filtered data.
    """
    # Identify top-level comments
    top_level_comments = data[data['parent_id'].str.startswith("t3")]
    
    # Count words in the body field of top-level comments
    word_counts = top_level_comments['body'].str.split().apply(len)
    
    # Identify top-level IDs where the body has at least 'threshold' words
    top_level_ids_with_threshold_words = top_level_comments[word_counts >= threshold]['id']
    
    # Filter the dataset to include only these comment trees
    filtered_data = data[data['top_level_id'].isin(top_level_ids_with_threshold_words)]
    
    return filtered_data

# Apply the filter
data_filtered_by_word_count = filter_by_word_count(data)

# Display a sample
data_filtered_by_word_count.sample(5)


Unnamed: 0.1,Unnamed: 0,id,link_id,parent_id,CreatedAt,author,ups,downs,score,edited,...,subreddit_id,stickied,is_submitter,body,error,subreddit,Month,Year,Day,top_level_id
26003,26003,if34mk5,t3_vsh0gn,t1_if2szz4,2022-07-06 08:41:28,phenix717,,,1,False,...,t5_2w2s8,False,False,I think the massage thing is good precisely fo...,,changemyview,7,2022,6,if1qikm
114023,114023,ihftv6d,t3_w6m1yx,t1_ihfo6zq,2022-07-24 05:10:07,Rodulv,,,-1,False,...,t5_2w2s8,False,False,&gt; why would it be a good thing to do what y...,,changemyview,7,2022,23,iheoanq
109506,109506,ihbl7y2,t3_w641i3,t1_ihbkimb,2022-07-23 06:43:48,LouisaEveryday,,,0,False,...,t5_2w2s8,False,True,You are confusing parental neglect with parent...,,changemyview,7,2022,22,ihbkimb
3268,3268,iekhcd7,t3_vp8lwj,t3_vp8lwj,2022-07-02 05:07:13,mac_128,,,1,False,...,t5_2w2s8,False,False,"Personally, the lifestyle in the military is p...",,changemyview,7,2022,1,iekhcd7
80364,80364,igit6fa,t3_w185vn,t1_igistkc,2022-07-17 07:40:27,ProLifePanda,,,10,False,...,t5_2w2s8,False,False,&gt;Have you ever tried to have sex with a col...,,changemyview,7,2022,17,igiq39h


In [8]:
#code for identifying deltas

# Create a new column to indicate if a comment received a delta
data['received_delta'] = False

# Identify delta comments
delta_comments = data[data['body'].str.contains('!delta|Δ', na=False, case=False, regex=True)]

# Mark comments that received a delta based on their parent comment
data.loc[data['id'].isin(delta_comments['parent_id'].str.replace("t1_", "")), 'received_delta'] = True

print(data[data['received_delta']].head(5))


      Unnamed: 0       id    link_id   parent_id            CreatedAt  \
51            51  iegehw5  t3_vouzwo  t1_ieff04p  2022-07-01 07:24:16   
149          149  iegid5p  t3_vouzwo   t3_vouzwo  2022-07-01 07:51:10   
285          285  iegon8d  t3_vp2cm8  t1_iegnggu  2022-07-01 08:34:04   
436          436  iegtx9p  t3_vn1v0q  t1_iegqui0  2022-07-01 09:09:15   
1677        1677  iehqjck  t3_vp4op3  t1_iehop4t  2022-07-01 12:51:42   

                  author  ups  downs  score  edited  ... stickied  \
51          LooseBar2222  NaN    NaN      4   False  ...    False   
149     mynewaccount4567  NaN    NaN      5   False  ...    False   
285          countrymace  NaN    NaN      2   False  ...    False   
436   littlesmilinghooks  NaN    NaN      1   False  ...    False   
1677           [deleted]  NaN    NaN      3    True  ...    False   

     is_submitter                                               body  error  \
51          False  I think it's all about perception by the next ..

In [9]:
# Define the id_to_row dictionary for quick look-up of comments by their ID
id_to_row = data.set_index('id').to_dict(orient='index')

# Re-calculating ancestors for each comment
data['ancestors'] = data['id'].apply(lambda x: get_ancestors(x, id_to_parent))

# Proceed to consolidate the SC comments using the pre-calculated ancestors

consolidated_entries = []

for index, row in data[data['received_delta']].iterrows():
    sc = row['author']
    delta_comment_id = row['id']
    combined_text = [row['body']]
    earliest_date = pd.to_datetime(row['CreatedAt'])
    latest_date = pd.to_datetime(row['CreatedAt'])
    
    for ancestor_id in row['ancestors']:
        if ancestor_id not in id_to_row:
            continue
        ancestor_comment = id_to_row[ancestor_id]
        if ancestor_comment['author'] == sc:
            combined_text.append(ancestor_comment['body'])
            parent_date = pd.to_datetime(ancestor_comment['CreatedAt'])
            if parent_date < earliest_date:
                earliest_date = parent_date
            if parent_date > latest_date:
                latest_date = parent_date
    
    consolidated_text = "\n\n---\n\n".join(reversed(combined_text))
    consolidated_entries.append({
        'id': delta_comment_id,
        'author': sc,
        'combined_text': consolidated_text,
        'earliest_date': earliest_date,
        'latest_date': latest_date
    })

# Create a new DataFrame with consolidated entries
consolidated_data_refined = pd.DataFrame(consolidated_entries)

# Display the first few rows of the consolidated data
consolidated_data_refined.head()

#consolidated_data_refined.to_csv('consolidated_data_refined.csv', index=False)


NameError: name 'get_ancestors' is not defined