In [1]:
#Create Comment Trees
import pandas as pd

def process_comments(data):
    """
    Assign to every comment the ID of the top-level comment of its thread.

    A comment is treated as top-level when its ``parent_id`` starts with
    "t3"; such a comment is its own ``top_level_id``. Every other comment
    is resolved by walking up the ``parent_id`` chain.

    Parameters:
        - data (DataFrame): The comments data. Must contain the columns
          'id' and 'parent_id'. It is modified in place: a 'top_level_id'
          column is added (or overwritten).

    Returns:
        - DataFrame: The same ``data`` object, now carrying 'top_level_id'.
    """
    # Create a dictionary mapping each comment's id to its parent_id
    id_to_parent = dict(zip(data['id'], data['parent_id']))

    def find_top_level_id_fast(comment_id):
        """
        Find the top-level ID for a given comment ID.

        Parameters:
            - comment_id (str): The comment ID.

        Returns:
            - str: The ID of the highest ancestor that can be reached. If an
              ancestor is absent from the data, that ancestor's ID is
              returned. If the chain loops back on itself, the last ID
              reached before the repeat is returned.
        """
        current_id = comment_id
        visited = {comment_id}
        current_parent_id = id_to_parent.get(current_id)

        # Climb while the parent reference is a usable, non-"t3" string.
        # The isinstance check treats a missing parent (None / NaN) as
        # "no parent" instead of calling .startswith on a float.
        while (
            isinstance(current_parent_id, str)
            and current_parent_id
            and not current_parent_id.startswith("t3")
        ):
            # Drop the 3-character type prefix (e.g. "t1_") to get the bare id
            next_id = current_parent_id[3:]
            if next_id in visited:
                # Cyclic parent chain: stop rather than loop forever
                break
            visited.add(next_id)
            current_id = next_id
            current_parent_id = id_to_parent.get(current_id)

        return current_id

    # Initialize the top_level_id column with None values
    data['top_level_id'] = None

    # For each entry, if it's a top-level comment, assign its own id to top_level_id.
    # na=False keeps the mask strictly boolean when parent_id has missing values.
    is_top_level = data['parent_id'].str.startswith("t3", na=False)
    data.loc[is_top_level, 'top_level_id'] = data['id']

    # Apply the function to find top_level_id for each non-top-level entry
    non_top_level_rows = data[data['top_level_id'].isnull()]
    data.loc[non_top_level_rows.index, 'top_level_id'] = non_top_level_rows['id'].apply(find_top_level_id_fast)

    return data

if __name__ == "__main__":
    # Load the data (replace with your file path).
    # `data` is bound at module level here and is read by the cells below.
    data = pd.read_csv('CMV_July_2022.csv')
    
    # Process the data. process_comments adds 'top_level_id' to `data` in place
    # and returns that same object, so `processed_data` and `data` are one frame.
    processed_data = process_comments(data)
    
    # Display a sample (no random_state is passed, so the rows differ per run)
    print(processed_data[['id', 'parent_id', 'top_level_id']].sample(10))


             id   parent_id top_level_id
104111  ih45wlj  t1_ih441r7      ih2ewuo
130268  ihu3agn  t1_ihtmn5k      ihtlfyo
30225   if66pq5   t3_vt98nz      if66pq5
73334   igai7ep  t1_igafgba      iga7w4z
78656   igh54x9   t3_w0wwmn      igh54x9
34844   ifa0m3x  t1_if9wtlz      if9t1on
2470    ieiw5iu  t1_ieic208      iehpw22
49673   ifnmv64  t1_ifnkgxb      ifnkgxb
111025  ihcoyef  t1_ihcnb0o      ihcjrlk
147982  iif08zn   t3_wc7pu1      iif08zn


In [7]:
# Filter to only include comment trees in which the OP (submitter) responded

def filter_by_submitter(data):
    """
    Keep only the comment trees in which the submitter took part.

    A tree is kept when at least one of its rows has "is_submitter" equal
    to True; every row sharing that tree's "top_level_id" is then retained.

    Parameters:
        - data (DataFrame): The comments data, with the columns
          'is_submitter' and 'top_level_id'.

    Returns:
        - DataFrame: The rows of ``data`` belonging to the retained trees.
    """
    # Rows written by the submitter
    submitter_rows = data.loc[data['is_submitter'].eq(True)]

    # Trees (identified by top-level ID) containing at least one such row
    trees_with_submitter = submitter_rows['top_level_id'].unique()

    # Membership mask over the full dataset
    in_kept_tree = data['top_level_id'].isin(trees_with_submitter)

    return data[in_kept_tree]

# Apply the filter.
# `data` must already carry 'top_level_id' (added by process_comments) and
# 'is_submitter', since filter_by_submitter reads both columns.
data_filtered_by_submitter = filter_by_submitter(data)

# Display a sample (no random_state is passed, so the rows differ per run)
data_filtered_by_submitter.sample(5)


Unnamed: 0.1,Unnamed: 0,id,link_id,parent_id,CreatedAt,author,ups,downs,score,edited,...,stickied,is_submitter,body,error,subreddit,Month,Year,Day,top_level_id,received_delta
73218,73218,igadwwq,t3_vzor14,t1_igaddup,2022-07-15 10:23:06,MostRecommendation84,,,9,False,...,False,False,[https://extension.psu.edu/house-centipedes](h...,,changemyview,7,2022,15,ig9r0bs,False
28812,28812,if4s739,t3_vsqn4v,t1_if2xkfg,2022-07-06 15:28:13,Claytertot,,,2,False,...,False,False,It's not about when it starts mattering. One ...,,changemyview,7,2022,6,if2qcgl,False
121189,121189,ihm1rov,t3_w7ihu2,t1_ihjvwkw,2022-07-25 11:44:10,Murkus,,,2,False,...,False,False,"Well, I do believe that a comedian can say tho...",,changemyview,7,2022,25,ihjvwkw,False
98935,98935,igytfai,t3_w2isj5,t1_igqppgr,2022-07-20 14:05:48,[deleted],,,1,False,...,False,False,[removed],,changemyview,7,2022,20,igqga36,False
78287,78287,iggttqb,t3_w0tfvm,t1_igglmt0,2022-07-16 18:38:42,praetorain112,,,-1,False,...,False,True,You are picking one party as your example and ...,,changemyview,7,2022,16,igglmt0,False


In [8]:
# Filter to only include comment trees whose top-level comment has at least 30 words

def filter_by_word_count(data, threshold=30):
    """
    Filters the data to include only those comment trees where the top-level
    comment contains at least a specified threshold of words in the "body" field.

    Words are counted by splitting "body" on whitespace. A top-level comment
    with a missing body counts as zero words, and a row with a missing
    "parent_id" is not treated as top-level.

    Parameters:
        - data (DataFrame): The comments data, with the columns 'id',
          'parent_id', 'body' and 'top_level_id'.
        - threshold (int): The minimum word count for top-level comments.

    Returns:
        - DataFrame: The filtered data.
    """
    # Identify top-level comments; na=False keeps the mask strictly boolean
    # when parent_id has missing values
    is_top_level = data['parent_id'].str.startswith("t3", na=False)
    top_level_comments = data[is_top_level]

    # Count words in the body field of top-level comments.
    # .str.len() yields NaN for a missing body (where len() would raise),
    # and fillna(0) turns that into a zero word count.
    word_counts = top_level_comments['body'].str.split().str.len().fillna(0)

    # Identify top-level IDs where the body has at least 'threshold' words
    top_level_ids_with_threshold_words = top_level_comments.loc[word_counts >= threshold, 'id']

    # Filter the dataset to include only these comment trees
    filtered_data = data[data['top_level_id'].isin(top_level_ids_with_threshold_words)]

    return filtered_data

# Apply the filter with the default threshold of 30 words.
# NOTE(review): this runs on the unfiltered `data`, not on
# `data_filtered_by_submitter` — confirm whether the two filters are meant
# to be chained or applied independently.
data_filtered_by_word_count = filter_by_word_count(data)

# Display a sample (no random_state is passed, so the rows differ per run)
data_filtered_by_word_count.sample(5)


Unnamed: 0.1,Unnamed: 0,id,link_id,parent_id,CreatedAt,author,ups,downs,score,edited,...,stickied,is_submitter,body,error,subreddit,Month,Year,Day,top_level_id,received_delta
109164,109164,ihb1jn1,t3_w60i2f,t3_w60i2f,2022-07-23 03:17:19,Nearbykingsmourne,,,190,False,...,False,False,That activism tends to seep into places where ...,,changemyview,7,2022,22,ihb1jn1,False
110515,110515,ihcc7bq,t3_w4rovf,t1_ihc9b79,2022-07-23 10:03:07,parentheticalobject,,,1,False,...,False,False,&gt; What they did being fraud is undecided. T...,,changemyview,7,2022,23,ih3qqea,False
149278,149278,iig0vmb,t3_wczne2,t3_wczne2,2022-07-31 16:46:25,tyzzex,,,1,True,...,False,False,There are spots at beaches that aren't so crow...,,changemyview,7,2022,31,iig0vmb,False
56791,56791,iftxk67,t3_vx5eum,t1_iftxczu,2022-07-12 00:46:56,Hellioning,,,2,False,...,False,False,"No, but some genes are discriminated against i...",,changemyview,7,2022,11,iftwqtp,False
103045,103045,ih3flxh,t3_w4il5d,t1_ih38jqs,2022-07-21 12:57:40,nhlms81,,,2,False,...,False,True,!delta. there is sound reason to believe that ...,,changemyview,7,2022,21,ih2k3j0,False


In [9]:
# Identify which comments received a delta.
# NOTE(review): this cell mutates the shared `data` frame in place. The outputs
# of the earlier filter cells already list a 'received_delta' column, which
# suggests the cells were executed out of order — confirm with Restart & Run All.

# Create a new column to indicate if a comment received a delta
data['received_delta'] = False

# Identify delta comments: rows whose body contains "!delta" or "Δ".
# The match is case-insensitive (case=False) and rows with a missing body are
# treated as non-matches (na=False). The author is not checked, so a marker
# written by any user counts.
delta_comments = data[data['body'].str.contains('!delta|Δ', na=False, case=False, regex=True)]

# Mark comments that received a delta based on their parent comment:
# the "t1_" prefix is removed from each delta comment's parent_id so it can be
# compared with 'id'. Parents with a different prefix (e.g. "t3_") keep it and
# so presumably never match a comment id — verify against the data.
data.loc[data['id'].isin(delta_comments['parent_id'].str.replace("t1_", "")), 'received_delta'] = True

# Show the first few rows flagged as having received a delta
print(data[data['received_delta']].head(5))


      Unnamed: 0       id    link_id   parent_id            CreatedAt  \
51            51  iegehw5  t3_vouzwo  t1_ieff04p  2022-07-01 07:24:16   
149          149  iegid5p  t3_vouzwo   t3_vouzwo  2022-07-01 07:51:10   
285          285  iegon8d  t3_vp2cm8  t1_iegnggu  2022-07-01 08:34:04   
436          436  iegtx9p  t3_vn1v0q  t1_iegqui0  2022-07-01 09:09:15   
1677        1677  iehqjck  t3_vp4op3  t1_iehop4t  2022-07-01 12:51:42   

                  author  ups  downs  score  edited  ... stickied  \
51          LooseBar2222  NaN    NaN      4   False  ...    False   
149     mynewaccount4567  NaN    NaN      5   False  ...    False   
285          countrymace  NaN    NaN      2   False  ...    False   
436   littlesmilinghooks  NaN    NaN      1   False  ...    False   
1677           [deleted]  NaN    NaN      3    True  ...    False   

     is_submitter                                               body  error  \
51          False  I think it's all about perception by the next ..