## Import packages

In [21]:
import pandas as pd
import csv

## Define RedditComment Class and Functions

In [None]:
#define the class object
class RedditComment:
    def __init__(self, id, author, is_submitter=False, content="", has_delta_from_OP="False"):
        self.id = id
        self.author = author
        self.content = content
        self.is_submitter = is_submitter
        self.children = []
        #self.op_replied = is_submitter #no longer needed, eventually delete
        self.is_tlc_author = False
        self.has_delta = has_delta_from_OP.lower() == "true"  # Ensure it's stored as a boolean
    def total_children(self):
        count_children = 0
        for child in self.children:
            count_children += 1
            count_children += child.total_children()
        return count_children

    def update_op_replied(self):
        # Check if any child has is_submitter set to True
        if any(child.is_submitter for child in self.children):
            self.is_submitter = True
        # Update all children to propagate is_submitter status downward if needed
        for child in self.children:
            child.update_op_replied()
    def mark_tlc_author_comments(self, tlc_author):
        # Recursively mark if a comment is by the TLC author
        if self.author == tlc_author:
            self.is_tlc_author = True
        for child in self.children:
            child.mark_tlc_author_comments(tlc_author)
    def delta_in_tree(self):
        # Check if this comment or any child is from the TLC author and has a delta
        if self.is_tlc_author and self.has_delta:
            self.has_delta = True  # Explicitly mark it, though it's already True
        # Recursively check children and propagate if any child meets both conditions
        if any(child.is_tlc_author and child.has_delta for child in self.children):
            self.has_delta = True
        for child in self.children:
            child.delta_in_tree()
    def only_OP_and_TLC(self):
        # Check if this comment and all children are from the TLC author or the submitter
        if len(self.children) == 0:
            children_are_candidates = False
        else:
            children_are_candidates = all(child.only_OP_and_TLC() for child in self.children)
        if (self.is_tlc_author or self.is_submitter) and children_are_candidates:
            return True
        return False

## Execute Class Functions on Data

In [None]:
if __name__ == "__main__":
    filepath = '/Users/ryanfunkhouser/Documents/Research/backup_cmv_computational_small_stories/data/CMV_purged_columns.csv'
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        # Initialize a dictionary to map id to RedditComment objects
        all_comments_dict = {}
        # Initialize a list for top-level comments
        top_level_comments = []
        # Process each row from the CSV
        for comment in reader:
            author = comment['author']
            content = comment.get('body', '').strip()  # Safely extract and ensure it's a string
            #print(f"Extracted content: '{content[:50]}'")  # Debugging line
            is_submitter = comment['is_submitter'].lower() == 'true'
            has_delta_from_OP = comment.get('has_delta_from_OP', 'False')  # Default to "False" if missing
            reddit_comment = RedditComment(comment['id'], author, is_submitter, content, has_delta_from_OP)
            all_comments_dict[comment['id']] = reddit_comment
            # Check if the comment is a top-level comment
            if comment['parent_id'].startswith("t3"):
                top_level_comments.append(reddit_comment)
            else:
                # This assumes that the parent_id field is prefixed with "t1_" when the parent is a comment
                parent_comment_id = comment['parent_id'][3:]
                parent_comment = all_comments_dict.get(parent_comment_id)
                if parent_comment:
                    parent_comment.children.append(reddit_comment)
    candidate_conversations = []
    for comment in top_level_comments:
        comment.mark_tlc_author_comments(comment.author)  # Pass the author of the top-level comment
        comment.update_op_replied()
        comment.delta_in_tree()
        comment.only_OP_and_TLC()

        if comment.only_OP_and_TLC():  # Use the function result
            candidate_conversations.append(comment)
    # Output the number of valid conversations
    print(f"Number of valid candidate conversations: {len(candidate_conversations)}")
    # Print IDs of valid conversations
    for convo in candidate_conversations:
        print(f"TLC ID: {convo.id}, TLCer: {convo.author}")

## Various Iterations of Filtering and Validation

In [27]:
#display the number of top level comments
print(f"Number of top-level comments: {len(top_level_comments)}")
for comment in top_level_comments:
    print(comment.id)

Number of top-level comments: 27741
iegb6f2
iegb894
iegbbaw
iegbhth
iegbrnj
iegbuhl
iegcjl9
iegckf5
iegcv7c
iegd50v
iegdhii
iegdsrh
iege0js
iege7q0
iegeie7
iegeztf
iegf0u5
iegf1mi
iegf2v8
iegfci7
iegfk90
iegfnf2
iegfop4
iegfqx0
iegg6mr
iegg9es
iegghdu
ieggi6u
ieggvjo
ieggx5s
iegh2if
iegh3gu
iegh66h
ieghfly
ieghkni
ieghp9m
ieghz0w
iegid5p
iegie6g
iegir7q
iegj6iv
iegjcm2
iegjcsf
iegjqaf
iegjv5s
iegk15r
iegk2c1
iegk876
iegkd9j
iegkdxr
iegkeib
iegkme5
iegl0pd
iegl8v0
ieglgvc
ieglor4
ieglsn7
iegm5y8
iegm5yz
iegmaqv
iegmaxx
iegmdfh
iegn9fh
iegn9h7
iegnjgb
iegnk7y
iegnso6
iego766
iegoca3
iegoj9p
iegpau9
iegpbks
iegptqw
iegpvio
iegq618
iegqb1j
iegqie0
iegqpxe
iegquw2
iegr1kv
iegrdt2
iegro5j
iegrqz3
iegs0x9
iegs2wz
iegs4sm
iegs5an
iegsqec
iegswxz
iegtdi5
iegtmjb
iegtpkl
ieguav0
iegucv2
iegufkv
iegufu6
iegui6f
ieguji9
ieguk31
iegukl0
iegus58
iegus6z
iegv5rb
iegvl8q
iegvm6h
iegvmp2
iegvx4d
iegvy9o
iegw6st
iegx7cy
iegx82n
iegx9sh
iegxbbg
iegxd8z
iegxpp7
iegy0zj
iegy9rg
iegye5q
iegytcs
iegyz78
iegz

In [12]:
#checking number of deltas
delta_count = 0
for comment in top_level_comments:
    if  comment.has_delta == True:
        delta_count += 1

print(f"Number of top-level comments where there is a delta in the tree: {delta_count}")

Number of top-level comments where there is a delta in the tree: 0


In [14]:
#checking how many comment trees had the OP respond AND had a delta awarded to the TLC author
def print_filtered_trees(comments):
    valid_trees_count = 0
    for comment in comments:
        print(f"Top Level Comment: {comment.id}, op_replied: {comment.op_replied}, has_delta: {comment.has_delta()}")
        if comment.op_replied and comment.any_tlc_author_has_delta():
            print("\nTop Level Comment:", comment.id)
            valid_trees_count += 1
    print(f"\nTotal Comment Trees Matching Criteria: {valid_trees_count}")
print_filtered_trees(top_level_comments)

TypeError: 'bool' object is not callable

In [15]:
#prints the comment tree structure
def print_comment_structure(comment, level=0):
    # Print the current comment's ID and its children count, indented by the level in the hierarchy
    # Include a marker or note if the comment is by the TLC author
    tlc_author_marker = " (TLC Author)" if comment.is_tlc_author else ""    
    # Print the current comment's ID, its children count, and if it's by the TLC author
    #print('  ' * level + f"{comment.id} ({comment.total_children()} children){tlc_author_marker}")
    print('  ' * level + f"{comment.id} ({comment.content[:50]}){tlc_author_marker}")
    for child in comment.children:
        print_comment_structure(child, level + 1)

#keeps it from breaking if it's too big
for reddit_comment in top_level_comments:
    count = 0
    print_comment_structure(reddit_comment)
    count += 1
    if count >= 20:
        break

iegb6f2 (Where are you from? This mindset may very well not) (TLC Author)
iegb894 (I've literally had people tell me that when Ive to) (TLC Author)
iegbbaw (Sounds like you're talking about American hip hop ) (TLC Author)
iegbhth (It exists in places where the public transport suc) (TLC Author)
  iegh68k (I think this is the biggest factor. In a lot of ci)
  iegk069 (I completely agree with this stance. I'm from a we)
iegbrnj (When you're from a major city that actually invest) (TLC Author)
iegbuhl (Depends where you live when public transport is li) (TLC Author)
iegcjl9 (Eat when your body tells you to eat, not on some c) (TLC Author)
iegckf5 (gangsta rap in the 90s was a way for underprivileg) (TLC Author)
iegcv7c (The work is optional mentality has already succeed) (TLC Author)
  iegsie8 (This stuff isn't new. And more importantly, gettin)
iegd50v (There are many places where the public transport i) (TLC Author)
  iegdir7 (I live in Sydney australia, hardly nye or london. )
    iege

In [None]:
#creates and prints the comment tree structure showing only the TLC comments in trees where the OP replied
def print_tlc_comment_structure(comment, is_top_level=True):
    # Base check at the top level: proceed only if OP replied in this subtree
    if is_top_level and not comment.op_replied:
        return

    # Print or process the comment only if it is by the TLC author
    if comment.is_tlc_author:
        print(f"{comment.id} (TLC Author) : {comment.total_children()} children")

    # Recurse into children to continue checking and printing as necessary
    for child in comment.children:
        print_tlc_comment_structure(child, is_top_level=False)

for comment in top_level_comments:
    comment.update_op_replied()  # Make sure the op_replied is up to date
    print("\nTop Level Comment:", comment.id)
    print_tlc_comment_structure(comment)


In [None]:
#validation check for capturing if OP replied
# Print only those top-level comments where the OP has replied in the subtree
op_replied_count = 0
for comment in top_level_comments:
    if comment.op_replied:
        print_comment_structure(comment)
        op_replied_count += 1
print(f"Total top-level comments where the OP replied: {op_replied_count}")



In [None]:
#validation check for printing the top 5 TLCs with the most comments under them. 
# Calculate the total number of children for each top-level comment
top_level_comments_with_children = [(comment, comment.total_children()) for comment in top_level_comments]

# Sort the top-level comments based on the total number of children
sorted_top_level_comments = sorted(top_level_comments_with_children, key=lambda x: x[1], reverse=True)

# Select the top 5 comments with the most children
top_5_comments = sorted_top_level_comments[:5]

# Print the comment structure for each of the top 5 comments
for comment, _ in top_5_comments:
    print("\nTop Level Comment:", comment.id)
    print_comment_structure(comment)