In [1]:
import TreeTools as tt
# Load every labeled tree stored in the data file and report how many were read.
TREES_FILE = '80919_labeled_trees.txt'
trees = tt.load_list_of_trees(TREES_FILE)
print('Total trees loaded:', len(trees))

Total trees loaded: 46


In [2]:
# Take the first tree of the list and look at how it is put together.
tree = trees[0]
nodes = tt.get_nodes(tree)
branches = tt.get_branches(tree)
branch_count, node_count = len(branches), len(nodes)
print('Our tree has %d branches and total of %d unique nodes' % (branch_count, node_count))
print('Type of tree (hint: dictionary):', type(tree))
print('A tree has the following keys:', tree.keys())

# The root's own payload is stored under the 'node' key.
root_node = tree['node']
print('A node is a dictionary too and has the following keys: ', root_node.keys())
print(root_node['author'], ':', root_node['text'][:50], '...')

# The sub-trees are stored under the 'children' key; peek at the first one.
print('A children is a list, where each element is a dictionary just like the "tree" object and has keys "node" & "children" ')
first_child = tree['children'][0]
print(first_child.keys())

Our tree has 25 branches and total of 123 unique nodes
Type of tree (hint: dictionary): <class 'dict'>
A tree has the following keys: dict_keys(['node', 'children'])
A node is a dictionary too and has the following keys:  dict_keys(['text', 'id', 'author', 'extra_data', 'timestamp'])
Slagernicus : Abortion should be legal in my opinion for 4 main  ...
A children is a list, where each element is a dictionary just like the "tree" object and has keys "node" & "children" 
dict_keys(['node', 'children'])


### As you can see, our tree is a recursive object: every child is itself a tree.

In [3]:
# A branch is a list of nodes running from the root to a leaf, so the total
# number of branches equals the number of leaves in the tree.
branch = branches[0]
print('First branch has the length of', len(branch))
print('An element in branch is the node we saw above - it is a dictionary and has the following keys:', branch[0].keys())
print('\nLets print the discussion:\n')
for node in branch:
    # Show the author and only the first 50 characters of each post.
    print(node['author'],':',node['text'][:50],'...')
# Use index -1 rather than a hard-coded position so that "the last node" stays
# correct for a branch of any length (the original used branch[2], which only
# works because this particular branch has exactly three nodes).
last_node = branch[-1]
print('\nNow let\'s take a closer look at the last node in this branch - it has an extra key "labels":')
print(last_node.keys())
print('The labels are:',last_node['labels'])

First branch has the length of 3
An element in branch is the node we saw above - it is a dictionary and has the following keys: dict_keys(['text', 'id', 'author', 'extra_data', 'timestamp'])

Lets print the discussion:

Slagernicus : Abortion should be legal in my opinion for 4 main  ...
Hq3473 : Up until which point should abortion be legal? Sho ...
Slagernicus : I'm not sure where in the pregnancy I would be ok  ...

Now let's take a closer look at the last node in this branch - it has an extra key "labels":
dict_keys(['author', 'text', 'id', 'extra_data', 'labels', 'timestamp'])
The labels are: {'consolidated': ['CBK', 'SE']}


### So some of the nodes contain the 'labels' key. In the version of the trees I gave you, its structure will always be {'consolidated': [list_of_labels]}, meaning the given list is the result after consolidation has already been applied. The raw data can be found in the 'extra_data' dict.

In [4]:
from collections import Counter

# How many labeled nodes do we have in this tree?
# A node counts as labeled when it has a 'labels' key with a non-empty 'consolidated' list.
labeled_nodes = [
    candidate
    for candidate in nodes
    if 'labels' in candidate and len(candidate['labels']['consolidated']) > 0
]
print('Labeled nodes:', len(labeled_nodes), 'Out of total nodes:', len(nodes))
# Surprise! All nodes except the root are labeled.

# Tally every consolidated label over all labeled nodes.
counted_labels = Counter(
    label
    for labeled in labeled_nodes
    for label in labeled['labels']['consolidated']
)
print('Our label frequencies for this tree are:')
print(counted_labels)

Labeled nodes: 122 Out of total nodes: 123
Our label frequencies for this tree are:
Counter({'CBE': 88, 'OCQ': 22, 'IRR': 21, 'DNO': 19, 'RAA': 15, 'CBK': 14, 'SAC': 14, 'SC': 14, 'SA': 10, 'SAB': 9, 'SAS': 7, 'CA': 7, 'CBG': 7, 'OSB': 6, 'NA': 6, 'OTH': 5, 'SE': 4, 'ADT': 4, 'REP': 4, 'CBZ': 4, 'AGB': 4, 'BAD': 4, 'CBL': 3, 'ALO': 3, 'CBF': 2, 'SG': 1, 'SRC': 1, 'SF': 1, 'CBD': 1, 'CBA': 1, 'ANS': 1, 'CBN': 1, 'CDV': 1})


### Easy-peasy to work with labels.

In [5]:
# Now let's dive a bit into the text - sometimes it contains 'mentions' or 'quotes', or it is deleted/removed.
# Nodes are usually deleted for one of two reasons: a user changes their mind and deletes
# the post, or an admin removes it for violating the rules.
all_texts = [node['text'] for node in nodes]
deleted_count = sum(1 for text in all_texts if text in ('[deleted]', '[removed]'))
print('Deleted nodes:', deleted_count)

# Quoted passages are wrapped in <quote> tags inside the text.
texts_with_quotes = [text for text in all_texts if '<quote>' in text]
print('Texts that contain quotes:', len(texts_with_quotes))
print('\n', texts_with_quotes[0], '\n')

# Mentions of other users show up as '/u/<name>'.
texts_with_mentions = [text for text in all_texts if '/u/' in text]
print('Texts that contain mentions:', len(texts_with_mentions))

# Oops.. no mentions here. Maybe in another tree?
other_tree_nodes = tt.get_nodes(trees[1])
nodes_with_mentions = [node for node in other_tree_nodes if '/u/' in node['text']]
print('Texts that contain mentions (another tree):', len(nodes_with_mentions))

# Bingo! Found a mention.
first_mention = nodes_with_mentions[0]
print('\n', first_mention['author'], ':', first_mention['text'], '\n')

Deleted nodes: 2
Texts that contain quotes: 29

 <quote>What ahout when fetus is half way out?</quote> <quote>Can she kill it then? </quote> Only if it is the safest and only way to preserve bodily autonomy. Essentially, she has the right to have it removed from her body in the safest way possible for her. If that ends up killing the baby, then so be it. 

Texts that contain mentions: 0
Texts that contain mentions (another tree): 1

 bubi09 : If /u/philotrow has changed any aspect of your view, please consider giving them a delta. Read comment rule 4 and the delta instructions (both in our sidebar) on how/when to do this if you're unfamiliar with the delta system. Thanks! 



### This summarizes some of the special cases you may encounter in the text.

In [6]:
# One level deeper: the 'extra_data' dictionary stored on the root node.
extra_data = tree['node']['extra_data']
print('Extra data can have many keys:', extra_data.keys())

# A post can have 'ups' (upvotes) and 'downs' (downvotes).
upvotes, downvotes = extra_data['ups'], extra_data['downs']
print('Upvotes:', upvotes, 'Downvotes:', downvotes)

# The 'url' entry lets us find the original discussion on reddit.
discussion_url = extra_data['url']
print('url:', discussion_url)
print("If we sort by 'OLD' a discussion on the reddit website, we'd get it exactly in the same order as we have it in our branches:")
print('https://www.reddit.com/r/changemyview/comments/4rl42j/cmv_abortion_should_remain_legal/?sort=old')

# The id of the root node is also part of that url.
tree_id = tree['node']['id']
print('Pay attention that the ID of a tree appears also in the url:', tree_id)

Extra data can have many keys: dict_keys(['file:line', 'subreddit', 'from_kind', 'from', 'title', 'num_comments', 'subreddit_id', 'downs', 'saved', 'from_id', 'permalink', 'name', 'url', 'ups'])
Upvotes: 17 Downvotes: 0
url: https://www.reddit.com/r/changemyview/comments/4rl42j/cmv_abortion_should_remain_legal/
If we sort by 'OLD' a discussion on the reddit website, we'd get it exactly in the same order as we have it in our branches:
https://www.reddit.com/r/changemyview/comments/4rl42j/cmv_abortion_should_remain_legal/?sort=old
Pay attention that the ID of a tree appears also in the url: 4rl42j


### That's it for now. Feel free to ask questions if you need additional info.