# Simple Summary of the Magic Café Dataset

In [21]:
import json
import pandas as pd

### Average number of posts per thread

In [15]:
# Stream the dataset once, counting thread records (one JSON object per line)
# and the total number of posts they contain.
post_sum = 0
thread_count = 0
# The encoding is named explicitly so the read does not depend on the
# platform's default locale encoding (assumes the file is UTF-8, the standard
# encoding for JSON text).
with open('./data/themagiccafe_all_clean.jsonlines', encoding='utf-8') as input_file:
    for row in input_file:
        # Tolerate blank lines (e.g. a trailing newline at end of file), which
        # json.loads would otherwise reject.
        if not row.strip():
            continue
        data = json.loads(row)
        thread_count += 1
        post_sum += len(data['posts'])

In [16]:
# Report the totals gathered by the counting cell.
print(f"Total Threads: {thread_count}")
print(f"Total Posts: {post_sum}")
# Mean number of posts per thread, displayed to two decimal places.
average_posts = post_sum / thread_count
print(f"Average Posts per Thread: {average_posts:.2f}")

Total Threads: 321463
Total Posts: 6029642
Average Posts per Thread: 18.76


### Most popular subforums

In [56]:
# Count thread records per subforum. `title` is read as a breadcrumb list whose
# element at index 1 is used as the subforum name.
subforum_counts = {}

# The encoding is named explicitly so the read does not depend on the
# platform's default locale encoding (assumes the file is UTF-8).
with open('./data/themagiccafe_all_clean.jsonlines', encoding='utf-8') as input_file:
    for row in input_file:
        # Tolerate blank lines, which json.loads would otherwise reject.
        if not row.strip():
            continue
        data = json.loads(row)
        try:
            subforum_name = data['title'][1]
        except (KeyError, IndexError, TypeError):
            # Skip records whose title is missing, too short, or not indexable.
            # Only these lookup failures are swallowed; anything else
            # (including KeyboardInterrupt) propagates.
            continue
        subforum_counts[subforum_name] = subforum_counts.get(subforum_name, 0) + 1


The workers 25534
Penny for your thoughts 23259
Not very magical, still... 20599
Latest and Greatest? 16486
Tricks & Effects 13564
Nothing up my sleeve... 11094
The spooky, the mysterious...the bizarre! 10937
Our new arrivals 10803
The little darlings 10329
Grand illusion 8638


In [58]:
# Order subforums by thread count (ascending), then walk the ten largest from
# the biggest down, printing rank, name and count.
sorted_counts = sorted(subforum_counts.items(), key=lambda item: item[1])
top_ten = sorted_counts[-10:]
for rank, (name, count) in enumerate(reversed(top_ten), start=1):
    print(rank, name, count)

1 The workers 25534
2 Penny for your thoughts 23259
3 Not very magical, still... 20599
4 Latest and Greatest? 16486
5 Tricks & Effects 13564
6 Nothing up my sleeve... 11094
7 The spooky, the mysterious...the bizarre! 10937
8 Our new arrivals 10803
9 The little darlings 10329
10 Grand illusion 8638


### Thread with the most posts

In [39]:
# Single pass over the dataset that (a) records a title/count/link summary for
# every thread with a joinable title and (b) tracks the thread with the most posts.
max_posts = None          # highest post count seen so far
thread = None             # full record of the thread holding that maximum
max_thread_name = None    # its title parts joined with " >> "

all_counts = []           # one {'title', 'count', 'link'} dict per kept thread

# The encoding is named explicitly so the read does not depend on the
# platform's default locale encoding (assumes the file is UTF-8).
with open('./data/themagiccafe_all_clean.jsonlines', encoding='utf-8') as input_file:
    for row in input_file:
        # Tolerate blank lines, which json.loads would otherwise reject.
        if not row.strip():
            continue
        data = json.loads(row)
        try:
            thread_name = " >> ".join(data['title'])
        except (KeyError, TypeError):
            # Skip records with a missing or non-joinable title. Only these
            # failures are swallowed; anything else propagates.
            continue
        post_count = len(data['posts'])
        all_counts.append({'title': thread_name, 'count': post_count, 'link': data['link']})
        # Strict '>' keeps the first thread encountered when counts tie.
        if max_posts is None or post_count > max_posts:
            max_posts = post_count
            max_thread_name = thread_name
            thread = data

In [40]:
# Show the thread that holds the highest post count, then its URL.
print(f"Thread with the Most Posts {max_thread_name}")
print(f"Post Count: {max_posts}")
# `thread` is the full record kept by the scanning cell; only its link is shown.
most_posts_link = thread['link']
print(f"link: {most_posts_link}")

Thread with the Most Posts The Magic Cafe Forum Index >> Not very magical, still... >> Answer a Question with a Question
Post Count: 25012
link: https://www.themagiccafe.com/forums/viewtopic.php?topic=250916&forum=32


In [48]:
# Sort the per-thread summaries by post count (ascending) and print the ten
# largest, biggest first.
all_counts2 = sorted(all_counts, key=lambda entry: entry['count'])

for rank, entry in enumerate(reversed(all_counts2[-10:]), start=1):
    print(f"Rank: {rank} Count: {entry['count']}\nTitle: {entry['title']}\n Link: {entry['link']}\n")

Rank: 1 Count: 25012
Title: The Magic Cafe Forum Index >> Not very magical, still... >> Answer a Question with a Question
 Link: https://www.themagiccafe.com/forums/viewtopic.php?topic=250916&forum=32

Rank: 2 Count: 13703
Title: The Magic Cafe Forum Index >> The workers >> Card Trick Game
 Link: https://www.themagiccafe.com/forums/viewtopic.php?topic=181702&forum=2

Rank: 3 Count: 11651
Title: The Magic Cafe Forum Index >> Not very magical, still... >> New Game: You're Banned
 Link: https://www.themagiccafe.com/forums/viewtopic.php?topic=385331&forum=32&start=0

Rank: 4 Count: 8620
Title: The Magic Cafe Forum Index >> Now that’s funny! >> Gag tag
 Link: https://www.themagiccafe.com/forums/viewtopic.php?topic=230576&forum=24

Rank: 5 Count: 8359
Title: The Magic Cafe Forum Index >> Now that’s funny! >> Three words  TOPIC IS LOCKED
 Link: https://www.themagiccafe.com/forums/viewtopic.php?topic=247474&forum=24

Rank: 6 Count: 7884
Title: The Magic Cafe Forum Index >> Now that’s funny! >>

### Longest Post

In [49]:
# Find the single longest post, measured in whitespace-separated tokens.
post = None        # NOTE: holds the whole thread record containing the longest post
long_post = None   # token count of the longest post seen so far

# The encoding is named explicitly so the read does not depend on the
# platform's default locale encoding (assumes the file is UTF-8).
with open('./data/themagiccafe_all_clean.jsonlines', encoding='utf-8') as input_file:
    for row in input_file:
        # Tolerate blank lines, which json.loads would otherwise reject.
        if not row.strip():
            continue
        data = json.loads(row)
        for p in data['posts']:
            p_len = len(p['post'].split())
            # Strict '>' keeps the first post encountered when lengths tie.
            if long_post is None or p_len > long_post:
                long_post = p_len
                post = data

print(f"Longest Post Word Count: {long_post}")
print(f"Post Title: {post['title']}")
print(f"Post Link: {post['link']}")

Longest Post Word Count: 31977
Post Title: ['The Magic Cafe Forum Index', 'Ever so sleightly', 'Sponge ball routines']
Post Link: https://www.themagiccafe.com/forums/viewtopic.php?topic=488112&forum=115


### Other: threads with incomplete titles

In [None]:
# Count thread records (one per line) whose title has at most two parts.
# The title is slightly malformed for these records (the original note puts the
# figure at roughly 15k). This will be fixed in a future release.
cnt = 0
# The encoding is named explicitly so the read does not depend on the
# platform's default locale encoding (assumes the file is UTF-8).
with open('./data/themagiccafe_all_clean.jsonlines', encoding='utf-8') as input_file:
    for row in input_file:
        # Tolerate blank lines, which json.loads would otherwise reject.
        if not row.strip():
            continue
        data = json.loads(row)
        # Records without a 'title' key are not counted.
        if 'title' in data and len(data['title']) <= 2:
            cnt += 1
print(cnt)