# Text Analysis

In [73]:
import re
import nltk
import json
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK resources (only run once)
# nltk.download('punkt')
# nltk.download('stopwords')

In [81]:
# loading rules dataset
rules = {}
for i in range(1, 401):
    with open(f'data/rules-ai-v2/{i}.json', 'r') as f:
        data = json.load(f)
        for d in data:
            rules[d['subreddit']] = d['rules']

## Dataset Description

In [82]:
# no. of subreddits
unique_subreddits = set()
for subreddit in rules.keys():
    unique_subreddits.add(subreddit)
print('total subreddits:', len(rules))
print('unique subreddits:', len(unique_subreddits))
print('diff:', len(rules) - len(unique_subreddits))

total subreddits: 99786
unique subreddits: 99786
diff: 0


## Rule Frequency

In [83]:
# gather all subreddits with at least one AI-related rule
ai_rules = {}
for subreddit in rules:
    for rule in rules[subreddit]:
        if rule['ai_rule-short_name'] or rule['ai_rule-description']:
            if subreddit in ai_rules:
                ai_rules[subreddit].append(rule)
            else:
                ai_rules[subreddit] = [rule]
print('no. subreddits with an AI-related rule:', len(ai_rules))

no. subreddits with an AI-related rule: 2730


In [72]:
# visual inspection of AI-related rules
with open('ai-rules.txt', 'w') as f:
    for subreddit in ai_rules:
        for rule in ai_rules[subreddit]:
            f.write(f"subreddit: {subreddit}\nshort name: {rule['short_name']}\ndescription: {rule['description']}\n\n--------------------------\n\n")        

In [67]:
def preprocess_text(text):
    # Initialize the stemmer
    stemmer = PorterStemmer()
    # Convert to lowercase
    text = text.lower()
    # Load English stop words
    stop_words = set(stopwords.words('english'))
    # Replace links with [URL]
    text = re.sub(r'http[s]?://\S+|www\.\S+', '[URL]', text)
    # Remove non-alphanumeric characters (keep spaces)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Tokenize the text
    words = text.split()
    # Stem words
    processed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join words back into a single string
    return processed_words

def calculate_term_frequency(documents):
    terms = {}
    terms_total = 0
    for doc in documents:
        for term in doc:
            terms_total += 1
            if term in terms:
                terms[term] += 1
            else:
                terms[term] = 1
    for term in terms:
        terms[term] /= terms_total
    return terms

# Load and preprocess documents
docs = []
for subreddit in ai_rules:
    for rule in ai_rules[subreddit]:
        r = rule['description']
        if r is not None:
            docs.append(preprocess_text(rule['description']))

# Calculate term frequency
term_frequency = calculate_term_frequency(docs)

# Print term frequencies
sorted_term_frequency = sorted(term_frequency.items(), key=lambda x: x[1], reverse=True)
for item in sorted_term_frequency:
    print(f'{item[0]}: {round(item[1] * 100, 3)}%')

post: 4.122%
ai: 3.575%
imag: 3.121%
content: 1.875%
art: 1.679%
remov: 1.517%
gener: 1.479%
includ: 1.287%
must: 1.274%
titl: 1.109%
allow: 1.068%
ban: 1.0%
may: 0.923%
use: 0.876%
dont: 0.833%
text: 0.829%
except: 0.795%
qualiti: 0.795%
artist: 0.776%
result: 0.761%
photo: 0.751%
mod: 0.732%
discret: 0.721%
ask: 0.717%
publish: 0.694%
fake: 0.671%
aigener: 0.66%
pleas: 0.644%
comment: 0.621%
attempt: 0.607%
enhanc: 0.582%
watermark: 0.579%
better: 0.579%
ie: 0.577%
aienhanc: 0.577%
sole: 0.576%
offens: 0.567%
overli: 0.567%
shop: 0.564%
emoji: 0.56%
weird: 0.559%
cartoonishamateurish: 0.559%
artwork: 0.555%
seriou: 0.553%
xray: 0.553%
white: 0.553%
complain: 0.549%
bubbl: 0.549%
bar: 0.549%
black: 0.547%
border: 0.546%
link: 0.537%
subreddit: 0.522%
origin: 0.479%
meme: 0.441%
discuss: 0.434%
work: 0.421%
sourc: 0.411%
effort: 0.38%
video: 0.377%
low: 0.377%
oc: 0.364%
rule: 0.35%
see: 0.341%
also: 0.333%
creat: 0.294%
etc: 0.277%
submiss: 0.26%
credit: 0.259%
repost: 0.257%
share: 0

## Measuring level of detail

In [68]:
# distribution of length of AI rules (short name and description)
short_names = []
descriptions = []
for subreddit in ai_rules:
    for rule in ai_rules[subreddit]:
        if rule['ai_rule-short_name']:
            short_names.append(len(rule['short_name'].split(' ')))
        if rule['ai_rule-description']:
            descriptions.append(len(rule['description'].split(' ')))

plt.hist(short_names, bins=10, edgecolor='black')
plt.title('Length of Short Names')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.show()

In [69]:
# average length of descriptions
...