In [1]:
import requests
import re
import pandas as pd
import json
import time
import numpy as np

# requires pip install sentence-transformers==2.2.2
from sentence_transformers import SentenceTransformer

In [2]:
def get_reddit(subreddit, after=None):
    """Return one page of a subreddit's "controversial" listing as parsed JSON.

    A page is read from its local snapshot file when one exists
    (``reddit-<subreddit>.txt`` for the first page,
    ``reddit-<subreddit>_<after>.txt`` for later pages). When the snapshot is
    missing, the page is downloaded from Reddit and stored under that name so
    that later runs can work offline.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        after: Id of the last post of the previous page (without the ``t3_``
            prefix), or None to load the first page.

    Returns:
        The decoded JSON document of the listing page.

    Raises:
        requests.HTTPError: If the page has to be downloaded and the server
            answers with an error status.
    """
    source_file = f'reddit-{subreddit}.txt'
    base_url = f'https://www.reddit.com/r/{subreddit}/controversial.json?limit=100&t=year'

    if after is not None:
        base_url += f'&after=t3_{after}'
        source_file = f'reddit-{subreddit}_{after}.txt'

    print(f'Loading posts from {base_url}')

    # Prefer the local snapshot: it keeps the notebook reproducible and offline.
    try:
        with open(source_file, "r", encoding="utf-8") as infile:
            return json.load(infile)
    except FileNotFoundError:
        pass

    # Imported lazily: only needed when no snapshot exists for this page.
    import requests

    # Throttle real requests only; cached reads do not need to wait.
    time.sleep(0.5)
    response = requests.get(base_url, headers={'User-agent': 'development'}, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # Cache the page so the next run does not hit the network again.
    with open(source_file, "w", encoding="utf-8") as outfile:
        json.dump(payload, outfile)
    return payload

In [3]:
# Collect up to 10 pages (at most 100 posts each) of the "controversial"
# listing for every subreddit, keyed by subreddit name.
raw_posts = {}
for subreddit in ['bullying', 'selfimprovement', 'depression', 'FengShui']:
    raw_posts[subreddit] = []
    last_identifier = None
    for _ in range(0, 10):
        raw_response = get_reddit(subreddit, after=last_identifier)
        children = raw_response['data']['children']

        if not children:
            # Listing exhausted: the cursor would not advance any more, so
            # further iterations would only re-read this same empty page.
            break

        raw_posts[subreddit].extend(children)
        # The id of the last post on this page is the cursor for the next page.
        last_identifier = children[-1]['data']['id']

Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_w9924g
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_zagcku
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_ycqpjc
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_xf1ifr
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_wbzt8u
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_vd11ym
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_ufos2a
Loading posts from https://www.reddit.com/r/bullying/controversial.json?limit=100&t=year&after=t3_ta46qm
Loading posts from https://www.reddit.com/r/bullying/controversial.json

In [4]:
subreddits = ['bullying', 'selfimprovement', 'depression', 'FengShui']

# Sentence boundary: optional spaces, one of . ? !, any run of closing
# quotes/brackets, then optional spaces.
sentence_boundary = re.compile(r' *[\.\?!][\'"\)\]]* *')

# One (sentence id, sentence text, source subreddit) record per fragment; the
# sentence id is "<post id>_<position of the fragment within the post>".
prepared_sentences = []
for subreddit in subreddits:
    for post in raw_posts[subreddit]:
        post_data = post['data']
        post_id = post_data['id']
        fragments = sentence_boundary.split(post_data['selftext'])
        prepared_sentences.extend(
            (post_id + '_' + str(position), fragment, subreddit)
            for position, fragment in enumerate(fragments)
        )

labeled_sentences = pd.DataFrame.from_records(prepared_sentences, columns=['id', 'text', 'origin'])

In [5]:
# Preview the sentence table (columns: id, text, origin).
labeled_sentences

Unnamed: 0,id,text,origin
0,x6d5qc_0,I was bullied by girls all my life but I would...,bullying
1,x6d5qc_1,They can't even bully rightly like verbally as...,bullying
2,x6d5qc_2,Nothing\nThey can't even bully rightly \nAlso ...,bullying
3,x6d5qc_3,\nAlso don't watch a woman's cunt only a man's...,bullying
4,x6d5qc_4,,bullying
...,...,...,...
39796,rpabe4_8,width=3371&amp;format=png&amp;auto=webp&amp;s=...,FengShui
39797,rp6j88_0,Just wondering if anyone else is doing a deep...,FengShui
39798,rp6j88_1,I got into the cleaning spirit on bedroom clos...,FengShui
39799,rp6j88_2,"Planning on painting by spring , so cleaning n...",FengShui


In [6]:
# Raw string: the pattern is full of regex escapes (\: \/ \. ...) that are not
# valid Python string escapes and trigger warnings in a normal string literal.
url_regex = r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*'

# Keep sentences longer than 20 characters that do not start with a URL
# (`str.match` anchors the pattern at the beginning of each string).
# NOTE(review): the sentence-splitting cell earlier in this notebook splits on
# every '.', so no sentence can contain the literal '.' this pattern requires.
# As written the URL filter therefore likely removes nothing (fragments such as
# 'width=...&amp;format=png' survive). Consider stripping URLs before splitting.
is_long_enough = labeled_sentences.text.str.len() > 20
starts_with_url = labeled_sentences.text.str.match(url_regex)

# Explicit copy so that later cells can add columns without pandas warning
# about writing into a slice of `labeled_sentences`.
filtered_sentences = labeled_sentences[is_long_enough & ~starts_with_url].copy()

In [7]:
# Preview the rows that passed the length and URL filters.
filtered_sentences

Unnamed: 0,id,text,origin
0,x6d5qc_0,I was bullied by girls all my life but I would...,bullying
1,x6d5qc_1,They can't even bully rightly like verbally as...,bullying
2,x6d5qc_2,Nothing\nThey can't even bully rightly \nAlso ...,bullying
3,x6d5qc_3,\nAlso don't watch a woman's cunt only a man's...,bullying
5,vcbknn_0,"""Bullying is coercion--Implicit or explicit th...",bullying
...,...,...,...
39790,rpabe4_2,"In other words, all the other houses on that s...",FengShui
39796,rpabe4_8,width=3371&amp;format=png&amp;auto=webp&amp;s=...,FengShui
39797,rp6j88_0,Just wondering if anyone else is doing a deep...,FengShui
39798,rp6j88_1,I got into the cleaning spirit on bedroom clos...,FengShui


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

# Binary target: True for sentences from r/bullying or r/depression, False for
# the other subreddits. `assign` builds a new frame instead of writing into a
# filtered slice, which avoids pandas' SettingWithCopyWarning.
filtered_sentences = filtered_sentences.assign(
    label=filtered_sentences['origin'].isin(['bullying', 'depression'])
)

# Hold out 20% of the sentences for evaluation; fixed seed for a repeatable split.
train_data, test_data = train_test_split(filtered_sentences, test_size=0.2, random_state=42, shuffle=True)

# There are better, more expensive models: https://www.sbert.net/docs/pretrained_models.html
model = SentenceTransformer('all-MiniLM-L6-v2')

# Wrap the sentence encoder so it can be applied to a data frame: it embeds the
# contents of the 'text' column.
encoder = FunctionTransformer(lambda row: model.encode(row['text'].values))

X_train = encoder.transform(train_data)
X_test = encoder.transform(test_data)

In [9]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
# Use the public `tensorflow.keras` paths throughout: objects from the private
# `tensorflow.python.keras` package must not be mixed with `tensorflow.keras`
# ones (on newer TensorFlow releases they are different implementations).
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


def create_model(input_dim=384):
    """Build and compile a small feed-forward classifier.

    The network has two hidden ReLU layers (16 and 8 units) followed by a
    2-unit softmax output. It is compiled with categorical cross-entropy, plain
    SGD and accuracy as the reported metric.

    Args:
        input_dim: Width of the input feature vectors. The default of 384 is
            presumably the embedding size of the sentence encoder used in this
            notebook -- confirm if the encoder is changed.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    clf = Sequential()
    clf.add(Dense(16, activation='relu', input_dim=input_dim))
    clf.add(Dense(8, activation='relu'))
    clf.add(Dense(2, activation='softmax'))
    clf.compile(loss='categorical_crossentropy', optimizer=SGD(), metrics=["accuracy"])
    return clf


# scikit-learn style wrapper: builds the model through `create_model` and
# trains it for 20 epochs with mini-batches of 64.
neuralnet = KerasClassifier(build_fn=create_model, epochs=20, batch_size=64, verbose=1)

In [10]:
# Train the wrapped classifier on the training features and their 'label' column.
neuralnet.fit(X_train, train_data['label'])

2022-12-29 12:05:37.453968: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-29 12:05:37.623849: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x163a50b50>

In [11]:
# Score the trained classifier on the held-out test split.
neuralnet.score(X_test, test_data['label'])



0.7467399835586548