<a href="https://colab.research.google.com/github/radonys/Reddit-HateSpeech-Application/blob/master/Reddit_HateSpeech_UserInput_Processor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reddit Hate-Speech User Input Collection

## Libraries

### Install

In [0]:
#!pip install praw

### Import

In [17]:
import sklearn
import pickle
import praw
import re
from bs4 import BeautifulSoup
import nltk
import numpy as np
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Variable Declaration

In [0]:
# Reddit API client. The '#' values are placeholders: supply real credentials
# at run time (e.g. from environment variables) and never commit them.
reddit = praw.Reddit(client_id='#', client_secret='#', user_agent='#', username='#', password='#')

# Load the trained classifier. The context manager closes the file handle
# promptly instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
with open('Models/finalized_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

# Characters that are replaced by a single space during cleaning. A raw string
# keeps the regex escapes (\[, \], \|) from being parsed as invalid Python
# string escapes, which newer Python versions warn about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character outside digits, lowercase ASCII letters, space, '#', '+', '_'
# is deleted during cleaning.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# English stop words (from NLTK) that are dropped from cleaned text.
STOPWORDS = set(stopwords.words('english'))

## Utility Functions

In [0]:
def clean_text(text):
    """Normalise a piece of raw text before it is passed to the classifier.

    Steps, in order: strip markup with BeautifulSoup (lxml parser), lowercase,
    replace the characters matched by REPLACE_BY_SPACE_RE with a space, delete
    the characters matched by BAD_SYMBOLS_RE, and drop tokens in STOPWORDS.

    Returns the remaining tokens joined by single spaces.
    """
    lowered = BeautifulSoup(text, "lxml").text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    # NOTE(review): BAD_SYMBOLS_RE also matches newlines, so words separated
    # only by a line break are fused together. Presumably the model was
    # trained with this same preprocessing -- confirm before changing it.
    filtered = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in filtered.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [0]:
def comment_hate_check(comments, loaded_model):
  """Classify each comment and return the predictions as a NumPy array.

  Args:
    comments: sequence of raw comment strings.
    loaded_model: object with a ``predict`` method that accepts a list
      containing one cleaned string and returns a single prediction.

  Returns:
    A float array of length ``len(comments)``; element ``i`` holds the
    model's prediction for ``comments[i]``. Empty input gives an empty array.
  """
  hate = np.zeros(len(comments))

  for idx, raw_comment in enumerate(comments):
    prediction = loaded_model.predict([clean_text(raw_comment)])
    # predict() returns a one-element array. Extract the scalar explicitly:
    # assigning a 1-element array to a scalar slot is deprecated in recent
    # NumPy releases. .item() still raises if more than one value comes back.
    hate[idx] = np.asarray(prediction).item()

  return hate

## Input URL Processing

In [0]:
def url_process(url, loaded_model, max_comments=5):
  """Fetch a Reddit submission by URL and run the classifier on its text.

  Uses the module-level ``reddit`` client to load the submission, collects
  the first ``max_comments`` top-level comment bodies, cleans the title and
  self-text with ``clean_text`` and classifies title, body and comments.

  Args:
    url: full URL of the Reddit submission.
    loaded_model: object with a ``predict`` method taking a list of strings.
    max_comments: maximum number of top-level comments to collect
      (default 5, the value that was previously hard-coded).

  Returns:
    A dict with the keys ``title`` and ``body`` (cleaned text), ``url``,
    ``id``, ``comm_num``, ``created``, ``hate_user_input``, ``comment``
    (raw comment bodies), ``hate_body``, ``hate_title`` (raw ``predict``
    outputs) and ``hate_comment_onehot`` (per-comment predictions).
  """
  submission = reddit.submission(url=url)

  data = {}

  data['title'] = submission.title
  data['url'] = submission.url
  data['id'] = submission.id
  data['comm_num'] = submission.num_comments
  data['created'] = submission.created
  data['body'] = submission.selftext

  # limit=0 discards the "load more comments" placeholders without issuing
  # extra requests. Only the first few top-level comments are used, so
  # fetching the whole tree (limit=None) was wasted network time.
  submission.comments.replace_more(limit=0)

  comments = []
  for top_level_comment in submission.comments:
    if len(comments) >= max_comments:
      break
    comments.append(top_level_comment.body)

  # NOTE(review): this flag is always set to 1; its meaning is not visible
  # in this notebook -- confirm with the consumer of this dict.
  data['hate_user_input'] = 1

  # Comments are stored raw; they are cleaned inside comment_hate_check.
  data["comment"] = comments

  data['title'] = clean_text(data['title'])
  data['body'] = clean_text(data['body'])

  data['hate_body'] = loaded_model.predict([data['body']])
  data['hate_title'] = loaded_model.predict([data['title']])
  data['hate_comment_onehot'] = comment_hate_check(comments, loaded_model)

  return data

In [45]:
# Example run on a sample r/politics thread using the loaded model; the
# returned dict is displayed as the cell output.
url_process('https://www.reddit.com/r/politics/comments/gkwi25/saturday_morning_political_cartoon_thread/', model)

{'body': 'saturday folks lets kick back cup coffee share cartoonsfeel free share political cartoons thread besides usual civility policy three rules follow1 every toplevel comment must contain political cartoon means textonly toplevel comments2 must _original cartoon_ means photographs edited cartoons memes image macros oc allowed animation 3 toplevel comment maximum 3 cartoons thats enjoy weekend',
 'comm_num': 231,
 'comment': ['[Matt Davies, Newsday](https://projects.newsday.com/cartoons/opinion/matt-davies-political-cartoons/)\n\n* [Ignoring the Real Hazard](https://i.imgur.com/Z7AqCtp.png)',
  'Bruce MacKinnon [Imgur](https://i.imgur.com/hVahSfI.jpg)',
  'Nick Anderson: [Armed Protesters](https://pbs.twimg.com/media/EYCuX9_XsAAniAG.jpg)\n\nNick Anderson: [Preparation](https://pbs.twimg.com/media/EYBzNWyXgAE5oqC.jpg)\n\nMichael de Adder: [How Facebook Eliminates Fake News](https://pbs.twimg.com/media/EX-wwHjWkAEqw_x.jpg)',
  'Clay Jones: \n\n[Fox v. Fauci](https://claytoonz.files.w