# Exploring Crawled Reddit Data

In [1]:
import os
import sys
from pathlib import Path
from os import PathLike

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import json
from typing import Dict, Any

from urllib import parse

from datetime import date, datetime

import multiprocessing
import mysql.connector
import mysql.connector.pooling

In [2]:
import dataclasses
from dataclasses import dataclass

In [3]:
@dataclass
class Submission:
    """Flat record of one crawled Reddit submission.

    Only the fields this notebook works with are kept. Field order matters:
    it defines the positional order of the generated ``__init__``.
    """

    # Value of the submission's `is_self` field.
    is_self: bool
    # Value of the submission's `name` field.
    name: str
    # Body text of the submission.
    selftext: str
    # Despite the name, this is the full name of the subreddit: t5_<subredditID>.
    subreddit_id: str
    # Full name of the author: t2_<authorID>.
    author_fullname: str
    # Creation time as an integer (looks like a Unix epoch in seconds; TODO confirm).
    created: int
    # Search query that yielded this submission.
    topic: str

In [4]:
# Crawler output directory, resolved against the current working directory:
# the notebook has to be started from the folder that contains `data/`.
data_dir = Path.cwd() / 'data/PushShiftAndRedditAPICrawler-output'
# glob() yields entries in arbitrary, filesystem-dependent order. Sorting makes
# the file order (and anything built from this list) reproducible across runs.
submissions_comments_files = sorted(data_dir.glob('*.json'))
submissions_comments_files[:3]

[WindowsPath('D:/gatech/courses/cs-6471-computational-social-science/project/utils/reddit/data/PushShiftAndRedditAPICrawler-output/q=privacy+policy&before=1546923600&after=1546318800&size=100&fields=id-t3_abhq6i.json'),
 WindowsPath('D:/gatech/courses/cs-6471-computational-social-science/project/utils/reddit/data/PushShiftAndRedditAPICrawler-output/q=privacy+policy&before=1546923600&after=1546318800&size=100&fields=id-t3_abi6je.json'),
 WindowsPath('D:/gatech/courses/cs-6471-computational-social-science/project/utils/reddit/data/PushShiftAndRedditAPICrawler-output/q=privacy+policy&before=1546923600&after=1546318800&size=100&fields=id-t3_abiah9.json')]

In [6]:
# Build one Submission record per crawled JSON file.
submissions = []

for f in submissions_comments_files:
    # Context manager closes each file handle as soon as it has been parsed;
    # passing f.open(...) straight to json.load would leave it open.
    with f.open('r', encoding='utf-8') as fh:
        data: Dict[str, Any] = json.load(fh)
    # The file stem is a URL query string ("q=...&before=...&after=...");
    # its `q` parameter is the search query that produced this file.
    topic = parse.parse_qs(f.stem)['q'][0]
    submission = data['submission']['data']
    # Not used further in this cell; kept so the name stays defined and a file
    # without a 'comments' key still fails loudly here.
    comments = data['comments']

    submissions.append(Submission(
        is_self=submission.get('is_self'),
        name=submission.get('name'),
        selftext=submission.get('selftext'),
        subreddit_id=submission.get('subreddit_id'),
        author_fullname=submission.get('author_fullname'),
        # int(None) raises TypeError, so a record without 'created' is not silently accepted.
        created=int(submission.get('created')),
        topic=topic
    ))


In [7]:
# Convert the list of Submission records into a table (one column per field).
# Note: from here on the name `submissions` refers to the DataFrame, not the list.
submissions = pd.DataFrame(data=submissions)
submissions.head(n=5)

Unnamed: 0,is_self,name,selftext,subreddit_id,author_fullname,created,topic
0,True,t3_abhq6i,[deleted],t5_2qhlc,,1546344380,privacy policy
1,True,t3_abi6je,This post will bring me a lot of downvotes but...,t5_2tk0s,t2_god9h,1546349238,privacy policy
2,True,t3_abiah9,[deleted],t5_2qh1q,,1546350396,privacy policy
3,False,t3_abltgc,,t5_skup3,t2_rln9c,1546375647,privacy policy
4,True,t3_ablxx1,\n\n￼\n\nLATEST NEWS\n\nFirewall Zero Hour Dev...,t5_9krdo,t2_1alkf8uj,1546376448,privacy policy


In [10]:
# Keep only submissions that carry real body text. Dropped rows are:
#   - missing selftext (None/NaN), which would otherwise pass a plain `!=` check,
#   - the empty string,
#   - the "[deleted]" / "[removed]" placeholder values.
# .copy() makes the result an independent frame rather than a slice of
# `submissions`, so later column assignments do not hit SettingWithCopyWarning.
df_submissions_selftext: pd.DataFrame = submissions[
    submissions['selftext'].notna()
    & ~submissions['selftext'].isin(["", "[deleted]", "[removed]"])
].copy()
df_submissions_selftext.head()

Unnamed: 0,is_self,name,selftext,subreddit_id,author_fullname,created,topic
1,True,t3_abi6je,This post will bring me a lot of downvotes but...,t5_2tk0s,t2_god9h,1546349238,privacy policy
4,True,t3_ablxx1,\n\n￼\n\nLATEST NEWS\n\nFirewall Zero Hour Dev...,t5_9krdo,t2_1alkf8uj,1546376448,privacy policy
6,True,t3_abm5w0,Let's narrow the focus of r/HailData to\n\n***...,t5_3nvqv,t2_oimkt,1546377847,privacy policy
8,True,t3_abnffw,"So, a bit of backstory: \n\nI am a violinist, ...",t5_2xhvq,t2_aee7w,1546386129,privacy policy
9,True,t3_abog2j,[Cryptology ePrint Archive: Report 2018/415](h...,t5_se72n,t2_hj8v9,1546393052,privacy policy


In [12]:
# Extract the text column as a plain Python list, then show the first entry.
selftexts = df_submissions_selftext.loc[:, 'selftext'].to_list()
selftexts[0]

"This post will bring me a lot of downvotes but my need to express myself surpasses those.\n\nThe whole thing started with writing to discord support why the nitro (now nitro classic) is so expensive when this option came out. They said because its new... and wanted if i remember correctly 10$\\~\n\n&#x200B;\n\nNow after all this time they still want 60$ a year (12 x 5$\\~) that a price for a one time Triple A-Game/software titel payment (like red redemption 2 and so on). As for discord you get the feature FullHD( I shorten it to FH) screen share (all other program that support screenshare offer it for free for non commercial use, skype, team-viewer etc. !). custom emotes and uploading cap to 50 mb? Even telegram doing it for free. (I have no experience with whatsapp, snapchat or others) as for the rest of the nitro classic feature they aren’t even worth to mention for this pricetag. And for this pricetag they don’t even offer to stop collecting your data or stop keeping logs! As for t