# Exploring Crawled Reddit Data

In [1]:
import os
import sys
from pathlib import Path
from os import PathLike

import json
from typing import Dict, Any

from urllib import parse

from datetime import date, datetime

import multiprocessing
import mysql.connector
import mysql.connector.pooling

In [2]:
import dataclasses
from dataclasses import dataclass

In [3]:
@dataclass
class Submission:
    """A single crawled Reddit submission, reduced to the fields kept for analysis.

    The field order is significant: ``dataclasses.astuple`` is used to bind an
    instance positionally to the column list of the ``insert into submissions``
    statement, so fields must stay in the same order as those columns.
    """

    is_self: bool
    name: str
    selftext: str

    # Full name of the subreddit, i.e. ``t5_<subredditID>`` (not the bare id).
    subreddit_id: str

    # Full name of the author, i.e. ``t2_<authorID>``.
    # NOTE(review): the loader fills this with ``dict.get``, so it is None when
    # the crawled record has no ``author_fullname`` key.
    author_fullname: str

    # Creation time as an integer (the loader converts it with ``int``);
    # presumably Unix epoch seconds -- confirm against the crawler.
    created: int

    # Search query that yielded this submission.
    topic: str

In [4]:
# Crawler output directory, resolved relative to the notebook's working
# directory. Each *.json file is expected to hold one submission together
# with its comments (see the loading cell that reads these files).
data_dir = Path.cwd() / 'data' / 'PushShiftAndRedditAPICrawler-output'

# Path.glob() makes no promise about ordering, so sort the result: the preview
# below and the order in which rows are later loaded are then the same on
# every run and every machine.
submissions_comments_files = sorted(data_dir.glob('*.json'))

# Show only the first few paths instead of dumping the whole listing.
submissions_comments_files[:3]

[WindowsPath('D:/gatech/courses/cs-6471-computational-social-science/project/utils/reddit/data/PushShiftAndRedditAPICrawler-output/q=privacy+policy&before=1546923600&after=1546318800&size=100&fields=id-t3_abhq6i.json'),
 WindowsPath('D:/gatech/courses/cs-6471-computational-social-science/project/utils/reddit/data/PushShiftAndRedditAPICrawler-output/q=privacy+policy&before=1546923600&after=1546318800&size=100&fields=id-t3_abi6je.json'),
 WindowsPath('D:/gatech/courses/cs-6471-computational-social-science/project/utils/reddit/data/PushShiftAndRedditAPICrawler-output/q=privacy+policy&before=1546923600&after=1546318800&size=100&fields=id-t3_abiah9.json')]

In [11]:
# Parse every crawled file into a Submission record.
# Rebuilt from scratch on each run so re-executing the cell does not
# accumulate duplicates in the list.
submissions = []

for f in submissions_comments_files:
    # Use a context manager so the file handle is closed as soon as the JSON
    # is parsed, instead of leaking one open handle per file.
    with f.open('r', encoding='utf-8') as fh:
        data: Dict[str, Any] = json.load(fh)

    # The file stem is the crawler's query string (e.g. 'q=privacy+policy&...'),
    # so parse_qs recovers the search term with '+' decoded to a space.
    topic = parse.parse_qs(f.stem)['q'][0]
    submission = data['submission']['data']
    # Not used by this cell; kept because it is a notebook-level name that
    # other cells may read.
    comments = data['comments']

    try:
        submissions.append(Submission(
            is_self=submission.get('is_self'),
            name=submission.get('name'),
            selftext=submission.get('selftext'),
            subreddit_id=submission.get('subreddit_id'),
            author_fullname=submission.get('author_fullname'),
            # int(None) raises TypeError when 'created' is missing; that
            # record is skipped by the handler below.
            created=int(submission.get('created')),
            topic=topic
        ))
    except Exception as e:
        # Best-effort load: skip the malformed record, but say which file it
        # came from so the problem can actually be tracked down.
        print(f'Skipping {f.name}: {e}')

In [12]:
# Peek at the first parsed record to sanity-check the field mapping.
submissions[0]

Submission(is_self=True, name='t3_abhq6i', selftext='[deleted]', subreddit_id='t5_2qhlc', author_fullname=None, created=1546344380, topic='privacy policy')

In [15]:
# Write all parsed submissions to the `submissions` table in one transaction.
# NOTE(review): re-running this cell inserts the same rows again unless the
# table enforces a unique key -- confirm against the schema.

# Placeholders are bound by position, in Submission field order
# (dataclasses.astuple), so values are never interpolated into the SQL text.
insert_submission = "insert into submissions (is_self, name, selftext, subreddit_id, author_fullname, created, topic) values (%s, %s, %s, %s, %s, %s, %s)"

connection = mysql.connector.connect(
    user='root',
    host='localhost',
    database='cs6471',
    pool_name='python',
    # Connector/Python rejects pools larger than 32 connections, so cap the
    # size to keep this cell working on machines with many cores.
    pool_size=min(multiprocessing.cpu_count(), 32)
)

try:
    # One cursor is enough for the whole batch.
    cursor = connection.cursor()
    try:
        for s in submissions:
            cursor.execute(
                insert_submission,
                dataclasses.astuple(s)
            )
            print(f'Written submission {s.name} to database.')
    finally:
        cursor.close()

    # Nothing is persisted until this commit succeeds.
    connection.commit()
except Exception:
    # Discard the partial batch so a failed run leaves the table unchanged,
    # then let the original error surface.
    connection.rollback()
    raise
finally:
    # Always release the connection, even when an insert fails.
    connection.close()

Written submission t3_abhq6i to database.
Written submission t3_abi6je to database.
Written submission t3_abiah9 to database.
Written submission t3_abltgc to database.
Written submission t3_ablxx1 to database.
Written submission t3_abm08t to database.
Written submission t3_abm5w0 to database.
Written submission t3_abm8pa to database.
Written submission t3_abnffw to database.
Written submission t3_abog2j to database.
Written submission t3_abpfe9 to database.
Written submission t3_absycb to database.
Written submission t3_abt5z2 to database.
Written submission t3_abv30r to database.
Written submission t3_abvu1f to database.
Written submission t3_abx9x3 to database.
Written submission t3_abxght to database.
Written submission t3_abxjpw to database.
Written submission t3_abytcj to database.
Written submission t3_ac0yyo to database.
Written submission t3_ac1iet to database.
Written submission t3_ac2q9q to database.
Written submission t3_ac2ypm to database.
Written submission t3_ac4zev to da