# Exploring Crawled Reddit Data

In [1]:
import json
import locale

import os
import sys
from pathlib import Path
from os import PathLike

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# time
import time
from datetime import datetime

# typing
from typing import List, Dict, Set, Union, Any

# gensim
import gensim
import gensim.corpora

# jupyter
from tqdm import tqdm
tqdm.pandas()

%load_ext autoreload
%autoreload 2

# we will be using utf-8 for io encoding decoding
# NOTE(review): PYTHONUTF8 is normally read when the interpreter starts, so setting it
# here likely only affects child processes spawned from this kernel — confirm.
# The file reads/writes in this notebook pass encoding='utf-8' explicitly anyway.
os.environ['PYTHONUTF8'] = '1'

In [2]:
# Notebook-level switches and settings that control how later cells run/behave.

# When truthy, the DB export cell writes the dataframes to the SQL database.
write_to_db = False

# Connection parameters used to build the SQLAlchemy URL in the DB export cell.
db_config = dict(
    host='10.20.4.246',
    port=3306,
    user='sandman',
    database='cs6471',
)

In [3]:
# show the current working directory: the data paths used by the following cells
# are all built relative to Path.cwd()
Path.cwd()

WindowsPath('D:/gatech/courses/cs-6471-computational-social-science/project/utils/reddit')

In [4]:
# Directory with the crawler output; every *.json file in it is loaded by the next cell.
data_dir = Path.cwd() / 'data/PushShiftAndRedditAPICrawler-output'

# glob() yields paths in an arbitrary, filesystem-dependent order. Sort them so the
# row order (and therefore the index) of the dataframes built from these files is
# the same on every run and on every machine.
data_files = sorted(data_dir.glob('*.json'))

In [5]:
# Load submissions and comments from the crawled json files.
# Each file contributes exactly one entry to `all_submissions` and zero or more
# entries to `all_comments`.
all_submissions = []
all_comments = []

for f in tqdm(data_files):
    f: Path
    # Use a context manager so every file handle is closed as soon as it has been
    # parsed, instead of leaking one open handle per file until garbage collection.
    with f.open(mode='r', encoding='utf-8') as fp:
        data = json.load(fp)

    # the submission payload is wrapped under its 'data' key
    submission = data['submission']['data']

    # every entry of data['comments'] wraps the actual comment object under its
    # 'data' key, e.g. {'type': 't1', 'data': <actual comment data object>}
    comments = [c['data'] for c in data['comments']]

    all_submissions.append(submission)
    all_comments.extend(comments)

100%|██████████| 22767/22767 [01:32<00:00, 247.29it/s] 


In [6]:
# Submission fields kept for the analysis, in the column order of the resulting
# dataframe. The trailing notes record the dtype / meaning of each field.
used_columns_df_submissions = [
    # -- author
    'author',
    'author_fullname',
    # -- submission identity and timing
    'created_utc',              # float64
    'name',                     # text, used as the fullname
    # -- engagement counters
    'num_comments',             # int64
    'num_crossposts',           # int64
    'num_duplicates',           # int64
    'score',                    # int64, number of upvotes
    # -- body
    'selftext',                 # longtext
    # -- subreddit
    'subreddit_id',             # text, fullname of the subreddit
    'subreddit_name_prefixed',  # text, prefixed name of the subreddit
    'subreddit_subscribers',    # int64
    'subreddit_type',           # text
    # -- headline
    'title',                    # text
]

# Build the frame from the raw records first, then narrow it to the columns above.
df_submissions = pd.DataFrame(all_submissions)
df_submissions = df_submissions[used_columns_df_submissions]
df_submissions.head()

Unnamed: 0,author,author_fullname,created_utc,name,num_comments,num_crossposts,num_duplicates,score,selftext,subreddit_id,subreddit_name_prefixed,subreddit_subscribers,subreddit_type,title
0,[deleted],,1546344000.0,t3_abhq6i,2,0,0,3,[deleted],t5_2qhlc,r/privacy,1241659,public,Anyone tried Jami Messenger?
1,amboris,t2_god9h,1546349000.0,t3_abi6je,9,0,0,5,This post will bring me a lot of downvotes but...,t5_2tk0s,r/unpopularopinion,2347620,public,This is why i hate Discord and you should to! :/
2,[deleted],,1546350000.0,t3_abiah9,43,0,0,218,[deleted],t5_2qh1q,r/india,748669,public,Ola's Privacy Policy is creepy
3,EndMeetsEnd,t2_rln9c,1546376000.0,t3_abltgc,0,0,1,1,,t5_skup3,r/TheLibertarianProject,2,public,Data Privacy Scandals and Public Policy Pickin...
4,Armane407,t2_1alkf8uj,1546376000.0,t3_ablxx1,1,0,0,0,\n\n￼\n\nLATEST NEWS\n\nFirewall Zero Hour Dev...,t5_9krdo,r/FireWallZeroHour,5452,public,Nice to know


In [7]:
# Comment fields kept for the analysis, in the column order of the resulting
# dataframe. The trailing notes record the dtype / meaning of each field.
used_columns_df_comments = [
    # -- author
    'author',
    'author_fullname',
    # -- content
    'body',                     # longtext, content of the comment
    'controversiality',         # float64
    'created_utc',              # float64
    # -- position in the thread
    'depth',                    # int
    'name',                     # text, fullname of the comment
    'parent_id',                # text, fullname of the parent
    'score',                    # int64
    # -- subreddit
    'subreddit_id',             # text, fullname of the subreddit
    'subreddit_name_prefixed',
]

# Build the frame from the raw records first, then narrow it to the columns above.
df_comments = pd.DataFrame(all_comments)
df_comments = df_comments[used_columns_df_comments]
df_comments.head()

Unnamed: 0,author,author_fullname,body,controversiality,created_utc,depth,name,parent_id,score,subreddit_id,subreddit_name_prefixed
0,[deleted],,[deleted],0.0,1546554000.0,0,t1_ed6jj48,t3_abhq6i,5.0,t5_2qhlc,r/privacy
1,tedkotz,t2_hnvc9,They are not a service provider. That is proba...,0.0,1554926000.0,0,t1_ekknevu,t3_abhq6i,1.0,t5_2qhlc,r/privacy
2,AutoModerator,t2_6l4z3,Hi everyone! Please make sure to **upvote** we...,0.0,1546349000.0,0,t1_ed0ev41,t3_abi6je,1.0,t5_2tk0s,r/unpopularopinion
3,Flamingpanda2000,t2_1bd437bt,Nobody’s going to downvote your opinion becaus...,0.0,1546350000.0,0,t1_ed0f60a,t3_abi6je,8.0,t5_2tk0s,r/unpopularopinion
4,Vic9420,t2_mlbi9,"I use Discord,but only as a way to chat with a...",0.0,1546350000.0,0,t1_ed0f89z,t3_abi6je,2.0,t5_2tk0s,r/unpopularopinion


In [8]:
if write_to_db:
    # Optional export of both dataframes to the SQL database described by `db_config`.
    # sqlalchemy is imported here because it is only needed when the export is enabled.
    import sqlalchemy
    import sqlalchemy.dialects.mysql
    from sqlalchemy.dialects.mysql import TEXT, LONGTEXT, BIGINT, DOUBLE

    print(f'Using SQL DB, config: {db_config}')

    # get a database connection
    db_url = f'mysql+mysqlconnector://{db_config["user"]}@{db_config["host"]}:{db_config["port"]}/{db_config["database"]}'
    engine = sqlalchemy.create_engine(db_url, echo=False)
    print(vars(engine))

    # Explicit SQL column types for the `submissions` table.
    # selftext is very long, so it has to be declared as LONGTEXT explicitly.
    submissions_sql_dtypes = {
        'author': TEXT,
        'author_fullname': TEXT,
        'created_utc': BIGINT,            # float64 in the dataframe
        'name': TEXT,                     # fullname of the submission
        'num_comments': BIGINT,           # int64
        'num_crossposts': BIGINT,         # int64
        'num_duplicates': BIGINT,         # int64
        'score': BIGINT,                  # int64, number of upvotes
        'selftext': LONGTEXT,             # longtext
        'subreddit_id': TEXT,             # fullname of the subreddit
        'subreddit_name_prefixed': TEXT,  # prefixed name of the subreddit
        'subreddit_subscribers': BIGINT,  # int64
        'subreddit_type': TEXT,
        'title': TEXT,
    }

    # Explicit SQL column types for the `comments` table.
    # body is very long, so it has to be declared as LONGTEXT explicitly.
    comments_sql_dtypes = {
        'author': TEXT,
        'author_fullname': TEXT,
        'body': LONGTEXT,                 # content of the comment
        'controversiality': DOUBLE,       # float64
        'created_utc': BIGINT,            # float64 in the dataframe
        'depth': BIGINT,                  # int
        'name': TEXT,                     # fullname of the comment
        'parent_id': TEXT,                # fullname of the parent
        'score': BIGINT,                  # int64
        'subreddit_id': TEXT,             # fullname of the subreddit
        'subreddit_name_prefixed': TEXT,
    }

    # options shared by both writes: replace an existing table, multi-row inserts
    # in chunks of 512 rows
    to_sql_options = dict(
        con=engine,
        if_exists='replace',
        method='multi',
        chunksize=512,
    )

    print('Writing submissions dataframe to DB')
    df_submissions.to_sql(name='submissions', dtype=submissions_sql_dtypes, **to_sql_options)

    print('Writing comments to DB')
    df_comments.to_sql(name='comments', dtype=comments_sql_dtypes, **to_sql_options)

## Submissions with content (selftext is not empty, deleted, or removed)

In [9]:
# Keep only the submissions whose selftext carries real content, i.e. it is neither
# one of the '[deleted]' / '[removed]' placeholders nor the empty string.
df_contented_submissions: pd.DataFrame = df_submissions[
    lambda frame: frame['selftext'].ne('[deleted]')
    & frame['selftext'].ne('[removed]')
    & frame['selftext'].ne('')
]
df_contented_submissions.head()

Unnamed: 0,author,author_fullname,created_utc,name,num_comments,num_crossposts,num_duplicates,score,selftext,subreddit_id,subreddit_name_prefixed,subreddit_subscribers,subreddit_type,title
1,amboris,t2_god9h,1546349000.0,t3_abi6je,9,0,0,5,This post will bring me a lot of downvotes but...,t5_2tk0s,r/unpopularopinion,2347620,public,This is why i hate Discord and you should to! :/
4,Armane407,t2_1alkf8uj,1546376000.0,t3_ablxx1,1,0,0,0,\n\n￼\n\nLATEST NEWS\n\nFirewall Zero Hour Dev...,t5_9krdo,r/FireWallZeroHour,5452,public,Nice to know
6,grantph,t2_oimkt,1546378000.0,t3_abm5w0,0,0,0,24,Let's narrow the focus of r/HailData to\n\n***...,t5_3nvqv,r/HailData,2423,public,The Future of HailData (2019)
8,avacash,t2_aee7w,1546386000.0,t3_abnffw,40,0,0,1,"So, a bit of backstory: \n\nI am a violinist, ...",t5_2xhvq,r/AmItheAsshole,3659503,public,AITA for getting mad when people compliment my...
9,dj-gutz,t2_hj8v9,1546393000.0,t3_abog2j,0,0,0,0,[Cryptology ePrint Archive: Report 2018/415](h...,t5_se72n,r/myrXiv,16,public,Flux: Revisiting Near Blocks for Proof-of-Work...


### Dumping selftext to file

In [10]:
dump_to_file = True # toggle this
if dump_to_file:

    save_dir = Path.cwd() / 'data/all_selftexts'

    def dumps(row: pd.Series) -> None:
        """Write the selftext of one submission row to its own UTF-8 text file.

        The file is created inside ``save_dir``; its name is assembled from the
        row's ``author_fullname``, ``created_utc``, ``name`` and ``subreddit_id``
        values (an empty string is used for any key missing from the row).

        :param row: one row of the submissions dataframe
        """
        save_filename = (
            f'contented-authorfullname_{row.get("author_fullname", "")}'
            f'-createdutc_{row.get("created_utc", "")}'
            f'-name_{row.get("name", "")}'
            f'-subredditid_{row.get("subreddit_id", "")}.txt'
        )
        (save_dir / save_filename).write_text(row.get('selftext', ''), encoding='utf-8')

    # Create the output directory together with any missing parents. exist_ok=True
    # replaces the separate exists() check, which was racy and still failed when
    # the parent 'data' directory was absent.
    save_dir.mkdir(parents=True, exist_ok=True)
    df_contented_submissions.progress_apply(dumps, axis=1)

100%|██████████| 13698/13698 [00:03<00:00, 3483.35it/s]


## Submissions within a time range

In [11]:
# Half-open interval [start, end) covering the calendar year 2020, as epoch seconds.
# `created_utc` is a UTC epoch timestamp, so the bounds must be UTC instants as well.
# A naive datetime(...).timestamp() is interpreted in the machine's local timezone
# and would shift the window by the local UTC offset.
time_range = (
    pd.Timestamp('2020-01-01', tz='UTC').timestamp(),
    pd.Timestamp('2021-01-01', tz='UTC').timestamp(),
)
target_df = df_contented_submissions
# start bound inclusive, end bound exclusive
time_mask = (target_df['created_utc'] >= time_range[0]) & (target_df['created_utc'] < time_range[1])
target_df = target_df[time_mask]
target_df.head()

Unnamed: 0,author,author_fullname,created_utc,name,num_comments,num_crossposts,num_duplicates,score,selftext,subreddit_id,subreddit_name_prefixed,subreddit_subscribers,subreddit_type,title
3935,SensibleHumanBeing,t2_3m9wl1fx,1577862000.0,t3_eiff52,1,0,0,2,What is happening here? I want to read it but ...,t5_2b23q1,r/Fruitlab,177,public,Why does the privacy policy seemingly just not...
3937,PPCexpo-us,t2_4aa5k9kq,1577871000.0,t3_eigmr6,0,0,0,13,"In 2013, millennials accounted for $1.3 trilli...",t5_2qvdk,r/PPC,115823,public,5 Core Aspects of an Effective Marketing Strat...
3938,PPCexpo-us,t2_4aa5k9kq,1577872000.0,t3_eigqx2,5,0,0,13,"In 2013, millennials accounted for $1.3 trilli...",t5_2s3d6,r/DigitalMarketing,76240,public,5 Core Aspects of an Effective Marketing Strat...
3939,Highlow9,t2_ehtjf,1577893000.0,t3_eijj0h,8,0,0,9,So I like saving money without effort but I al...,t5_2qhlc,r/privacy,1241660,public,How much data is collected by Honey?
3942,DangerDylan,t2_ovgqf,1577909000.0,t3_eimwv0,0,0,0,2,##[World News](/r/worldnews/)\n\n**Taiwan Lead...,t5_mirq9,r/DangerDylanTLDR,149,restricted,"[Wednesday, 01. January]"


### Playing with this dataframe

In [12]:
# pull the selftext column of the time-filtered submissions into a plain Python
# list and preview its first three entries
target_df_selftexts = target_df['selftext'].to_list()
target_df_selftexts[0:3]

['What is happening here? I want to read it but the link is dead,',
 'In 2013, millennials accounted for $1.3 trillion in consumer spending. Six years on, this bracket of 18-34 year-olds are the largest demographic that marketers must consider. Therefore, if you don’t have an effective marketing strategy, your business could miss out on a lot of potential customers.\n\nAside from their numbers, Millennials have a very different mindset to previous generations, and so their choices in regards to shopping and spending aren’t as predictable. What worked for Baby Boomers and Gen X consumers doesn’t usually fare so well with Millennials.\n\nWith that in mind, modern marketers must devise an effective marketing strategy that can strike a chord with this tech-savvy demographic. By realizing how they embrace social media, you can leverage tap into the potential this huge audience offers.\n\nIn this article, we’ll show you the key areas you should focus on in the modern age.\n\n## 1. Think Diff

## Topic Modeling with Gensim

Selftexts often contain markdown elements and other non-word content (for example, emojis and URL links). These are generally not meaningful unless we are using a contextualized word embedding. We can add a preprocessing pipeline in gensim to filter them out. (TODO)

### Processing all selftext together
We can also process the selftexts in rolling windows (by month or by week).

In [13]:
# load all selftext into gensim
# This is the same 'data/all_selftexts' directory that the selftext dump cell writes
# its .txt files to, so that cell must have been run (with dump_to_file enabled) first.
# NOTE(review): presumably every file found under this directory becomes one document,
# including files left over from earlier runs — confirm against the gensim docs.
all_selftext_dir = Path.cwd() / 'data/all_selftexts'
corpus = gensim.corpora.TextDirectoryCorpus(str(all_selftext_dir))