# Classification of Particle Physics and Astrophysics Subreddits

## I. Data Acquisition from Subreddits

In [2]:
import pandas as pd
import numpy as np 
import nltk as nt
from sklearn.pipeline import make_pipeline



# API
import requests

# Automating
import time
import datetime
import warnings
import sys

In [13]:

def get_posts(subreddit, n_iter, epoch_right_now, size=100, wait_seconds=20, session=None):
    """Page backwards through a subreddit's submissions via the Pushshift search endpoint.

    Each iteration requests up to ``size`` submissions created before the
    current cursor, then moves the cursor to the smallest ``created_utc`` of
    that page so the next request continues further back in time.

    Parameters
    ----------
    subreddit : str
        Subreddit name, sent as the ``subreddit`` query parameter.
    n_iter : int
        Maximum number of requests (pages) to make.
    epoch_right_now : int
        Epoch timestamp used as the initial ``before`` cursor.
    size : int, default 100
        Number of posts requested per page.
    wait_seconds : float, default 20
        Pause inserted between consecutive requests.
    session : object, optional
        Anything exposing ``get(url, params=...)`` (for example a
        ``requests.Session``). Defaults to the ``requests`` module.

    Returns
    -------
    pandas.DataFrame
        The retrieved pages concatenated row-wise, restricted to the columns
        title, created_utc, selftext, subreddit, author, media_only and
        permalink. Each page keeps its own 0-based row labels, so labels
        repeat across pages. An empty frame with those columns is returned
        when nothing was retrieved.

    Raises
    ------
    Whatever ``raise_for_status()`` raises on the response object
    (``requests.HTTPError`` for ``requests`` responses).
    """
    # The subreddit is supplied only through `params`; the base URL carries no
    # query string, so the caller's `subreddit` argument is the single source of truth.
    base_url = 'https://api.pushshift.io/reddit/search/submission/'

    # columns kept for analysis
    columns = ['title',
               'created_utc',
               'selftext',
               'subreddit',
               'author',
               'media_only',
               'permalink']

    http = requests if session is None else session

    # one dataframe per page
    df_list = []

    # cursor used to iterate in reverse through time
    current_time = epoch_right_now

    for page in range(n_iter):

        # wait between requests only (no pointless pause after the last one)
        if page > 0:
            time.sleep(wait_seconds)

        res = http.get(
            base_url,
            params={
                # specify subreddit
                'subreddit': subreddit,

                # number of posts to pull per request
                'size': size,

                # NOTE(review): purpose of 'lang' is unclear; kept unchanged so the
                # request stays the same -- TODO confirm the API honours it or drop it
                'lang': True,

                # pull everything from the cursor backward
                'before': current_time}
        )

        # fail loudly on an HTTP error instead of on a confusing JSON/KeyError later
        res.raise_for_status()

        posts = res.json().get('data') or []

        # no more posts before the cursor: stop instead of requesting again
        if not posts:
            break

        # reindex tolerates a page in which one of the columns is absent (filled with NaN)
        df = pd.DataFrame(posts).reindex(columns=columns)
        df_list.append(df)

        # move the cursor back to the oldest post of this page
        current_time = int(df['created_utc'].min())

    if not df_list:
        return pd.DataFrame(columns=columns)

    # return one dataframe for all requests
    return pd.concat(df_list, axis=0)
 

In [14]:
# Scrape r/astrophysics: 5 backward requests starting from epoch 1602017032 (2020-10-06 UTC)
astrophysics = get_posts(subreddit='astrophysics', n_iter=5, epoch_right_now=1602017032)

In [7]:
# Show the frame returned by get_posts for r/astrophysics (bare last expression -> rich display)
astrophysics

Unnamed: 0,title,created_utc,selftext,subreddit,author,media_only,permalink
0,Gorgeous NASA X-ray images of universe look li...,1602008143,,astrophysics,Sorin61,False,/r/astrophysics/comments/j6aaro/gorgeous_nasa_...
1,Can someone please explain how to calculate th...,1602006466,[removed],astrophysics,astrojosue,False,/r/astrophysics/comments/j69qx5/can_someone_pl...
2,Can someone explain formula to calculate the m...,1602000784,,astrophysics,astrojosue,False,/r/astrophysics/comments/j67wz2/can_someone_ex...
3,Can someone elaborate?,1601999794,,astrophysics,astrojosue,False,/r/astrophysics/comments/j67lw5/can_someone_el...
4,Maybe astrophysics is not for me! Just watched...,1601987660,,astrophysics,Yugitonii,False,/r/astrophysics/comments/j6485v/maybe_astrophy...
...,...,...,...,...,...,...,...
95,4th dimension world within worlds.,1588447182,,astrophysics,brutus1964,False,/r/astrophysics/comments/gcc508/4th_dimension_...
96,Becoming an Astrophysics,1588443329,Hello! I have seen some other posts in this su...,astrophysics,StupendousTurtle,False,/r/astrophysics/comments/gcb2d3/becoming_an_as...
97,"How often do we see supernovas, whether in thi...",1588409684,[removed],astrophysics,RealJackKevorkian,False,/r/astrophysics/comments/gc3bip/how_often_do_w...
98,"Astronomers may have caught an incredible, onc...",1588404247,,astrophysics,PopescuG,False,/r/astrophysics/comments/gc2dvd/astronomers_ma...


In [10]:
def get_posts(subreddit, n_iter, epoch_right_now, size=100, wait_seconds=20, session=None):
    """Page backwards through a subreddit's submissions via the Pushshift search endpoint.

    Each iteration requests up to ``size`` submissions created before the
    current cursor, then moves the cursor to the smallest ``created_utc`` of
    that page so the next request continues further back in time.

    Parameters
    ----------
    subreddit : str
        Subreddit name, sent as the ``subreddit`` query parameter.
    n_iter : int
        Maximum number of requests (pages) to make.
    epoch_right_now : int
        Epoch timestamp used as the initial ``before`` cursor.
    size : int, default 100
        Number of posts requested per page.
    wait_seconds : float, default 20
        Pause inserted between consecutive requests.
    session : object, optional
        Anything exposing ``get(url, params=...)`` (for example a
        ``requests.Session``). Defaults to the ``requests`` module.

    Returns
    -------
    pandas.DataFrame
        The retrieved pages concatenated row-wise, restricted to the columns
        title, created_utc, selftext, subreddit, author, media_only and
        permalink. Each page keeps its own 0-based row labels, so labels
        repeat across pages. An empty frame with those columns is returned
        when nothing was retrieved.

    Raises
    ------
    Whatever ``raise_for_status()`` raises on the response object
    (``requests.HTTPError`` for ``requests`` responses).
    """
    # The subreddit is supplied only through `params`; the base URL carries no
    # query string, so the caller's `subreddit` argument is the single source of truth.
    base_url = 'https://api.pushshift.io/reddit/search/submission/'

    # columns kept for analysis
    columns = ['title',
               'created_utc',
               'selftext',
               'subreddit',
               'author',
               'media_only',
               'permalink']

    http = requests if session is None else session

    # one dataframe per page
    df_list = []

    # cursor used to iterate in reverse through time
    current_time = epoch_right_now

    for page in range(n_iter):

        # wait between requests only (no pointless pause after the last one)
        if page > 0:
            time.sleep(wait_seconds)

        res = http.get(
            base_url,
            params={
                # specify subreddit
                'subreddit': subreddit,

                # number of posts to pull per request
                'size': size,

                # NOTE(review): purpose of 'lang' is unclear; kept unchanged so the
                # request stays the same -- TODO confirm the API honours it or drop it
                'lang': True,

                # pull everything from the cursor backward
                'before': current_time}
        )

        # fail loudly on an HTTP error instead of on a confusing JSON/KeyError later
        res.raise_for_status()

        posts = res.json().get('data') or []

        # no more posts before the cursor: stop instead of requesting again
        if not posts:
            break

        # reindex tolerates a page in which one of the columns is absent (filled with NaN)
        df = pd.DataFrame(posts).reindex(columns=columns)
        df_list.append(df)

        # move the cursor back to the oldest post of this page
        current_time = int(df['created_utc'].min())

    if not df_list:
        return pd.DataFrame(columns=columns)

    # return one dataframe for all requests
    return pd.concat(df_list, axis=0)

In [11]:
# Scrape r/ParticlePhysics: 5 backward requests starting from epoch 1602017032 (2020-10-06 UTC)
particlephysics = get_posts(subreddit='ParticlePhysics', n_iter=5, epoch_right_now=1602017032)

In [12]:
# Show the frame returned by get_posts for r/ParticlePhysics (bare last expression -> rich display)
particlephysics

Unnamed: 0,title,created_utc,selftext,subreddit,author,media_only,permalink
0,XOOPIC,1601998656,I’m considering doing some particle in cell si...,ParticlePhysics,7434328982,False,/r/ParticlePhysics/comments/j6793l/xoopic/
1,First observation of time-dependent CP violati...,1601994535,,ParticlePhysics,dukwon,False,/r/ParticlePhysics/comments/j660u6/first_obser...
2,A Synthetic Macroscopic Magnetic Unipole,1601806636,,ParticlePhysics,markoul,False,/r/ParticlePhysics/comments/j4wpup/a_synthetic...
3,Niels Bohr’s representation of a Xenon atom (1...,1601698432,,ParticlePhysics,happypuppy100,False,/r/ParticlePhysics/comments/j49dc0/niels_bohrs...
4,All Things EFT,1601185506,A new online seminar initiative on Effective F...,ParticlePhysics,allthingseft,False,/r/ParticlePhysics/comments/j0lrc1/all_things_...
...,...,...,...,...,...,...,...
95,Copper atoms,1535529373,,ParticlePhysics,waitfof,False,/r/ParticlePhysics/comments/9b79pk/copper_atoms/
96,ATLAS &amp; CMS H→bb̅ observation papers avail...,1535437840,"ATLAS: ""Observation of H→bb̅ decays and VH pro...",ParticlePhysics,dukwon,False,/r/ParticlePhysics/comments/9awkpl/atlas_cms_h...
97,As part of the Perimeter Institute’s ‘My Favou...,1535220159,,ParticlePhysics,IPPP_Durham,False,/r/ParticlePhysics/comments/9a8rex/as_part_of_...
98,Dark Matter Detection with SuperCDMS,1535112573,,ParticlePhysics,IPPP_Durham,False,/r/ParticlePhysics/comments/99wuwf/dark_matter...


In [51]:
# Notebook-wide pandas display settings: no column/row truncation, cell text cut at 100 chars
display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [61]:
# display.max_rows is None at this point, so a bare `particlephysics` would render every
# scraped row; preview only the first 10 to keep the notebook output readable.
particlephysics.head(10)

Unnamed: 0,title,created_utc,selftext,subreddit,author,media_only,permalink
0,XOOPIC,1601998656,I’m considering doing some particle in cell simulation using XOOPIC. Anyone has a guide book on ...,ParticlePhysics,7434328982,False,/r/ParticlePhysics/comments/j6793l/xoopic/
1,First observation of time-dependent CP violation in B_s decays,1601994535,,ParticlePhysics,dukwon,False,/r/ParticlePhysics/comments/j660u6/first_observation_of_timedependent_cp_violation/
2,A Synthetic Macroscopic Magnetic Unipole,1601806636,,ParticlePhysics,markoul,False,/r/ParticlePhysics/comments/j4wpup/a_synthetic_macroscopic_magnetic_unipole/
3,Niels Bohr’s representation of a Xenon atom (1923),1601698432,,ParticlePhysics,happypuppy100,False,/r/ParticlePhysics/comments/j49dc0/niels_bohrs_representation_of_a_xenon_atom_1923/
4,All Things EFT,1601185506,A new online seminar initiative on Effective Field Theory in particle physics and beyond has sta...,ParticlePhysics,allthingseft,False,/r/ParticlePhysics/comments/j0lrc1/all_things_eft/
5,【에이스】1등 바카라사이트 | 온라인 카지노 사이트 공식인증업체,1600915577,,ParticlePhysics,odellzssantoyaz,False,/r/ParticlePhysics/comments/iyoske/에이스1등_바카라사이트_온라인_카지노_사이트_공식인증업체/
6,"When talking about the radius of a nucleus, are we talking exclusively about the electric charge...",1600270717,As i understand the radius is defined as the approximate 'edge' of the Saxon-Wood-esque electric...,ParticlePhysics,ChaoticSalvation,False,/r/ParticlePhysics/comments/itxyhq/when_talking_about_the_radius_of_a_nucleus_are_we/
7,How do you start in particle physics,1600138736,"Im a 15 year old that’s interested in particle physics, I dont know alot of specifics on the sub...",ParticlePhysics,Polishrevolution32,False,/r/ParticlePhysics/comments/it0bns/how_do_you_start_in_particle_physics/
8,When will LHC test for extra dimensions by creating mini/micro black holes?,1600116722,Will lhc do such an experiment? And as a side question how are mini artificial black holes forme...,ParticlePhysics,iwannahitthelotto,False,/r/ParticlePhysics/comments/istudb/when_will_lhc_test_for_extra_dimensions_by/
9,Is there any significance to the difference between the charge of quarks (+2/3 and -1/3) being 1...,1599963038,The title might make it seem like I know what I'm talking about. I really don't but I try to lea...,ParticlePhysics,MaesterRigney,False,/r/ParticlePhysics/comments/irprot/is_there_any_significance_to_the_difference/


In [60]:
# Replace the row labels with a clean 0..n-1 index. reset_index(drop=True) adapts to
# however many rows were actually scraped, instead of assuming exactly 500 (which
# raises a length-mismatch error for any other row count), and is safe to re-run.
particlephysics = particlephysics.reset_index(drop=True)

In [62]:
# Replace the row labels with a clean 0..n-1 index. reset_index(drop=True) adapts to
# however many rows were actually scraped, instead of assuming exactly 500 (which
# raises a length-mismatch error for any other row count), and is safe to re-run.
astrophysics = astrophysics.reset_index(drop=True)

In [63]:
# Persist the astrophysics posts as CSV text in a file named 'astro' (row index included)
astrophysics.to_csv(path_or_buf='astro')

In [64]:
# Persist the particle physics posts as CSV text in a file named 'particle' (row index included)
particlephysics.to_csv(path_or_buf='particle')