# Preprocessing

Imports

In [1]:
import requests
import pandas as pd
import numpy as np
import time
import nltk
import re
import codecs
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

Pushshift Params

In [2]:
def get_subreddit_data(subreddit,epoch_time):
    url =f'https://api.pushshift.io/reddit/search/submission?subreddit={subreddit}&author!=[deleted]&size=500&is_self=true&before={epoch_time}'
    res = requests.get(url)
    data = res.json()
    return data['data']

In [3]:
def exist_keys(post_to_check):
    if ("author" in post_to_check and "selftext" in post_to_check and "is_self" in post_to_check):
        return True
    else:
        return False

In [4]:
def check_post(post_to_check):
    if exist_keys(post_to_check):
        author = post_to_check['author']
        selftext = post_to_check['selftext']
        is_self = post_to_check['is_self']
        if (author != '[deleted]' and author != 'deleted' and author != 'removed' 
                and selftext != 'removed' and selftext != ""
                and selftext != 'deleted' and 50 < len(selftext) < 50000
                and "http://" not in selftext and "https://" not in selftext
                and is_self) :
            return True
        else:
            return False
    else:
        return False

In [5]:
def get_filtered_posts(subreddit, post_count):
    result = []
    epoch_time = int(time.time())
    is_end_of_topic = False
    while len(result) <= post_count and not is_end_of_topic:
        post_list = get_subreddit_data(subreddit, epoch_time)
        temp_result = [post for post in post_list if check_post(post)]
        result.extend(temp_result)
        if epoch_time != int(result[-1]['created_utc']):
            epoch_time = int(result[-1]['created_utc'])
        else:
            is_end_of_topic = True
    return result

In [6]:
biology_posts = get_filtered_posts("biology", 3000)
print('We have',len(biology_posts), 'titles in the data')

We have 3025 titles in the data


In [7]:
physics_posts = get_filtered_posts("physics", 3000)
print('We have',len(physics_posts), 'titles in the data')

We have 3038 titles in the data


In [8]:
bio_df = pd.DataFrame(biology_posts)
phys_df = pd.DataFrame(physics_posts)

In [9]:
bio_df = bio_df[['title', 'selftext', 'score', 'created_utc']]

In [10]:
phys_df = phys_df[['title', 'selftext', 'score', 'created_utc']]

In [11]:
def standardize_text(df, text_field):
    df['title'] = df['title'].str.replace(r"http\S+", "")
    df['title'] = df['title'].str.replace(r"http", "")
    df['title'] = df['title'].str.replace(r"@\S+", "")
    df['title'] = df['title'].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df['title'] = df['title'].str.replace(r"@", "at")
    df['title'] = df['title'].str.lower()
    return df

bio_df = standardize_text(bio_df, 'title')
phys_df = standardize_text(phys_df, 'title')

  df['title'] = df['title'].str.replace(r"http\S+", "")
  df['title'] = df['title'].str.replace(r"@\S+", "")
  df['title'] = df['title'].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")


In [12]:
bio_df = standardize_text(bio_df, 'title')
bio_df.head()

  df['title'] = df['title'].str.replace(r"http\S+", "")
  df['title'] = df['title'].str.replace(r"@\S+", "")
  df['title'] = df['title'].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")


Unnamed: 0,title,selftext,score,created_utc
0,swine flu,Younger people were more at risk of becoming i...,1,1610049677
1,sapolsky how good is he?,So I just discovered this series of Stanford l...,0,1610047606
2,cheap enzyme suppliers?,"I had an idea for a project, but the idea is c...",1,1610041771
3,kinda urgent,I’ve had an unfortunate few months and I have ...,1,1610041415
4,how long does it take to get thyroid levels ba...,So I have had iron deficiency and got 500mg IV...,1,1610033759


In [13]:
phys_df = standardize_text(phys_df, 'title')
phys_df.head()

  df['title'] = df['title'].str.replace(r"http\S+", "")
  df['title'] = df['title'].str.replace(r"@\S+", "")
  df['title'] = df['title'].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")


Unnamed: 0,title,selftext,score,created_utc
0,please help with this problem,A 2.20 kg red ball moving to the right at 19.1...,1,1610045825
1,buffer stop problem,I was challenged by My RS Teacher to find out ...,1,1610043853
2,why does voltage increase cause a current decr...,"Now, I understand that having less secondary t...",1,1610043191
3,how should a computer scientist learn physics?,"Hi guys, I am studying Computer Science pursui...",1,1610042001
4,can someone help me solve this question plz,A parallel plate capacitor with circular pla...,1,1610039447


In [29]:
# Major credit to 
# https://towardsdatascience.com/topic-modeling-quora-questions-with-lda-nmf-aff8dce5e1dd
    
import spacy
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, 
    remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[^\w\s]','', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

bio_clean = pd.DataFrame(bio_df.title.apply(lambda x: clean_text(x)))
phys_clean = pd.DataFrame(phys_df.title.apply(lambda x: clean_text(x)))

nlp = spacy.load('en_core_web_sm')

def lemmatizer(text):        
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)
    

In [30]:
bio_clean['title'] = bio_clean.apply(lambda x: lemmatizer(x['title']), axis=1)
bio_clean['title'] = bio_clean['title'].str.replace('-PRON-', '')

phys_clean['title'] = phys_clean.apply(lambda x: lemmatizer(x['title']), axis=1)
phys_clean['title'] = phys_clean['title'].str.replace('-PRON-', '')

In [32]:
type(bio_clean)

pandas.core.frame.DataFrame

In [16]:
bio_clean.isnull().sum()

title    0
dtype: int64

In [17]:
phys_clean.head()

Unnamed: 0,title
0,please help with this problem
1,buffer stop problem
2,why do voltage increase cause a current decrea...
3,how should a computer scientist learn physics
4,can someone help solve this question plz


In [18]:
phys_clean.isnull().sum()

title    0
dtype: int64

In [33]:
bio_clean.to_csv('datasets/bio_clean.csv', index=False)
phys_clean.to_csv('datasets/phys_clean.csv', index=False)