In [1]:
import numpy as np
import pandas as pd
import re
import spacy
import time

from empath import Empath

[`re`](https://docs.python.org/3/library/re.html) is a library for using regular expressions (patterns of characters used to match more varied strings)

In [2]:
df = pd.read_json("rjobs_2020_raw.json")

In [3]:
# df.head()

In [4]:
df.shape

(70711, 91)

### Keeping posts that have selftext and meet other criteria

In [5]:
df = df[(df["locked"]==False) & (df["selftext"] != "[removed]") & (df["selftext"] != "[deleted]")
       & (~df["selftext"].isnull()) & (df["is_self"]==True) & (df["is_video"]==False) & (df["pinned"]==False)
       & (df["stickied"]==False)]

In [6]:
df.shape

(49872, 91)

### Converting "created_utc" to the date and then creating different time variables

In [7]:
df["date"] = pd.to_datetime(df["created_utc"],unit="s") # "created_utc" is in seconds

In [8]:
df["dayofyear"] = df["date"].dt.dayofyear
df["hour"] = df["date"].dt.hour
df["dayofmonth"] = df["date"].dt.day
df["month"] = df["date"].dt.month
df["dayofweek"] = df["date"].dt.dayofweek
df["week"] = df["date"].dt.week
df["day_name"] = df["date"].dt.day_name()
df["month_name"] = df["date"].dt.month_name()

  df["week"] = df["date"].dt.week


### Randomly shuffling the data and then creating anonymized author IDs

I already anonymized these (the "author_pseudo" field) but this shows the process.

In [9]:
df = df.sample(frac=1.0) # this random "sample" is 100% of the data points, just in a different order

authors = set(df["author_pseudo"]) # set of all the unique author usernames

author_ids = np.random.randint(0,1000000, len(authors)) # random numbers between 0 and 1,000,000
author_ids = author_ids.tolist() # convert it from an array to a list
while len(set(author_ids)) < len(authors): # because the numbers are random, some may be duplicates
    author_ids = list(set(author_ids)) # cast it to set to nix duplicates, then back to list
    new = np.random.randint(0,100000) # get one new random number
    new_idx = np.random.randint(0,len(author_ids)-1) # get one random index
    author_ids.insert(new_idx, new) # insert the new random number at the random index
    
# the while loop will keep going until there are the same number of unique IDs as there are unique authors

author_id = [f"{id_:0>6}" for id_ in author_ids] # convert them to strings with leading zeros, e.g. '000001'
author_id_dict = {author:author_id for author, author_id in zip(authors, author_ids)} # dict mapping authors to IDs

df["author_id"] = df["author_pseudo"].apply(lambda x: author_id_dict[x]) # create new variable, anonymizing authors

# we drop the author column later

In [10]:
assert len(set(authors)) == len(set(author_ids))

### Creating the "text" field by merging the title and selftext

In [11]:
df["text"] = df.apply(lambda row: row["title"] + "\n " + row["selftext"], axis = 1)

### Preprocessing

In [12]:
def preprocess_post(post: str) -> str:
    """
    Tokenize, lemmatize, remove stop words, 
    remove non-alphabetic characters.
    """
    post = " ".join([word.lemma_ for word in nlp(post) if not word.is_stop])
    post = re.sub("[^a-z]", " ", post.lower())
    return re.sub("\s+", " ", post).strip()


nlp = spacy.load("en_core_web_sm", disable=["ner"])

In [13]:
example = df.sample(1)["text"].values[0]

# df.sample(1) takes 1 row from the data frame at random
# ["text"] selects the "text" field (which is the combination of the "title" and "selftext" fields created above)
# .values turns it into a vector of strings, in this case just the one
# [0] takes the first element in the vector (which is also the only element)
# now we have the string for the text instead of a dataframe or some kind of vector

print(example)

Should I quit my very first job after working their for less than a few days
 I've recently got a job at Joann's (fabric and craft store) and I absolutely hate my job, being a stocker - and hate the store. The store is ran-down and dirty, and everybody who I deal with are older lady's and  Karens. Im the youngest person (only 16) at the store and don't really in there, however Im paid more than minimum wage (Im paid $9.00 while minimum wage is $8.25) Do you think it will be worth it or would nobody want to hire me because I quit my very first job within a few days? any advice helps, thank you.


In [14]:
for word in nlp(example):
    print(f"word: {word.text} | lemma: {word.lemma_} | part of speech: {word.pos_}")

word: Should | lemma: should | part of speech: AUX
word: I | lemma: I | part of speech: PRON
word: quit | lemma: quit | part of speech: VERB
word: my | lemma: my | part of speech: PRON
word: very | lemma: very | part of speech: ADV
word: first | lemma: first | part of speech: ADJ
word: job | lemma: job | part of speech: NOUN
word: after | lemma: after | part of speech: ADP
word: working | lemma: work | part of speech: VERB
word: their | lemma: their | part of speech: PRON
word: for | lemma: for | part of speech: ADP
word: less | lemma: less | part of speech: ADJ
word: than | lemma: than | part of speech: SCONJ
word: a | lemma: a | part of speech: DET
word: few | lemma: few | part of speech: ADJ
word: days | lemma: day | part of speech: NOUN
word: 
  | lemma: 
  | part of speech: SPACE
word: I | lemma: I | part of speech: PRON
word: 've | lemma: 've | part of speech: AUX
word: recently | lemma: recently | part of speech: ADV
word: got | lemma: get | part of speech: VERB
word: a | lemma:

In [15]:
print(preprocess_post(example))

quit job work day recently get job joann fabric craft store absolutely hate job stocker hate store store run dirty everybody deal old lady karens m young person store m pay minimum wage m pay minimum wage think worth want hire quit job day advice help thank


"<tt>nlp</tt>" is a language model from [spaCy](https://spacy.io/). It does part-of-speech tagging, named entity recognition, and more. `disable=["ner"]` tells it not to perform named entity recognition. Turning things off might speed it up

The function <tt>preprocess_post</tt> is equivalent to the following:

```python
def preprocess_post(post: str) -> str:
    """
    Tokenizes and returns the lowercase lemmas of
    tokens that are not stop words, minus any 
    non-alphabetic characters
    """
    words = []
    for word in nlp(post): # each "word" in nlp(post) has been part-of-speech tagged, etc.
        if not word.is_stop: # ".is_stop" checks whether spacy has determined it's a stop word
            words.append(word.lemma_) # adding the lemma of the word, not the word itself, to the list
    post = " ".join(words) # converting the list of words to a string variable separated by spaces
    post = post.lower() # make everything lowercase
    post = re.sub("[^a-z]", " ", post) # now we replace non-alphabetic chars with spaces
    post = re.sub("\s+", " ", post) # now we replace long stretches of whitespace with a single space
    post = post.strip() # now we strip whitespace from the edges
    return post
```

In [16]:
start_time = time.time()

df["preprocessed"] = df["text"].apply(preprocess_post)

print(f"Finished preprocessing {df.shape[0]} posts in {(time.time()-start_time)/60:.1f} minutes")

Finished preprocessing 49872 posts in 12.6 minutes


(This took about twice as long in Windows, for what that's worth)

### Calculating scores for each dictionary from Empath for each post in corpus

This calls lexicon.analyze() on each preprocessed post. lexicon.analyze() returns a dictionary with lexical categories as keys and a post's score as the value for each. This creates a column (variable) for each key and populates it with each post's score.

In [17]:
start_time = time.time()

lexicon = Empath()

df[list(lexicon.cats)] = df["preprocessed"].apply(lambda x: pd.Series(lexicon.analyze(x, normalize=True)))

print(f"Analyzed all posts in {(time.time()-start_time)/60:.1f} minutes")

  df[list(lexicon.cats)] = df["preprocessed"].apply(lambda x: pd.Series(lexicon.analyze(x, normalize=True)))


Analyzed all posts in 7.1 minutes


A bit of Googling suggests .apply() is slow. Other methods also give warnings, but I'll look into alternatives.

### That's it!

Now we just get the subset of columns we want and export the whole dataframe to a JSON file.

In [23]:
df["id"] = [f"{i:0>5}" for i in range(df.shape[0])] # create an ID as a string, based on order

In [24]:
cols = ["id", "author_id", "score", "num_comments", "title", "selftext", "text", "preprocessed", "date", "dayofyear", "hour", "dayofmonth", "month", "dayofweek", "week", "day_name", "month_name"]
cols += sorted(list(lexicon.cats.keys())) # we add the categories from Empath so we keep the columns from that, too

In [25]:
df = df[cols]

In [26]:
df.head()

Unnamed: 0,id,author_id,score,num_comments,title,selftext,text,preprocessed,date,dayofyear,...,wealthy,weapon,weather,wedding,white_collar_job,work,worship,writing,youth,zest
44252,0,651809,1,3,"how am i supposed to find ""new"" ways of doing ...","so, i just graduated and I got the role as an ...","how am i supposed to find ""new"" ways of doing ...",suppose find new way thing graduate get role o...,2020-08-02 15:12:52,215,...,0.025,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.0,0.0
55101,1,939708,1,17,Everything is horrible right now,I feel horrible. I feel like I should have nev...,Everything is horrible right now\n I feel horr...,horrible right feel horrible feel like get bac...,2020-09-29 01:02:56,273,...,0.029412,0.0,0.014706,0.014706,0.073529,0.147059,0.0,0.014706,0.0,0.0
62790,2,368221,1,5,WGAT SHOULD I DO,"I work with 2 girls, that are younger than ...","WGAT SHOULD I DO\n I work with 2 girls, tha...",wgat work girl young clean roof gutter story h...,2020-11-12 20:27:03,317,...,0.030303,0.0,0.0,0.0,0.0,0.060606,0.0,0.0,0.060606,0.0
8254,3,543188,1,5,I really want this job and today they will con...,What can background checks find?\n\nRecently g...,I really want this job and today they will con...,want job today confirm question background che...,2020-02-07 15:58:03,38,...,0.0,0.017544,0.0,0.0,0.070175,0.105263,0.0,0.0,0.017544,0.0
23535,4,330821,1,1,Help big time,Okay guys just did a interview online via vide...,Help big time\n Okay guys just did a interview...,help big time okay guy interview online video ...,2020-04-25 10:37:05,116,...,0.0,0.0,0.0,0.0,0.04,0.12,0.0,0.04,0.0,0.0


In [22]:
df.to_csv("rjobs_2020_cleaned.json")