In [33]:
from pyarrow import feather
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer, FreqDist
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer

import pyLDAvis.sklearn

from sklearn.decomposition import NMF 
from sklearn.manifold import TSNE

In [121]:
jobs_df_1 = feather.read_feather('results.feather')
jobs_df_2 = feather.read_feather('results2.feather')
jobs_df_3 = feather.read_feather('results3.feather')
jobs_ny_df = feather.read_feather('results_ny.feather')

jobs_df = pd.concat([jobs_df_1, jobs_df_2, jobs_df_3, jobs_ny_df]).drop_duplicates().reset_index()

In [122]:
jobs_df.head()

Unnamed: 0,index,title,company,salary_and_type,desc
0,0,Associate Data Scientist - job post,"T. Rowe Price597 reviewsNew York, NY",Full-time,There is a place for you at T. Rowe Price to...
1,1,Junior Data Scientist - job post,Talentheed Inc10 reviewsRemote•Remote,"$80,000 - $120,000 a year - Full-time",Responsibilities: Work with stakeholders to de...
2,2,Data Scientist (REMOTE) - job post,"Foot Locker7,897 reviewsNew York, NY 10120•Remote",Full-time,Data Scientist (REMOTE) Flexible Store...
3,3,Data Scientist - (Remote) - job post,"Genentech996 reviewsIndianapolis, IN•Remote","$90,000 - $135,000 a year - Full-time",THE POSITION This is a remote opportunity...
4,4,Jr. Data Scientist - job post,Net2AspireRemote•Remote,"$65,000 - $80,000 a year - Full-time, Part-ti...", Apply Statistical and Machine Learning metho...


In [123]:
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2992 entries, 0 to 2991
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            2992 non-null   int64 
 1   title            2992 non-null   object
 2   company          2992 non-null   object
 3   salary_and_type  2992 non-null   object
 4   desc             2992 non-null   object
dtypes: int64(1), object(4)
memory usage: 117.0+ KB


In [133]:
jobs_df['desc'][6]

"Paramount Global (NASDAQ: PARA, PARAA) is a leading global media and entertainment company that creates premium content and experiences for audiences worldwide. Driven by iconic studios, networks and streaming services, Paramount's portfolio of consumer brands includes CBS, Showtime Networks, Paramount Pictures, Nickelodeon, MTV, Comedy Central, BET, Paramount+, Pluto TV and Simon & Schuster, among others. Paramount delivers the largest share of the U.S. television audience and boasts one of the industry's most important and extensive libraries of TV and film titles. In addition to offering innovative streaming services and digital video products, the company provides powerful capabilities in production, distribution and advertising solutions. About Us  Paramount’s Advanced Advertising group is at the forefront of this reinvention of advertising through new business models, predictive analytics, optimization, and other modern data science and analysis techniques. What we're looking fo

In [136]:
ds_jobs_df = jobs_df[jobs_df['title'].str.contains('Data Scientist')]

In [139]:
ds_jobs_df = ds_jobs_df[['desc']]

In [140]:
ds_jobs_df

Unnamed: 0,desc
0,There is a place for you at T. Rowe Price to...
1,Responsibilities: Work with stakeholders to de...
2,Data Scientist (REMOTE) Flexible Store...
3,THE POSITION This is a remote opportunity...
4, Apply Statistical and Machine Learning metho...
...,...
2975,"Akur8 is a young, dynamic, fast growing insu..."
2977,"In today's world, customers expect companies..."
2980,Your Job Delivery Insights provides dev...
2983,About LegalZoom We're here to make legal hel...


In [115]:
from nltk.tokenize import RegexpTokenizer



In [158]:
ds_jobs_df = ds_jobs_df['desc']

In [166]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class TextPreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        
        #define attributes to store if text preprocessing requires fitting from data
        pass
    
    def fit(self, data, y = 0):
        # this is where you would fit things like corpus specific stopwords
        # fit probable bigrams with bigram model in here
        
        # save as parameters of Text preprocessor
        
        basic_token_pattern = r"(?u)\b\w\w+\b"

        self.tokenizer = RegexpTokenizer(basic_token_pattern)
        
        return self
    
    def transform(self, data, y = 0):
        fully_normalized_corpus = data.apply(self.process_doc)
        
        return fully_normalized_corpus
        
    
    def process_doc(self, doc):

        #initialize lemmatizer
        wnl = WordNetLemmatizer()
        stop_words = stopwords.words('english')
        
        # helper function to change nltk's part of speech tagging to a wordnet format.
        def pos_tagger(nltk_tag):
            if nltk_tag.startswith('J'):
                return wordnet.ADJ
            elif nltk_tag.startswith('V'):
                return wordnet.VERB
            elif nltk_tag.startswith('N'):
                return wordnet.NOUN
            elif nltk_tag.startswith('R'):
                return wordnet.ADV
            else:         
                return None


        # remove stop words and punctuations, then lower case
        doc_norm = [tok.lower() for tok in self.tokenizer.tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words)) ]

        #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize

        # creates list of tuples with tokens and POS tags in wordnet format
        wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
        doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]

        return doc_norm

In [167]:
proc = TextPreprocessor()
raw_tokens = proc.fit_transform(ds_jobs_df)

In [179]:
raw_tokens

0       [there, place, rowe, price, grow, contribute, ...
1       [responsibility, work, stakeholder, determine,...
2       [data, scientist, remote, flexible, store, loc...
3       [position, remote, opportunity, opportunity, e...
4       [apply, statistical, machine, learn, method, s...
                              ...                        
2975    [young, dynamic, fast, grow, insurtech, startu...
2977    [today, world, customer, expect, company, know...
2980    [job, delivery, insight, provide, development,...
2983    [legalzoom, make, legal, help, accessible, leg...
2987    [today, world, customer, expect, company, know...
Name: desc, Length: 1588, dtype: object

In [177]:
freq = FreqDist(raw_tokens.explode())

In [178]:
freq.most_common(100)

[('data', 18965),
 ('experience', 8897),
 ('work', 8281),
 ('team', 5466),
 ('business', 5422),
 ('science', 5083),
 ('model', 5007),
 ('learn', 4225),
 ('use', 3595),
 ('machine', 3383),
 ('analysis', 3289),
 ('include', 3264),
 ('year', 3211),
 ('scientist', 3081),
 ('product', 3075),
 ('opportunity', 2884),
 ('skill', 2839),
 ('analytics', 2796),
 ('develop', 2691),
 ('require', 2679),
 ('solution', 2627),
 ('provide', 2562),
 ('job', 2505),
 ('time', 2445),
 ('new', 2399),
 ('support', 2326),
 ('ability', 2293),
 ('information', 2274),
 ('company', 2262),
 ('statistical', 2259),
 ('build', 2247),
 ('project', 2243),
 ('position', 2232),
 ('apply', 2218),
 ('customer', 2214),
 ('problem', 2198),
 ('development', 2197),
 ('make', 2174),
 ('employee', 2157),
 ('process', 2120),
 ('research', 2097),
 ('technology', 2040),
 ('knowledge', 1963),
 ('design', 1939),
 ('status', 1916),
 ('service', 1899),
 ('technical', 1886),
 ('engineering', 1850),
 ('role', 1844),
 ('application', 1813),

In [186]:
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()

word_finder = nltk.collocations.BigramCollocationFinder.from_words(raw_tokens.explode())
words_scored = word_finder.score_ngrams(bigram_measures.raw_freq)

In [187]:
words_scored

[(('data', 'science'), 0.004827647843968078),
 (('data', 'scientist'), 0.004156612119374032),
 (('machine', 'learn'), 0.00401888863004687),
 (('computer', 'science'), 0.0014988418040604986),
 (('year', 'experience'), 0.0014666086469839288),
 (('equal', 'opportunity'), 0.0012424416909514202),
 (('data', 'analysis'), 0.0012014176728539676),
 (('sexual', 'orientation'), 0.001134021071693867),
 (('national', 'origin'), 0.0010607638965198446),
 (('full', 'time'), 0.0009948324388632244),
 (('gender', 'identity'), 0.0009757855733179787),
 (('opportunity', 'employer'), 0.0009259706941996434),
 (('veteran', 'status'), 0.0008981329676335148),
 (('experience', 'data'), 0.0008820163890952299),
 (('data', 'set'), 0.0008761558150813081),
 (('data', 'analytics'), 0.0007999683529003248),
 (('experience', 'work'), 0.0007970380658933639),
 (('machine', 'learning'), 0.0007662700523202745),
 (('team', 'member'), 0.0007574791912993918),
 (('race', 'color'), 0.0007560140477959114),
 (('communication', 'skil

In [188]:
words_pmi_scored = word_finder.score_ngrams(bigram_measures.pmi)

In [189]:
words_pmi_scored

[(('absinthe', 'anise'), 19.38052659299037),
 (('acquisitiont', 'dnb'), 19.38052659299037),
 (('addressbiogeographic', 'sunrise'), 19.38052659299037),
 (('adptvs', 'localads'), 19.38052659299037),
 (('adptvssci', 'verticalads'), 19.38052659299037),
 (('aep', 'nisource'), 19.38052659299037),
 (('alexander', 'bell'), 19.38052659299037),
 (('algorithmsstatistical', 'packagesdemonstrated'), 19.38052659299037),
 (('alphabeta', 'finsbury'), 19.38052659299037),
 (('analysisfeature', 'engineeringtime'), 19.38052659299037),
 (('analysissqlpython', 'ra'), 19.38052659299037),
 (('analytica', 'votervoice'), 19.38052659299037),
 (('analyticsbatch', 'analyticsbenefitserias'), 19.38052659299037),
 (('anylogistix', 'jda'), 19.38052659299037),
 (('apnea', 'congenital'), 19.38052659299037),
 (('applicationlist', 'referencesoptional'), 19.38052659299037),
 (('applicationscientific', 'knowledgeinterpersonal'), 19.38052659299037),
 (('applyrequired', 'documentsresumecover'), 19.38052659299037),
 (('arcade'

In [204]:
jobs_df[jobs_df['desc'].str.contains('ketamine')]

Unnamed: 0,index,title,company,salary_and_type,desc
258,274,Junior Analyst / Data Scientist - job post,"Heading HealthAustin, TX 78746•Remote","$50,000 - $65,000 a year - Full-time",About Heading Health Heading is on a mission t...


In [203]:
print(jobs_df.iloc[258]['desc'])

About Heading Health Heading is on a mission to accelerate mental wellbeing through connection based care. Our co-founder, Steve Levine MD, has designed an outcomes led approach pairing clinician know-how with a new class of psychotherapy assisted treatments which offer 5-10X better outcomes than second line SSRI's. These already FDA-approved treatments are Spravato, Transcranial Magnetic Stimulation (TMS) and ketamine, with psychedelic assisted therapies (MDMA, psilocybin) likely to be approved over the coming years. Reimagining mental health: mental health care is often inaccessible, unaffordable, and is so often ineffective. At Heading Health, we utilize developments in clinical research, insurance coverage and technologies to deliver the best outcomes for our patients. Our core values are patient outcomes, agency and being different. A mission-driven company: the secret ingredient that makes things move is how we come together. It begins and ends with compassion: compassion for our