In [139]:
import string

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
from nltk.corpus import stopwords

import analysis as an

#### Write corpus directory from Feather file 

In [65]:
# Path to the Feather file with the vacancy descriptions. It is relative, so
# this cell depends on the notebook's working directory.
feather_filepath = '../scraper/data/uk_example/vacancy_descriptions.feather'
# Write the corpus under the id 'uk' (the on-disk layout is decided by
# analysis.write_corpus, which is not shown here).
an.write_corpus(feather_filepath, corpus_id='uk')

#### Read the corpus as a Reader object

In [67]:
# Load the corpus stored under the id 'uk'. The returned reader is queried
# below through fileids(), raw() and words().
reader = an.read_corpus('uk')

#### Use the Reader

In [71]:
# Number of file ids in the corpus, i.e. the number of vacancy descriptions.
len(reader.fileids())

14842

In [78]:
# Length of the raw corpus text, in millions of characters.
len(reader.raw())/1e6

35.827121

In [80]:
# Number of word tokens in the corpus, in millions.
len(reader.words())/1e6

6.279366

In [83]:
# Conditional frequency distribution: the condition is the file id and the
# samples are the word tokens of that file.
cfd = nltk.ConditionalFreqDist(
    (doc_id, token)
    for doc_id in reader.fileids()
    for token in reader.words(fileids=[doc_id])
)

In [125]:
# Tokens to exclude from the corpus.
# - STOPWORDS_SET: NLTK's English stopword list.
# - PUNCTUATION_SET: the individual characters of string.punctuation. Only
#   single-character tokens match, so multi-character tokens such as "..." or
#   "--" are NOT covered.
# - STOPWORDS_PUNCTUATION_SET: the union of the two.
STOPWORDS_SET = set(stopwords.words('english'))
PUNCTUATION_SET = set(string.punctuation)
STOPWORDS_PUNCTUATION_SET = PUNCTUATION_SET | STOPWORDS_SET

def remove_stopwords_and_punctuation(tokens: list, exclude=None):
    """Return `tokens` with stopword and punctuation tokens removed.

    Args:
        tokens: iterable of string tokens (e.g. the words of one document).
        exclude: optional collection of lower-case tokens to drop. When
            omitted, the notebook-level STOPWORDS_PUNCTUATION_SET is used.

    Returns:
        A new list holding the surviving tokens in their original order and
        with their original casing.
    """
    if exclude is None:
        # Use the combined set: filtering on PUNCTUATION_SET alone would
        # leave every stopword in place.
        exclude = STOPWORDS_PUNCTUATION_SET
    # Compare on the lower-cased token so capitalised stopwords ("The", "And")
    # are removed too; the exclusion entries are expected to be lower-case.
    return [t for t in tokens if t.lower() not in exclude]

# Filtered token list for every document in the corpus, keyed by file id.
corp_nostop = dict(
    (doc_id, remove_stopwords_and_punctuation(reader.words(fileids=[doc_id])))
    for doc_id in reader.fileids()
)

In [173]:
import numpy as np
import plotly.express as px

def frac_of_texts_containing_word(word: str, fds):
    """Return the fraction of texts in which `word` occurs at least once.

    Args:
        word: token to look for; matched exactly (case-sensitive).
        fds: mapping of file id -> frequency distribution. Each value must
            support `fd[word]` and yield 0 for absent words, as
            nltk.FreqDist and collections.Counter do.

    Returns:
        The fraction as a float in [0, 1], or NaN when `fds` is empty.
    """
    if not fds:
        # No texts at all: the fraction is undefined. Return NaN explicitly
        # instead of relying on np.mean([]) and its RuntimeWarning.
        return float('nan')
    # Iterate over the distributions that were passed in, not a global reader,
    # so the result depends only on the arguments.
    return np.mean([fd[word] > 0 for fd in fds.values()])
    
def plot_frac_of_texts_with_kws(keywords: list, fds):
    """Plot and return the share of texts that contain each keyword.

    Args:
        keywords: the words to look up.
        fds: per-text frequency distributions, passed through unchanged to
            frac_of_texts_containing_word.

    Returns:
        A DataFrame with columns 'keyword' and 'incidence', sorted by
        ascending incidence. The same data is shown as a bar chart.
    """
    rows = []
    for keyword in keywords:
        rows.append((keyword, frac_of_texts_containing_word(keyword, fds)))

    incidence_table = pd.DataFrame(rows, columns=['keyword', 'incidence'])
    incidence_table = incidence_table.sort_values(by='incidence', ascending=True)

    # Bar chart with incidence on the x axis and one bar per keyword.
    px.bar(incidence_table, x='incidence', y='keyword').show()

    return incidence_table

In [174]:
# One word-frequency distribution per document, keyed by file id.
fds = dict(
    (doc_id, nltk.FreqDist(reader.words(fileids=[doc_id])))
    for doc_id in reader.fileids()
)

In [None]:
# Keywords to look up. The call below draws a bar chart of the fraction of
# descriptions containing each keyword and returns the same figures as a
# DataFrame (columns 'keyword' and 'incidence').
keywords = ['analyst', 'data', 'maths']
df_tmp = plot_frac_of_texts_with_kws(keywords, fds=fds)