In [73]:
import nltk
from nltk.corpus import stopwords
import analysis as an
import pandas as pd
import numpy as np
import plotly.express as px
import string
from itertools import product

#### Write corpus directory from Feather file 

In [2]:
feather_filepath = '../scraper/data/uk_example/vacancy_descriptions.feather'
an.write_corpus(feather_filepath, corpus_id='uk')

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'uk'

#### Read corpus as Reader obj

In [3]:
reader = an.read_corpus('uk')

#### Use the Reader to compute statistics for the corpus

In [4]:
len(reader.fileids()) # number of vacancy descriptions

14842

In [5]:
len(reader.raw())/1e6 # units millions of characters

35.827121

In [6]:
len(reader.words())/1e6 # units millions of words

6.279366

In [83]:
cfd = nltk.ConditionalFreqDist(
        (fileid, word) 
        for fileid in reader.fileids()
        for word in reader.words(fileids=[fileid])
)

#### Remove stopwords and punctuation

In [47]:
STOPWORDS_SET = set(stopwords.words('english'))
PUNCTUATION_SET = set(v for v in string.punctuation)
STOPWORDS_PUNCTUATION_SET = PUNCTUATION_SET.union(STOPWORDS_SET)

def remove_stopwords_and_punctuation(tokens: list):
    return([t for t in tokens if t not in STOPWORDS_PUNCTUATION_SET])

corpus_words = {
    fileid: remove_stopwords_and_punctuation(reader.words(fileids=[fileid]))
    for fileid in reader.fileids()
}

corpus_types = {
    k: set(v) for k, v in corpus_words.items()
}

In [48]:
def keyword_indicence(word: str, corpus_types: dict):
    return np.mean([word in corpus_types[fileid]
                    for fileid in corpus_types.keys()])

def keyword_incidence_df(keywords: list, corpus_types: dict):
    df_fracs = pd.DataFrame([(kw, keyword_indicence(kw, corpus_types))
                              for kw in keywords], columns=['keyword', 'incidence'])
    return df_fracs

In [49]:
fds = {
    fileid: nltk.FreqDist(reader.words(fileids=[fileid])) for fileid in reader.fileids()
}

In [50]:
keywords = ['analyst', 'data', 'maths', 'nurse', 'doctor', 'python', 'health']
df_fracs = keyword_incidence_df(keywords, corpus_types)
df_fracs.head()

Unnamed: 0,keyword,incidence
0,analyst,0.007479
1,data,0.069398
2,maths,0.012869
3,nurse,0.341935
4,doctor,0.026883


In [51]:
px.bar(data_frame=df_fracs, x='incidence', y='keyword')

In [71]:
# compute term frequency for each file
word_fds = {fileid: nltk.FreqDist(corpus_words[fileid])
             for fileid in corpus_words.keys()}

# compute number of documents
n_files = len(reader.fileids())

In [32]:
lexicon = set.union(*corpus_types.values())

In [66]:
from tqdm import tqdm

In [67]:
# compute number of files containing terms
files_per_word = {}
for word in tqdm(lexicon):
    files_per_word[word] = sum([(word in corpus_types[fileid]) 
                                for fileid in corpus_types.keys()])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 55710/55710 [07:28<00:00, 124.35it/s]


In [None]:
word_fds[word_fds.keys()[0]]

In [77]:
def tf_idf(word_fds, files_per_word, n_files):
    # is computed for each file and term
    # want a dataframe with each row corresponding to a fileid
    # each column corresponding to a word
    df = pd.DataFrame(0., index=word_fds.keys(), columns=files_per_word.keys())
    for fileid in tqdm(word_fds.keys()):
        for word in word_fds[fileid]:
            df.loc[fileid, word] = word_fds[fileid][word]/(1. + files_per_word[word])
    df = df*n
    # speed-up idea: dodge files that don't contain word
    return(df)

In [None]:
df_tf_idf = tf_idf(word_fds=word_fds,
                   files_per_word=files_per_word,
                   n_files=n_files)

In [37]:
# nltk's TextCollection has a built-in tf/idf tool, but it doesn't solve our problem