In [12]:
import pandas as pd
import unicodedata
import ftfy
import gensim
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
from gensim.corpora import Dictionary

In [3]:
df = pd.read_csv ('./data/seek_australia.csv')
df = df.sample(frac = 0.1, random_state=42)
df

Unnamed: 0,category,city,company_name,geo,job_board,job_description,job_title,job_type,post_date,salary_offered,state,url
2308,Consulting & Strategy,Brisbane,RACQ,AU,seek,Â Lead and develop team of Pricing Analysts ...,Manager Pricing,Full Time,2018-04-16T02:48:56Z,,Southern Suburbs & Logan,https://www.seek.com.au/job/35992845
22404,Trades & Services,Adelaide,Extrastaff,AU,seek,"Backed by the Australian Chamber movement,Â E...",Experienced Bakers,Casual/Vacation,2018-04-12T02:23:53Z,,,https://www.seek.com.au/job/35967419
23397,Accounting,Brisbane,Boeing Defence Australia,AU,seek,"Boeing Defence Australia Ltd, a wholly owned ...",Contracts Manager,Full Time,2018-04-12T03:45:02Z,,CBD & Inner Suburbs,https://www.seek.com.au/job/35968503
25058,Hospitality & Tourism,Gold Coast,Jora Local,AU,seek,A restaurant in Runaway Bay is seeking a casu...,Casual Cook in Runaway Bay,Casual/Vacation,2018-04-12T07:35:02Z,,,https://www.seek.com.au/job/35972652
2664,Community Services & Development,Adelaide,Advanced Personnel Management,AU,seek,About the Company: APM is an Australian owned...,Employment Consultant - Disability Employment ...,Full Time,2018-04-16T03:48:14Z,Competitive Salary + Performance Incentives,,https://www.seek.com.au/job/35993764
...,...,...,...,...,...,...,...,...,...,...,...,...
21481,Banking & Financial Services,Brisbane,Davidson Corporate,AU,seek,About the Company Our client has a track reco...,Desk Assistant,Full Time,2018-04-12T03:31:03Z,,CBD & Inner Suburbs,https://www.seek.com.au/job/35968345
26178,Information & Communication Technology,Melbourne,Finite IT Recruitment Solutions,AU,seek,Data Centre Support Engineer - 6 month contra...,Data Centre Support Engineer,Contract/Temp,2018-04-12T23:34:30Z,,CBD & Inner Suburbs,https://www.seek.com.au/job/35975087
27445,Hospitality & Tourism,"Newcastle, Maitland & Hunter",Muswellbrook Motors,AU,seek,New business freshly renovated On-Par Steak H...,Chef or 4th Year Apprentice,Full Time,2018-04-12T23:08:02Z,,,https://www.seek.com.au/job/35974716
1723,Trades & Services,Cairns & Far North,Downer EDI Limited,AU,seek,Downer's construction business has recently b...,Trades and Non Trades - Far North Queensland,Full Time,2018-04-16T01:41:53Z,,,https://www.seek.com.au/job/35991886


# Data pre-processing

In [50]:
features_to_delete = ['job_board','geo','url', 'city', 'post_date', 'salary_offered', 'state'] # get rid of unnecessary features (maybe add more columns)
df = df.drop(features_to_delete, axis=1)

KeyError: "['job_board', 'geo', 'url'] not found in axis"

Job description is necessary for all the tasks in this assignment, so it makes sense to drop rows that are missing these values

In [5]:
print(df['job_description'].isna().sum())
df.dropna(subset=['job_description'], inplace=True)

37


Tokenize each row and decode the text

In [6]:
for i in df[ "job_description"].index:
    df.loc[i, "job_description"] = gensim.utils.simple_preprocess(
        ftfy.fix_text(unicodedata.normalize("NFKD",df.loc[i,"job_description"])), deacc=True)

### Word2vec

Train a word2vec model on the tokenized data

In [None]:
from time import time
from gensim.models import Word2Vec

cores = 16
model = Word2Vec(min_count=1,
                     window=2,
                     vector_size=100,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

t = time()

model.build_vocab(df['job_description'], progress_per=10)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Define a function that can create an interactive plot with tsne

In [30]:
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, LabelSet

def interactive_tsne(text_labels, tsne_array):
    '''makes an interactive scatter plot with text labels for each point'''

    # Define a dataframe to be used by bokeh context
    bokeh_df = pd.DataFrame(tsne_array, text_labels, columns=['x','y'])
    bokeh_df['text_labels'] = bokeh_df.index

    # interactive controls to include to the plot
    TOOLS="hover, zoom_in, zoom_out, box_zoom, undo, redo, reset, box_select"

    p = figure(tools=TOOLS, plot_width=700, plot_height=700)

    # define data source for the plot
    source = ColumnDataSource(bokeh_df)

    # scatter plot
    p.scatter('x', 'y', source=source, fill_alpha=0.6,
              fill_color="#8724B5",
              line_color=None)

    # text labels
    labels = LabelSet(x='x', y='y', text='text_labels', y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')

    p.add_layout(labels)

    # show plot inline
    output_notebook()
    show(p)

Visualize the vocabulary using the tsne function previously defined

In [49]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

dictionary = Dictionary()
BoW_corpus = dictionary.from_documents(df['job_description'])

vocab = set([item[1] for item in BoW_corpus.items()])

input_vocab =  [word for word in vocab if word in model.wv.key_to_index.keys() and word not in stopwords]
points = len(input_vocab)
X = model.wv[input_vocab]
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X[:points])
interactive_tsne(list(input_vocab)[:points], X_tsne)



### Finding the top 10 words by tfidf score in each document and treating each category as its own corpus

Create_tfidf function creates a dataframe with each row representing a document with corresponding tfidf scores and the columns showing the full corpus vocabulary. Thus the tfidf score for each word in each document can be looked up.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Allows passing tokens to the vectorizer
def dummy_fun(doc):
    return doc

def create_tfidf_df(tokens):  
    tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None) 

    tdf = tfidf.fit_transform(tokens)
    bow = pd.DataFrame(tdf.toarray(), columns = tfidf.get_feature_names_out())

    return bow

Top x tfidf accepts a list of tokenized documents and removes all words that do not have the top x tfidf scores

Removes all words that do not have the top 10 tfidf scores. Treats each category as its own corpus

In [None]:
df['short_description'] = [None for i in range(len(df))]
def top_x_tfidf(df, x):
    for category in df.category.unique():
        tfidf_df = create_tfidf_df(df[df.category == category]['job_description'])
        for i, (idx_row) in enumerate(df[df.category == category].iterrows()):
            idx, row = idx_row
            desc = row['job_description']
            words = set(desc)        
            if len(words) < x: 
                df.loc[idx]['short_description'] = desc
                continue

            # Create a dictionary of words and their tfidf values and then sort them in descending order
            tfidf_words = {word: tfidf_df.iloc[i][word] for word in words}
            tfidf_words = {k: v for k, v in sorted(tfidf_words.items(), key=lambda item: -item[1])}

            for word in desc:
                if word not in list(tfidf_words.keys())[:x]: # Remove all words who are not in the top ten tfidf scores
                    desc = list(filter(lambda x: x != word, desc))
            df.loc[idx]['short_description'] = desc
    return df['short_description']


In [None]:
top_x_tfidf(df, 10)

Split the dataframe into training, testing and validation

In [None]:
train, test_val = train_test_split(df, test_size=0.3)

test, val = train_test_split(test_val, test_size = float(1/3))

# Task 1

Derive target variable

In [None]:
df['job_type_target'] = df['job_type']

In [None]:
df.loc[df['job_type_target'] != 'Full Time', 'job_type_target'] = 'Other'

Analyse class distribution

In [None]:
df['job_type_target'].hist()

In [None]:
# Class 'Full Time' is twice as large as 'Other'. Verify based on literature if such a distribution is too
# imbalanced for this learning task 

# Task 2