In [None]:
import pandas as pd
import unicodedata
import ftfy
import gensim
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw job postings from the CSV file and display the resulting frame.
df = pd.read_csv(filepath_or_buffer='./data/seek_australia.csv')
df

# Data pre-processing

In [None]:
# Columns that are not needed for the later tasks are removed up front.
features_to_delete = ['job_board', 'geo', 'url']
df = df.drop(columns=features_to_delete)

The job description is required for every task in this assignment, so rows where it is missing are dropped.

In [None]:
# Report how many postings have no description, then discard those rows.
missing_descriptions = df['job_description'].isna().sum()
print(missing_descriptions)
df = df.dropna(subset=['job_description'])

Normalize the Unicode text of each job description, repair encoding errors, and tokenize it into a list of lowercase tokens with accents stripped.

In [None]:
def _tokenize_description(text):
    """Turn one raw job description into a list of cleaned tokens.

    The text is NFKD-normalized, passed through ``ftfy.fix_text`` and then
    tokenized with ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Parameters
    ----------
    text : str or list
        Raw description text, or a token list produced by an earlier run.

    Returns
    -------
    list
        The tokens of the description. A value that is already a list is
        returned unchanged so that re-running the cell does not fail.
    """
    if isinstance(text, list):
        return text
    normalized = unicodedata.normalize("NFKD", text)
    return gensim.utils.simple_preprocess(ftfy.fix_text(normalized), deacc=True)


# Map the helper over the whole column instead of writing list values cell by
# cell through `df.loc[i, col] = ...`: a single column assignment is faster and
# avoids pandas treating each token list as an iterable to broadcast.
df["job_description"] = df["job_description"].map(_tokenize_description)

The `create_tfidf_df` function builds a dataframe in which each row represents a document and each column a word of the full corpus vocabulary. Each cell holds the TF-IDF score of that word in that document, so the score of any word in any document can be looked up.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    """Identity hook that hands a document back untouched.

    It is passed to the vectorizer as both tokenizer and preprocessor so
    that documents which are already token lists go through unchanged.
    """
    # Allows passing tokens to the vectorizer
    return doc

def create_tfidf_df(tokens):
    """Build a document-by-vocabulary table of TF-IDF scores.

    Parameters
    ----------
    tokens : iterable
        The documents to vectorize, each one already split into tokens.

    Returns
    -------
    pandas.DataFrame
        A dense frame with one row per input document and one column per
        vocabulary term reported by the fitted vectorizer.
    """
    # The documents are already tokenized, so the tokenizer and preprocessor
    # are identity functions and the default token pattern is switched off.
    vectorizer = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        token_pattern=None,
    )

    scores = vectorizer.fit_transform(tokens)
    vocabulary = vectorizer.get_feature_names_out()
    return pd.DataFrame(scores.toarray(), columns=vocabulary)

In [None]:
# TF-IDF score table for the tokenized job descriptions.
tfidf_bow = create_tfidf_df(tokens=df['job_description'])

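The `top_x_tfidf` function accepts a list of tokenized documents and, within each document, keeps only the words with the x highest TF-IDF scores; all other words are removed. Documents with fewer than x distinct words are left unchanged.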

In [None]:
def top_x_tfidf(descriptions, tfidf_df, x):
    """Keep only each document's ``x`` highest-scoring TF-IDF words.

    Parameters
    ----------
    descriptions : iterable of list of str
        Tokenized documents. The document at position ``i`` is scored with
        the row at position ``i`` of ``tfidf_df``.
    tfidf_df : pandas.DataFrame
        One row per document and one column per word, holding TF-IDF scores.
    x : int
        Number of distinct words to keep per document.

    Returns
    -------
    list
        One entry per document, in input order. A document with fewer than
        ``x`` distinct words is returned as is; otherwise the entry is a new
        list holding only the occurrences of its ``x`` best-scoring words,
        in their original order.

    Raises
    ------
    KeyError
        If a word of a filtered document has no column in ``tfidf_df``.
    """
    new_descriptions = []
    for idx, desc in enumerate(descriptions):
        # De-duplicate while keeping first-occurrence order, so that words
        # with equal scores are ranked deterministically (iterating a set
        # would make tie-breaking depend on the hash seed).
        unique_words = list(dict.fromkeys(desc))
        if len(unique_words) < x:
            new_descriptions.append(desc)
            continue

        # Fetch this document's score row once rather than once per word.
        scores = tfidf_df.iloc[idx]

        # Stable sort, highest score first; ties keep first-occurrence order.
        ranked = sorted(unique_words, key=lambda word: -scores[word])
        top_words = set(ranked[:x])

        # Single pass over the document with O(1) membership checks.
        new_descriptions.append([word for word in desc if word in top_words])
    return new_descriptions


In [None]:
# Reduce every description to its 10 highest-scoring TF-IDF words and show the result.
df['short_description'] = top_x_tfidf(
    descriptions=df['job_description'],
    tfidf_df=tfidf_bow,
    x=10,
)
df['short_description']

Split the dataframe into training (70%), test (20%) and validation (10%) sets.

In [None]:
# Fixed seed so that the split, and every result computed from it, is the same
# on each "Restart & Run All".
SPLIT_SEED = 42

# NOTE(review): this split runs before `job_type_target` is added to `df`
# further down, so these subsets will not carry that column unless the split
# is repeated afterwards -- confirm this is intended.

# 70% of the rows go to training; the remaining 30% is divided 2:1 into
# test (20% overall) and validation (10% overall).
train, test_val = train_test_split(df, test_size=0.3, random_state=SPLIT_SEED)
test, val = train_test_split(test_val, test_size=1 / 3, random_state=SPLIT_SEED)

# Task 1

Derive target variable

In [None]:
# Start the target from a copy of the raw job type; it is recoded in the next cell.
df['job_type_target'] = df['job_type'].copy()

In [None]:
# Binary target: everything that is not exactly 'Full Time' becomes 'Other'.
is_full_time = df['job_type_target'] == 'Full Time'
df.loc[~is_full_time, 'job_type_target'] = 'Other'

Analyse class distribution

In [None]:
# The target is categorical, so plot one bar per class from the class counts
# rather than a histogram, which bins values as if they were numeric.
class_counts = df['job_type_target'].value_counts()
ax = class_counts.plot.bar(rot=0, title='Class distribution of job_type_target')
ax.set_xlabel('job_type_target')
ax.set_ylabel('Number of job postings')
class_counts

In [None]:
# TODO: Class 'Full Time' is roughly twice as large as 'Other'. Check the literature to confirm
# whether a 2:1 class ratio is too imbalanced for this learning task.

# Task 2