In [1]:
import pandas as pd
from tqdm import tqdm
from time import time
from typing import List

from dfply import *
import torch
# from torch.autograd import Variable
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

In [2]:
MAX_EXAMPLES = 10000

In [3]:
vac_skills = pd.read_csv('../../data/skills_coocurrance/seek_vacancies.csv') >> head(MAX_EXAMPLES)
vac_skills.head()

Unnamed: 0,vacancy_id,title
0,1,presentation
1,1,Tax planning
2,1,Xero
3,1,communication
4,2,administrative tasks


In [4]:
res_skills = pd.read_csv('../../data/skills_coocurrance/indeed_resumes.csv') >> head(MAX_EXAMPLES)
res_skills.head()

Unnamed: 0,resume_id,title
0,1,Auto Insurance
1,2,DETAIL-ORIENTED
2,2,MICROSOFT OFFICE
3,3,REAL ESTATE
4,3,ACROBAT


In [5]:
def default_clean(text):
    '''
    Removes default bad characters
    '''
    if not (pd.isnull(text)):
#         # text = filter(lambda x: x in string.printable, text)
#         bad_chars = set(["@", "+", '/', "'", '"', '\\','(',')', '', '\\n', '', '?', '#', ',','.', '[',']', '%', '$', '&', ';', '!', ';', ':',"*", "_", "=", "}", "{"])
#         for char in bad_chars:
#             text = text.replace(char, " ")
#         text = re.sub('\d+', "", text)
        text = text.lower()
        text = re.sub('[^A-Za-z]', ' ', text)
    return text

In [6]:
n_vac_rows = vac_skills.shape[0]
vacancies = vac_skills >> mutate(gr_id = X.vacancy_id) >> select(X.gr_id, X.title)
resumes = res_skills >> mutate(gr_id = X.resume_id + n_vac_rows) >> select(X.gr_id, X.title)

skill_groups = (vacancies >>
     bind_rows(resumes) >> 
     mutate(title = X.title.map(lambda x: str(x).lower()))
    )

skill_groups >> tail()

Unnamed: 0,gr_id,title
9995,13008,"ms office, windows & database software"
9996,13008,retail sales
9997,13008,"branding, sales & marketing"
9998,13008,accounting & finance
9999,13008,csr


In [7]:
#Analysis of the skills frequencies
#freq_skills = skill_groups.groupby('title').aggregate(lambda x: len(x))
#freq_skills = skill_groups >> group_by(X.title) >> summarize(freq = n(X.gr_id)) # >> arrange(desc(X.freq))
#freq_skills = skill_groups >> group_by(X.title) >> summarize(freq = n(X.gr_id)) # >> arrange(desc(X.freq))
#(freq_skills >> mask(X.gr_id > 10) 
#     >> mutate(title_len = X.title.map(lambda t: len(t)))
#     >> mask(len(X.title_len) > 1) 
#     >> head()
# )
df_titles_cnt = pd.DataFrame(
    skill_groups.groupby('title').size(),
    columns=['count']
)
df_titles_cnt.head()
popular_skills  = list(set(df_titles_cnt.query('count >= 10').index))
skills_filter = skill_groups.title.isin(popular_skills)

df_groups_cnt = pd.DataFrame(
    skill_groups.groupby('gr_id').size(),
    columns=['count']
)
active_groups = list(set(df_groups_cnt.query('count >= 3').index))
groups_filter =  skill_groups.gr_id.isin(active_groups)

freq_skill_groups = skill_groups[skills_filter & groups_filter]
freq_skill_groups >> head()


Unnamed: 0,gr_id,title
0,1,presentation
2,1,xero
3,1,communication
7,2,physically fit
9,2,verbal and written


In [8]:
freq_skill_groups.shape

(9496, 2)

In [9]:
df_titles_cnt['title'] = df_titles_cnt.index
df_titles_cnt.reset_index(level=0, drop=True, inplace=True)
df_titles_cnt.head()
popular_skills = df_titles_cnt.query('count > 10')
popular_skills.shape
#freq_skill_groups.shape

(274, 2)

In [10]:
skills_lists_df = freq_skill_groups.groupby('gr_id').aggregate(lambda x: tuple(x))
skills_lists_df.head()

Unnamed: 0_level_0,title
gr_id,Unnamed: 1_level_1
1,"(presentation, xero, communication)"
2,"(physically fit, verbal and written)"
5,"(physically fit,)"
6,"(team player,)"
8,"(communication, welding)"


In [11]:
%time skills_multilist = skills_lists_df['title'].values

CPU times: user 181 µs, sys: 40 µs, total: 221 µs
Wall time: 218 µs


In [12]:
# max_skills = 0
# for skills in texts_multilist:
#     max_skills = max(len(skills), max_skills)
# print(max_skills)
@make_symbolic
def vlen(xv):
    return xv.map(lambda x: len(x))

(skills_lists_df >> mutate(llen = vlen(X.title)) >> head()
    >>  summarize(llm = mean(X.llen))
) >> head()


Unnamed: 0,llm
0,1.8


In [13]:
vocabulary = []
for sentence in skills_multilist:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

# print(vocabulary)

In [14]:
skill_group_lookup = {v: i+1 for i, v in enumerate(freq_skill_groups['gr_id'].unique())}
skill_lookup = {v: i+1 for i, v in enumerate(freq_skill_groups['title'].unique())}

In [15]:
from torch.utils.data import Dataset, DataLoader

class SkillGroupToSkillLevelDataset(Dataset):
    def __init__(self, df: pd.DataFrame, skill_lookup:dict, skill_group_lookup:dict):
        self.df = df
        self.skill_lookup = skill_lookup
        self.skill_group_lookup = skill_group_lookup

    def __getitem__(self, index):
        row = self.df.iloc[index]
        skill_group_id = self.skill_group_lookup[row.gr_id]
        skill_id = self.skill_lookup[row.title]
        
        #as we do not have means to measure the rating let's it be 1
        rating = torch.tensor(1.0, dtype=torch.float32)
        #input is tuple (skill_group_id, skill if) value is rating 1 
        return (skill_group_id, skill_id), rating

    def __len__(self):
        return len(self.df)

In [41]:
from sklearn.model_selection import train_test_split

def get_rand_n_skills_by_skill_group(
    df, n_samples, min_skills_in_sk_gr=1, skill_group_colname="gr_id"
):
    return (
        df.groupby(skill_group_colname)
        .filter(lambda x: len(x) >= min_skills_in_sk_gr)
        .groupby(skill_group_colname)
        .sample(n = n_samples, replace=True)
        .sort_values(skill_group_colname)
    )


# tdf = get_rand_n_skills_by_skill_group(freq_skill_groups, 1)
# tdf.head()


def mark_rand_n_ratings_as_validation_set(
    df, n, min_skills_in_sk_gr=1, skill_group_colname="gr_id"
):

    df["is_valid"] = False
    
    df.loc[
        get_rand_n_skills_by_skill_group(
            df,
            n,
            min_skills_in_sk_gr,
            skill_group_colname=skill_group_colname
        ).index,
        "is_valid",
    ] = True
    return df

freq_skill_groups = mark_rand_n_ratings_as_validation_set(freq_skill_groups, 1)

freq_skill_groups_train = freq_skill_groups[freq_skill_groups.is_valid==False]
freq_skill_groups_valid = freq_skill_groups[freq_skill_groups.is_valid==True]

train_ds = SkillGroupToSkillLevelDataset(freq_skill_groups_train, skill_lookup, skill_group_lookup)
valid_ds = SkillGroupToSkillLevelDataset(freq_skill_groups_valid, skill_lookup, skill_group_lookup)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["is_valid"] = False


In [42]:
import torch
from torch import nn

class MfDotBias(nn.Module):

    def __init__(
        self, n_factors, n_skill_groups, n_skills, ratings_range=None, use_biases=True
    ):
        super().__init__()
        self.bias = use_biases
        self.y_range = ratings_range
        self.user_embedding = nn.Embedding(n_skill_groups+1, n_factors, padding_idx=0)
        self.item_embedding = nn.Embedding(n_skills+1, n_factors, padding_idx=0)

        if use_biases:
            self.user_bias = nn.Embedding(n_skill_groups+1, 1, padding_idx=0)
            self.item_bias = nn.Embedding(n_skills+1, 1, padding_idx=0)

    def forward(self, inputs):
        sk_groups, skills = inputs
        dot = self.user_embedding(sk_groups) * self.item_embedding(skills)
        result = dot.sum(1)
        if self.bias:
            result = (
                result + self.user_bias(sk_groups).squeeze() + self.item_bias(skills).squeeze()
            )

        if self.y_range is None:
            return result
        else:
            return (
                torch.sigmoid(result) * (self.y_range[1] - self.y_range[0])
                + self.y_range[0]
            )


In [43]:
model = MfDotBias(128, len(skill_group_lookup), len(skill_lookup), ratings_range=(0,1))
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 353,976 trainable parameters


In [44]:
import torch.optim as optim
optimizer = optim.Adam(model.parameters())
BATCH_SIZE = 128

criterion = nn.BCELoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device %s" %device)

model = model.to(device)
criterion = criterion.to(device)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE)

Using device cuda


In [45]:
def train_one_epoch(model: MfDotBias, epoch_index: int, train_loader: DataLoader, loss_fn: nn.BCELoss, device: torch.device, 
    optimizer: optim.Optimizer, tb_writer: SummaryWriter):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of
    # iter(training_loader) so that we can track the batch
    # index and do some intra-epoch reporting
    for i, data in tqdm(enumerate(train_loader)):
        # Every data instance is an input + label pair
        inputs, labels = data
        (sk_gr, sk) = inputs
        inputs_d = (sk_gr.to(device), sk.to(device))
        labels_d= labels.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs_d)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels_d)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss

In [46]:
#device
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
tb_summary_writer = SummaryWriter('../../tensorboard/runs/matrix_decomposition_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(model, epoch_number, train_dl, criterion, device, optimizer, tb_summary_writer)

    # We don't need gradients on to do reporting
    model.train(False)

    running_vloss = 0.0
    for i, vdata in enumerate(valid_dl):
        (val_sk_gr, val_sk), vlabels = vdata
        vinputs_d = (val_sk_gr.to(device), val_sk.to(device))
        vlabels_d = vlabels.to(device)
        voutputs = model(vinputs_d)
        vloss = criterion(voutputs, vlabels_d)
        running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    tb_summary_writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    tb_summary_writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        # model_path = '../../models/skills_adj_matrix/model_{}_{}'.format(timestamp, epoch_number)
        model_path = '../../models/skills_adj_matrix/model_best'
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:


49it [00:00, 62.08it/s]


LOSS train 0.0 valid 4.622587203979492
EPOCH 2:


49it [00:00, 62.55it/s]


LOSS train 0.0 valid 4.596617698669434
EPOCH 3:


49it [00:00, 64.02it/s]


LOSS train 0.0 valid 4.5717926025390625
EPOCH 4:


49it [00:00, 67.47it/s]


LOSS train 0.0 valid 4.548688888549805
EPOCH 5:


49it [00:00, 60.95it/s]


LOSS train 0.0 valid 4.527141094207764
