In [13]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from flair.data import Sentence
from flair.embeddings import TransformerDocumentEmbeddings, WordEmbeddings, DocumentPoolEmbeddings
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore') #don't want warnings in my notebook output on github
tqdm.pandas(desc="tqdm bar!")

In [4]:
def spacy_tokenizer(text):
    tokens = nlp(text)
    # Lemmatizing each token, converting each token into lowercase and removing puncutation and named entities
    tokens = [ word.lemma_.lower().strip() for word in tokens if not word.is_punct and not word.ent_type_ ]
    # Removing stop words
    tokens = [ word for word in tokens if word not in STOP_WORDS ]
    return " ".join(tokens)

def one_hot(x, num_classes):
    one_hot_vec = np.zeros(num_classes, dtype=float)
    one_hot_vec[x] = 1
    return one_hot_vec

def embed_document(document, document_embeddings):
    document = Sentence(document)
    document_embeddings.embed(document)
    return document.embedding

In [5]:
class JobDescDataset(Dataset):
    def __init__(self, df, vec_col="text_vecs"):
        self.df = df
        self.vec_col = vec_col
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        X = self.df[self.vec_col].iloc[idx]
        y = self.df.label_one_hot.iloc[idx]
        return X, torch.from_numpy(y)



class Net(nn.Module):
    def __init__(self, in_features:int, n_classes:int) -> None:
        super(Net, self).__init__()

        self.fc1 = nn.Linear(in_features, int(in_features))
        self.fc2 = nn.Linear(in_features, int(in_features/4))
        self.fc3 = nn.Linear(int(in_features/4), n_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [11]:
# load data
csv_file = 'data\document_type_data.csv'
df = pd.read_csv(csv_file, index_col=[0])
df.text = df.text.map(lambda x: " ".join(eval(x)))

# pre-processing
nlp = spacy.load('en_core_web_sm')   
print("spacy pre-processing")
df["text_pp"] = df.text.progress_apply(spacy_tokenizer)

to_predict = df[df.isnull().any(1)]
df = df.dropna()
# level to int and then one-hot vectors

labels = df.label.unique()
labels_dict = dict(zip(labels, range(len(labels))))
df["class_name"] = df.label
df["label"] = df.class_name.map(lambda x: labels_dict[x])
df["label_one_hot"] = df.label.map(lambda x: one_hot(x, num_classes=len(labels)))

train, test = train_test_split(df.dropna(), random_state=42, stratify=df.label)

spacy pre-processing


tqdm bar!: 100%|██████████| 100/100 [00:08<00:00, 11.73it/s]


### I will refrain from using the Longformer embeddings here since loading the model frequently exceeds the RAM capacity of my computer.

In [14]:
# get text embeddings
embedder = "glove"

# init embedding
if embedder == "longformer":
    document_embeddings = TransformerDocumentEmbeddings('allenai/longformer-base-4096')
elif embedder == "glove":
    # glove pooling = mean
    glove_embedding = WordEmbeddings('glove')
    document_embeddings = DocumentPoolEmbeddings([glove_embedding])
else:
    raise ValueError("embedder has to be either longformer or glove")

train["text_vecs"] = train["text"].progress_apply(lambda x: embed_document(x, document_embeddings))
test["text_vecs"] = test["text"].progress_apply(lambda x: embed_document(x, document_embeddings))

tqdm bar!: 100%|██████████| 75/75 [00:00<00:00, 103.10it/s]
tqdm bar!: 100%|██████████| 25/25 [00:00<00:00, 93.81it/s]


In [31]:
# set up training 
train_dataset = JobDescDataset(train)
test_dataset = JobDescDataset(test)
to_predict_dataset = JobDescDataset(to_predict)

learning_rate = 0.01
batch_size = 20
epochs = 20

training_generator = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
test_generator = DataLoader(test_dataset, batch_size = batch_size, shuffle = True)
predict_generator = DataLoader(to_predict_dataset, batch_size = batch_size, shuffle = False)

net = Net(in_features=train.text_vecs[0].shape[0], n_classes=len(train.label.unique()))
print(net)

optimizer = optim.Adam(net.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

Net(
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=25, bias=True)
  (fc3): Linear(in_features=25, out_features=4, bias=True)
)


In [32]:
# train loop
net.train()
for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(training_generator):
        inputs, labels = data
        inputs, labels = (inputs).type(torch.FloatTensor), (labels)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    if (epoch+1) % 2 == 0 or epoch == 0:
        total_predicted = []
        total_labels = []

        # get f1 score on test data
        with torch.no_grad():
            for data in test_generator:
                inputs, labels = data
                inputs = (inputs).type(torch.FloatTensor)
                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total_predicted.append(predicted.cpu().detach().numpy())

                _, labels = torch.max(labels, 1)
                total_labels.append(labels.cpu().detach().numpy())

        total_predicted = np.hstack(total_predicted)
        total_labels = np.hstack(total_labels)
        f1 = f1_score(total_labels, total_predicted, average='micro')
        
        print(f'----------Epoch {epoch+1} Complete---------')
        print(f"loss: {running_loss/batch_size:.10f}")
        print(f"f1: {f1:.2f}")

----------Epoch 1 Complete---------
loss: 0.2778021210
f1: 0.60
----------Epoch 2 Complete---------
loss: 0.2525592533
f1: 0.84
----------Epoch 4 Complete---------
loss: 0.1369852191
f1: 0.72
----------Epoch 6 Complete---------
loss: 0.0779305924
f1: 0.72
----------Epoch 8 Complete---------
loss: 0.0539636415
f1: 0.88
----------Epoch 10 Complete---------
loss: 0.0387870896
f1: 0.84
----------Epoch 12 Complete---------
loss: 0.0329400704
f1: 0.84
----------Epoch 14 Complete---------
loss: 0.0263550883
f1: 0.80
----------Epoch 16 Complete---------
loss: 0.0216115735
f1: 0.88
----------Epoch 18 Complete---------
loss: 0.0172850475
f1: 0.88
----------Epoch 20 Complete---------
loss: 0.0133499934
f1: 0.92
