In [1]:
import numpy as np
import pandas as pd
from string import punctuation
from collections import Counter
import torch
from torch.utils.data import TensorDataset, DataLoader

## Load Data Set

In [2]:
path = '..\data'
df = pd.read_parquet(path+'\\data_scientist_merged_01_09_2019.parquet')

In [3]:
df.head(2)

Unnamed: 0,closing,company,contract_time,contract_type,description,disability_confident,id,isco_code,location,postcode,posted,salary,salary_base,salary_info,title,url
0,2019-05-12T15:02:30,Hays Specialist Recruitment,full_time,permanent,"Data Scientist - Python, Machine Learning, Clo...",,2038758,2,"Bromley, Kent, BR11DP",BR11DP,2019-04-12T15:02:30,"£55,000.00 to £65,000.00 per year",,,Data Scientist,https://findajob.dwp.gov.uk/details/2038758?ut...
1,2019-05-10T10:42:14,Technojobs,full_time,permanent,"Data Scientist Salary £55,788 to £59,297 (depe...",,2016089,2,London,,2019-04-10T10:42:14,,,,Data Scientist,https://findajob.dwp.gov.uk/details/2016089?ut...


In [4]:
df.iloc[2].description

"Data Scientists Required - Machine Learning - Bristol - 60k Your new company I am working with an exciting organisation; a leading provider of scientific instrumentation for the measurement of elemental concentrations, crystallographic structure, molecular structure and more. They are looking to bring on a dynamic team of Data Scientists to fill their new Bristol office. Your new role As a Data Scientist you will provide technical expertise in the discovery of information buried within large amounts of data and to build machine learning models trained on example data sets that can solve advanced classification and regression problems. This is to facilitate the delivery of next generation products and services, aligned with the company's vision of moving towards predictive and prescriptive solutions which deliver significant new value to our customers. What you'll need to succeed The successful candidate will have a degree level or equivalent experience in Mathematics or Computer Scien

## Data Prep

In [5]:
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


##### flatten description field

In [6]:
corpora = ''
for key,text in df.description.items():
    corpora+='\n'+text

###### remove punct from flattened text

In [7]:
corpora = corpora.lower() # lowercase, standardize
corpora = [letter for letter in corpora if letter not in punctuation] 
all_text = ''.join(corpora)

In [8]:
all_text[0:1000]

'\ndata scientist  python machine learning cloud what is the role this is a data scientist position for a leading insurance company based in bromley theyre looking for their next data scientist to work alongside the head of analytics  data science to help drive machine learning forwards across the wider business you will be working to figure out which algorithms are the most robust and scalable for each situation to ensure that this leading brand are making insurance easier and better value for their customers you will be applying groundbreaking technology to their business problems  always staying up to date with machine learning technologies what do you need to succeed handson python experience building productionised data science pipelines experience using open source machine learning packages and frameworks scikitlearn tensorflow experience using cloud based tech aws gcp experience working across internal or external consulting to solve business challenges experience shaping best p

#### count words

In [9]:
words = all_text.split() 

In [10]:
len(words)

1437775

In [11]:
words[0]

'data'

In [12]:
word_counts_dict = Counter(words) # get word counts
word_counts = {k: v for k, v in sorted(word_counts_dict.items(), reverse=True,key=lambda item: item[1])} # sort by counts

In [13]:
keys = list(word_counts.keys())
keys[0:10]

['and', 'the', 'to', 'of', 'a', 'in', 'data', 'with', 'for', 'will']

#### Convert words to int

In [14]:
#word to int dictionary
word_to_int = {}
int_ = 1
for key,value in word_counts.items():
    word_to_int[key] = int_
    int_+=1

In [15]:
descriptions = all_text.split('\n')
descriptions = descriptions[1:] # first one is an error
print(len(descriptions))
print(len(df.description))

2934
2934


In [16]:
# convert words  into numbers
descs_ints = []
for desc in descriptions:
    current_int =[]
    words = desc.split()
    for word in words:
        try:
            current_int.append(word_to_int[word])
        except Exception as e:
            #review_int.append(0) # replace the removed words with a 0
            print(e)
    descs_ints.append(current_int)

In [17]:
# print tokens in first description
print('Tokenized review: \n', descs_ints[:1])

Tokenized review: 
 [[7, 175, 96, 60, 39, 234, 81, 17, 2, 37, 25, 17, 5, 7, 175, 183, 9, 5, 132, 743, 69, 110, 6, 10968, 3735, 89, 9, 46, 379, 7, 175, 3, 24, 583, 2, 612, 4, 54, 7, 26, 3, 71, 177, 60, 39, 4542, 48, 2, 411, 29, 11, 10, 14, 32, 3, 6457, 200, 101, 325, 21, 2, 225, 725, 1, 775, 9, 697, 4689, 3, 122, 31, 25, 132, 994, 21, 271, 743, 2777, 1, 308, 218, 9, 46, 171, 11, 10, 14, 393, 2351, 68, 3, 46, 29, 182, 954, 4082, 180, 3, 74, 8, 60, 39, 125, 81, 112, 11, 115, 3, 719, 603, 96, 16, 142, 6132, 7, 26, 461, 16, 61, 420, 628, 60, 39, 713, 1, 568, 1492, 1513, 16, 61, 234, 110, 599, 419, 1933, 16, 32, 48, 248, 19, 264, 811, 3, 474, 29, 363, 16, 1977, 170, 4543, 6, 7, 26, 9, 5, 5000, 4690, 29, 19, 911, 81, 10, 11, 293, 6, 518, 25, 17, 22, 1545, 1014, 3, 130, 5, 132, 1780, 994, 6, 5, 2624, 1035, 373, 4, 2, 29, 11, 10, 14, 34, 2, 383, 4, 5, 23, 45, 2, 29, 79, 134, 72, 5, 1878, 193, 20, 619, 25, 69, 4542, 6, 68, 11, 10, 293, 2, 1014, 3, 24, 583, 93, 143, 51, 30, 288, 57, 33, 776, 32, 

##### remove zero lenght ones

In [18]:
# only check for zero lenght reviews at this point
desc_lengths = np.array([len(desc) for desc in descs_ints])
zero_index = np.argwhere(desc_lengths==0)
zero_index

array([], shape=(0, 1), dtype=int64)

In [19]:
np.mean(desc_lengths),np.median(desc_lengths),np.max(desc_lengths),np.min(desc_lengths)

(490.03919563735514, 492.0, 1804, 54)

##### pad and truncate

In [20]:
def pad_trun(descs, seq_length):
    
    features = np.zeros((len(descs),seq_length),dtype=int)
    for j, desc in enumerate(descs):
        features[j,-len(desc):] = desc[:seq_length]
    return features

In [21]:
seq_length = 450

features = pad_trun(descs_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(descs_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."
# print first 10 values of the first 10 batches 
print(features[:10,:10])

[[   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    0    0]
 [ 130   88    6 1357   15    5    7  175 5612   37]
 [   2   29  130   88    6 1357   15    5    7  175]
 [   2   29  130   88    6 1357   15    5    7  175]
 [   2   29  130   88    6 1357   15    5    7  175]]


##### split train validation sets

In [22]:
train_size_frac = 0.9
train_size = int(train_size_frac*len(features))
train_x, valid_x = features[:train_size],features[train_size:]
train_y = np.ones(train_x.shape[0])
valid_y = np.ones(valid_x.shape[0])

In [23]:
train_x.shape,valid_x.shape,valid_y.shape,train_y.shape

((2640, 450), (294, 450), (294,), (2640,))

#### Convert to tensor

In [24]:
train_data = TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(valid_x),torch.from_numpy(valid_y))

In [25]:
# dataloaders
batch_size = 50

# shuffle sets 
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

In [26]:
dataiter = iter(train_loader)
sample_x = dataiter.next()
print(sample_x[0].shape)

torch.Size([50, 450])


#### Model

In [67]:
import torch.nn as nn
import torch.nn.functional as F

# define the NN architecture
class Autoencoder(nn.Module):
    def __init__(self, input_size,embedding_dim,vocab_size,encoding_dim):
        super(Autoencoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        ## encoder ##
        
        self.fc1 = nn.Linear(embedding_dim, encoding_dim)
        
        ## decoder ##
        
        self.fc2 = nn.Linear(encoding_dim, embedding_dim)
        self.fc3 = nn.Linear(embedding_dim,input_size)
        

    def forward(self, x):
        x=x.long()
        # add layer, with relu activation function
        x = self.embedding(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # output layer (sigmoid for scaling from 0 to 1)
        x = F.sigmoid(self.fc3(x))
        return x

# initialize the NN
input_dim = 450
embedding_dim = 100
encoding_dim = 32
vocab_size = len(word_to_int)+1 # +1 for the 0 padding + our word tokens
model = Autoencoder(input_dim,embedding_dim,vocab_size,encoding_dim)
print(model)


Autoencoder(
  (embedding): Embedding(25262, 100)
  (fc1): Linear(in_features=100, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=450, bias=True)
)


In [68]:
# specify loss function
criterion = nn.MSELoss()

# specify loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [73]:
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
sample_x
y = model(sample_x)
y.shape

torch.Size([50, 450, 450])

In [108]:
y1 = torch.max(y,dim=1, keepdim=False)
y1[0].shape
y1[0]

tensor([[0.5407, 0.5088, 0.5020,  ..., 0.5761, 0.5368, 0.5071],
        [0.5313, 0.5174, 0.5093,  ..., 0.5790, 0.5368, 0.5216],
        [0.5229, 0.5091, 0.5044,  ..., 0.5635, 0.5368, 0.5216],
        ...,
        [0.5287, 0.5149, 0.5059,  ..., 0.5761, 0.5368, 0.5220],
        [0.5343, 0.5149, 0.5019,  ..., 0.5761, 0.5368, 0.5216],
        [0.5522, 0.5140, 0.5208,  ..., 0.5642, 0.5368, 0.5078]],
       grad_fn=<MaxBackward0>)

In [None]:
# number of epochs to train the model
n_epochs = 20

for epoch in range(1, n_epochs+1):
    # monitor training loss
    train_loss = 0.0
    
    ###################
    # train the model #
    ###################
    for data in train_loader:
        # _ stands in for labels, here
        descriptions,_ = data
        descriptions= descriptions.view(descriptions.size(0), -1)
        # clear the gradients of all optimized variables
        descriptions = torch.autograd.Variable(descriptions)
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        outputs = model(descriptions)
        outputs = torch.max(outputs,dim=1)
        # calculate the loss
        loss = criterion(outputs[0], descriptions.float())
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss.item()*descriptions.size(0)
            
    # print avg training statistics 
    train_loss = train_loss/len(train_loader)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch, 
        train_loss
        ))


Epoch: 1 	Training Loss: 308110183.018868
Epoch: 2 	Training Loss: 308110181.320755
Epoch: 3 	Training Loss: 308110181.981132
Epoch: 4 	Training Loss: 308110181.792453
Epoch: 5 	Training Loss: 308110181.132075
Epoch: 6 	Training Loss: 308110180.188679
