## 1. Install libraries

In [None]:
# Pin transformers to 2.9.0: later cells use argument names from that API
# generation (`lm_labels`, `pad_to_max_length`). NOTE(review): confirm these
# still exist before bumping the version.
!pip install transformers==2.9.0



In [None]:
# Check that a GPU is available and inspect the memory size of the GPU
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



## 2. Prepare Model

In [None]:

import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  """Seed the Python, NumPy and PyTorch random number generators.

  Args:
    seed: Integer seed handed, in this order, to `random.seed`,
      `np.random.seed` and `torch.manual_seed`.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)


# Fix the seed once, before any model or data work, for repeatable runs.
set_seed(42)

In [None]:
# Load the tokenizer and the conditional-generation model from the same
# pretrained checkpoint so their vocabularies agree.
PRETRAINED_CHECKPOINT = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
t5_model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_CHECKPOINT)


In [None]:
# Optimizer: AdamW over two parameter groups, following the usual fine-tuning
# recipe in which biases and layer-norm weights are exempt from weight decay.
#
# T5 names its layer-norm parameters "...layer_norm.weight", so the BERT-style
# "LayerNorm.weight" pattern alone never matches a T5 layer norm. Both
# spellings are listed so those weights land in the exempt group.
#
# WEIGHT_DECAY stays at 0.0, exactly as before, so optimisation is unchanged;
# it is named so that enabling decay later is a one-line edit that correctly
# skips the exempt group.
WEIGHT_DECAY = 0.0
no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]

# Walk the parameters once and split them by whether the name hits a pattern.
named_params = list(t5_model.named_parameters())
decay_params = [p for n, p in named_params if not any(nd in n for nd in no_decay)]
exempt_params = [p for n, p in named_params if any(nd in n for nd in no_decay)]

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": WEIGHT_DECAY},
    {"params": exempt_params, "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)



In [None]:
from IPython.display import HTML, display
def progress(loss, value, max=100):
    """Build an HTML progress bar captioned with the current batch loss.

    Args:
        loss: Loss value shown after the "Batch loss :" caption.
        value: Current progress; used as the bar's ``value`` attribute and as
            its fallback text.
        max: Value that represents completion (the bar's ``max`` attribute).
            The name shadows the builtin but is kept so existing keyword
            callers continue to work.

    Returns:
        An ``IPython.display.HTML`` object; pass it to ``display`` to render.
    """
    # Attributes must be separated by whitespace only. The previous markup
    # ran `value` into `max` and put a comma before `style`, so the browser
    # saw an attribute named ",style" and never applied the width.
    return HTML(""" Batch loss :{loss}
      <progress value='{value}' max='{max}' style='width: 100%'>{value}
      </progress>
             """.format(loss=loss, value=value, max=max))

In [None]:
# Mount Google Drive (Colab only) at /content/drive; the dataset is read from
# there and the fine-tuned model is saved there by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Read the spreadsheet from the mounted Drive and keep only the columns the
# rest of the notebook uses.
# NOTE(review): the saved output under this cell lists Job_Title/Skill columns
# rather than Company/Location; it looks stale. Re-run to confirm.
WHYUS_XLSX = '/content/drive/My Drive/JobDescriptionPrediction/Resources/Data/interim/whyus_dataset.xlsx'
KEPT_COLUMNS = ['JobID', 'Company', 'Location', 'Output']

job_data = pd.read_excel(WHYUS_XLSX)[KEPT_COLUMNS]
job_data.head()

Unnamed: 0,JobID,Job_Title,Skill,Output
0,1,data scientist,"sap, sql",['any experience in statistical modeling field...
1,2,data scientist,"machine learning, r, sas, sql, python","['spss, sas, stata, r) required.experience wit..."
2,3,data scientist,"data mining, data management, r, sas, sql, sta...",['ddi focuses on developing the workforce with...
3,4,graduate studies program data scientist,certified internal auditor,['[assist in consultations with business partn...
4,5,data scientist i,"statistical software, time management, r, micr...",['[collecting and combining data from multiple...


In [None]:
# Build the model's input text as "<Company> <Location>". The column is still
# called 'vocab_skills' even though it holds no skills; the name is kept
# because the next cell selects it.
# NOTE(review): a missing Company or Location presumably makes the result NaN,
# so such rows would be removed by the later dropna — confirm.
job_data['vocab_skills'] = job_data['Company'] + " " + job_data['Location'] 

## 3. Train Loop

In [None]:
# Keep only the (input text, target text) pair and discard any row in which
# either one is missing; the trailing expression shows the remaining size.
pair_columns = ['vocab_skills', 'Output']
training_data = job_data[pair_columns].dropna(axis=0)
training_data.shape

(977, 2)

In [None]:
# One (input text, target text) tuple per row, in DataFrame order.
tuples = list(map(tuple, training_data.to_numpy()))


In [None]:
# Fine-tune T5 one example at a time (batch size 1) and checkpoint the model
# to Drive at the end of every epoch.
t5_model.train()

epochs = 4
epoch_count = 0

for epoch in range(epochs):
  print ("epoch ",epoch)
  # `source_text` / `target_text` replace the old loop names `input` / `output`,
  # which shadowed the builtin `input` for the rest of the notebook and were
  # rebound to the model result inside the loop.
  for i, (source_text, target_text) in enumerate(tuples, start=1):
    print('     iteration :: ',i)
    input_sent = 'Skill: ' + source_text + " </s>"
    output_sent = target_text + " </s>"

    # The source is encoded at its natural length (a single example needs no
    # padding); the target is padded to a fixed 512 tokens.
    tokenized_inp = tokenizer.encode_plus(input_sent, return_tensors="pt")
    tokenized_output = tokenizer.encode_plus(output_sent, max_length=512, pad_to_max_length=True, return_tensors="pt")

    input_ids = tokenized_inp["input_ids"]
    attention_mask = tokenized_inp["attention_mask"]
    decoder_attention_mask = tokenized_output["attention_mask"]

    # Padding positions must not contribute to the loss. Labels equal to -100
    # are ignored by the model's cross-entropy, whereas raw pad ids would
    # train the model to emit padding for most of the 512 positions.
    target_ids = tokenized_output["input_ids"]
    lm_labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

    # the forward function automatically creates the correct decoder_input_ids
    outputs = t5_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        lm_labels=lm_labels,
        decoder_attention_mask=decoder_attention_mask,
    )
    loss = outputs[0]

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

  # Checkpoint after each epoch so an interrupted session keeps its progress.
  t5_model.save_pretrained("/content/drive/My Drive/JobDescriptionPrediction/Resources/model/T5-Aboutcompany")
  epoch_count = epoch_count + 1




epoch  0
     iteration ::  1
     iteration ::  2
     iteration ::  3
     iteration ::  4
     iteration ::  5
     iteration ::  6
     iteration ::  7
     iteration ::  8
     iteration ::  9
     iteration ::  10
     iteration ::  11
     iteration ::  12
     iteration ::  13
     iteration ::  14
     iteration ::  15
     iteration ::  16
     iteration ::  17
     iteration ::  18
     iteration ::  19
     iteration ::  20
     iteration ::  21
     iteration ::  22
     iteration ::  23
     iteration ::  24
     iteration ::  25
     iteration ::  26
     iteration ::  27
     iteration ::  28
     iteration ::  29
     iteration ::  30
     iteration ::  31
     iteration ::  32
     iteration ::  33
     iteration ::  34
     iteration ::  35
     iteration ::  36
     iteration ::  37
     iteration ::  38
     iteration ::  39
     iteration ::  40
     iteration ::  41
     iteration ::  42
     iteration ::  43
     iteration ::  44
     iteration ::  45
     itera

In [None]:
# Write the fine-tuned model to Drive; this is the same directory the training
# loop saves to at the end of every epoch.
t5_model.save_pretrained("/content/drive/My Drive/JobDescriptionPrediction/Resources/model/T5-Aboutcompany")

## 4. Test model

In [None]:
# Smoke test 1: prompt the fine-tuned model with a hand-written input and
# print the top beam, keeping special tokens visible in the decoded text.
test_sent = 'Skill: data scientist sap sql python java</s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")
test_input_ids, test_attention_mask = test_tokenized["input_ids"], test_tokenized["attention_mask"]

# Switch the model to evaluation mode before generating.
t5_model.eval()

# Beam-search settings: 10 beams, one returned sequence, no repeated bigrams.
generation_kwargs = dict(
    max_length=200,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
)
beam_outputs = t5_model.generate(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    **generation_kwargs
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(
        beam_output,
        skip_special_tokens=False,
        clean_up_tokenization_spaces=True,
    )
    print (sent)

In [None]:
# Smoke test 2: a second hand-written prompt with a shorter generation budget;
# special tokens are stripped from the printed text this time.
test_sent = 'Skill: data engineer machine learning, r, sas, sql, python </s>'
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")
test_input_ids, test_attention_mask = test_tokenized["input_ids"], test_tokenized["attention_mask"]

# Switch the model to evaluation mode before generating.
t5_model.eval()

# Beam-search settings: 10 beams, one returned sequence, no repeated bigrams.
generation_kwargs = dict(
    max_length=100,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
)
beam_outputs = t5_model.generate(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    **generation_kwargs
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(
        beam_output,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print (sent)