In [67]:
import torch
import pandas as pd
import json
import numpy as np
from transformers import BertTokenizer

In [64]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report the number of visible GPUs and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # No CUDA device visible to PyTorch.
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1080


In [65]:
# Load the question dump. The file content is decoded with `json.load` and the
# decoded value is then passed to `pd.read_json`.
# NOTE(review): this two-step decode presumably means the file holds a
# JSON-encoded string of the table; confirm against the file on disk.
with open("../../datasets/post-questions.json", 'r') as questions_file:
    decoded_payload = json.load(questions_file)
df = pd.read_json(decoded_payload)

# Keep only the columns used downstream and give them dunder-style names.
column_aliases = {"id": "__id__", "title": "__title__", "tags": "__tags__"}
questions = df[list(column_aliases)].rename(columns=column_aliases)

print(f"Number of training questions: {len(questions)}")
# Show a random handful of rows as the cell output.
questions.sample(5)

Number of training questions: 10000


Unnamed: 0,__id__,__title__,__tags__
302,61925401,How to make typescript transfer .pug file to c...,typescript|firebase|google-cloud-functions|pug
7625,61979054,Why does it show Jetty?,java|scala|jetty|http4s
1938,61828582,Assign service principal Admin Role on Service...,azure-service-fabric
164,61903553,Adding custom properties for each request in A...,asp.net|vb.net|azure-application-insights|c#-t...
2619,61920007,How to make Jest spOn second call of function,javascript|node.js|unit-testing|jestjs


In [66]:
# Build the categorical targets we will predict: one 0/1 indicator column per
# unique tag and one row per question, so each title can be assigned to every
# tag it carries. The result is a very wide frame (one column per distinct tag).

# Split each "a|b|c" tag string once and reuse the lists for both passes below.
tag_lists = [group.split('|') for group in questions.__tags__.values]

# Unique tags in first-seen order. Dict keys give constant-time membership
# checks (a `tag not in list` scan is quadratic) and preserve insertion order.
tags = list(dict.fromkeys(tag for tag_list in tag_lists for tag in tag_list))
print(f"The number of unique tags is {len(tags)}")

# Fill a dense (n_questions, n_tags) matrix by *position*. Writing through
# `questions[tag][index] = 1` is chained assignment: it raises
# SettingWithCopyWarning, does nothing under pandas copy-on-write, and treats
# the positional counter as an index label.
tag_column = {tag: column for column, tag in enumerate(tags)}
indicators = np.zeros((len(tag_lists), len(tags)), dtype=np.int64)
for row, tag_list in enumerate(tag_lists):
    indicators[row, [tag_column[tag] for tag in tag_list]] = 1

# Attach every indicator column in one concat instead of thousands of
# single-column inserts (which fragment the frame). Dropping any existing tag
# columns first keeps the cell safe to re-run.
questions = pd.concat(
    [
        questions.drop(columns=tags, errors="ignore"),
        pd.DataFrame(indicators, columns=tags, index=questions.index),
    ],
    axis=1,
)

The number of unique tags is 5308


In [79]:
# Model inputs: the question titles as an array.
titles = questions.__title__.values

# Everything except the id/title/tags columns, i.e. the per-tag indicator columns.
classification = questions.drop(["__id__", "__title__", "__tags__"], axis=1)

# labels is a list of lists: labels[i][j] is the indicator value of tag column j
# for question i. It was previously left as an empty list, so the later
# `torch.tensor(labels)` produced an empty tensor with no targets in it.
labels = classification.values.tolist()


linux  split  rename  nested-loops  api  symfony4  sonata-admin  django  \
0      1      1       1             1    0         0             0       0   

   struct  javascript  ...  sub-array  xcode11.4  hibernate-mapping  \
0       0           0  ...          0          0                  0   

   google-cloud-endpoints-v2  office365-apps  rhandsontable  condor  \
0                          0               0              0       0   

   augmented-reality  agora.io  reflect  
0                  0         0        0  

[1 rows x 5308 columns]


In [72]:
# Load the pretrained tokenizer for 'bert-base-uncased'; do_lower_case=True is
# passed so input text is lower-cased to match the uncased checkpoint name.
print("Loading BERT Tokenizer")
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

Loading BERT Tokenizer


In [73]:
# Sanity check: walk the first title through the tokenizer step by step.
sample_title = titles[0]

# The raw title text.
print(' Original: ', sample_title)

# The title split into tokens (tokenized once and reused below).
sample_tokens = tokenizer.tokenize(sample_title)
print('Tokenized: ', sample_tokens)

# The same tokens mapped to their vocabulary ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

Original:  rename files which produced by split
Tokenized:  ['ren', '##ame', 'files', 'which', 'produced', 'by', 'split']
Token IDs:  [14916, 14074, 6764, 2029, 2550, 2011, 3975]


In [76]:
# Encoded length (special tokens included) of every title.
token_counts = [
    len(tokenizer.encode(title, add_special_tokens=True))
    for title in titles
]

# Pad length: at least 64, at most 512, otherwise the longest encoded title.
longest = max([64, *token_counts])
max_len = min(longest, 512) #BERT's maximum input length
print(f"Max sentence length is {max_len} tokens")

Max sentence length is 64 tokens


In [None]:
# Encode every title. For each one, `encode_plus` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate the sentence to `max_length`.
#   (6) Create attention masks for [PAD] tokens.
# NOTE(review): newer transformers releases deprecate `pad_to_max_length` in
# favour of `padding='max_length'`; confirm against the installed version
# before changing the call.
encoded_titles = [
    tokenizer.encode_plus(
        title,
        add_special_tokens=True,
        max_length=max_len,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    for title in titles
]

# Stack the per-title tensors along dim 0 to get one row per title:
# token ids, and the mask that differentiates padding from non-padding.
input_ids = torch.cat([enc['input_ids'] for enc in encoded_titles], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encoded_titles], dim=0)

# Convert the labels into a tensor as well.
labels = torch.tensor(labels)

