Notebook to try CNN and DistilBERT models on the dataset

In [14]:
#imports
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig, AdamW
import torch

In [10]:
# Read the JSON dataset into a DataFrame and preview its first rows
data = pd.read_json(path_or_buf='../data/jsonformatter.json')
data.head()

Unnamed: 0,text,label
0,"db.mycol.find({$and:[{""by"":""tutorials point""},...",0
1,"db.collection('users').findOne({""""username"""": ...",1
2,"db.Document.find({ ""type"": { ""$gte"": """" } })",1
3,"db.Document.find({ ""type"": { ""$ne: 0 """" } })",1
4,"db.books.insert({ title: 'The Hobbit', author:...",0


#### Preprocess the data

In [11]:
# Count the missing entries in each column and print the per-column totals
missing_per_column = data.isna().sum()
print(missing_per_column)

text     0
label    0
dtype: int64


In [12]:
# Tokenize every query in data['text'] with the DistilBERT tokenizer.
# Produces three tensors used by the following cells:
#   input_ids        - token ids, one row per query
#   attention_masks  - attention mask returned by the tokenizer for each row
#   labels           - values of data['label']

# Also imported in the notebook's first cell; repeated here so this cell
# still runs when executed on its own.
from transformers import DistilBertTokenizer
import torch

# Every query is padded or truncated to exactly this many tokens.
MAX_LENGTH = 64

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the whole column in one batched call instead of one encode_plus call
# per row. `padding='max_length'` replaces the deprecated
# `pad_to_max_length=True`, and `truncation=True` makes explicit the
# truncation the tokenizer was otherwise applying implicitly (with a warning).
encoded = tokenizer(
    data['text'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(data['label'].values)

# Sanity checks: one fixed-length row per query, one label per query.
assert input_ids.shape == attention_masks.shape == (len(data), MAX_LENGTH)
assert len(labels) == len(data)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Split the data into training and validation sets

In [13]:
# Split the data into training (80%) and validation (20%) subsets.
from sklearn.model_selection import train_test_split

# A single call applies the same shuffle to all three tensors, so each row of
# input ids stays paired with its own attention mask and label. Splitting the
# masks in a separate, unseeded call would shuffle them differently and pair
# every input with some other row's mask.
# random_state is fixed so the split is reproducible across kernel restarts.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    test_size=0.2,
    random_state=42,
)

# Sanity checks: the three tensors of each subset must have matching lengths.
assert len(train_inputs) == len(train_masks) == len(train_labels)
assert len(val_inputs) == len(val_masks) == len(val_labels)


Create the DataLoaders for the training and validation sets

In [15]:
# Number of examples per batch for both loaders
batch_size = 32

# Bundle (input ids, attention mask, label) triples into datasets
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation keeps the stored order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

# Batch iterators for training and validation
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)
val_dataloader = DataLoader(
    dataset=val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

 Model Training and Evaluation