In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import math
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))

0 NVIDIA GeForce RTX 3090


In [3]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [1]:
batch_size = 16
n_epochs = 500
model_no = 'transformer_with_word2vec'
exp = 1

In [32]:
class custom_transformer(nn.Module):

    def __init__(self, no_out_vect = 128, num_classes=128, hidden_dim=512, nheads=8, num_encoder_layers=5, 
                 num_decoder_layers=5):
        super(custom_transformer, self).__init__()
        self.hidden_dim = hidden_dim
#         self.transformer = nn.Transformer(hidden_dim, nheads, num_encoder_layers, num_decoder_layers, 
#                                           batch_first=True, activation="relu")
        self.encoder = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nheads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder, num_layers = num_encoder_layers)
        
        self.decoder = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nheads)
        self.transformer_decoder = nn.TransformerEncoder(self.encoder, num_layers = num_encoder_layers)
        
        self.relu_layer = nn.ReLU()
        self.sigmoid_layer = nn.Sigmoid()
        
    def positionalencoding1d(self, d_model, length):
        """
        :param d_model: dimension of the model
        :param length: length of positions
        :return: length*d_model position matrix
        """
        if d_model % 2 != 0:
            raise ValueError("Cannot use sin/cos positional encoding with "
                             "odd dim (got dim={:d})".format(d_model))
        pe = torch.zeros(length, d_model)
        position = torch.arange(0, length).unsqueeze(1)
        div_term = torch.exp((torch.arange(0, d_model, 2, dtype=torch.float) *
                             -(math.log(10000.0) / d_model)))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)

        return pe

    def forward(self, feat_input):
        feat_input = feat_input.flatten(2).permute(0, 2, 1)

        feat_input += self.positionalencoding1d(self.hidden_dim, feat_input.shape[-2]).repeat(feat_input.shape[0], 1, 1)
#         features = self.transformer(feat_input.cuda(), self.learnable_query.repeat(feat_input.shape[0], 1, 1))
        enc_features = self.transformer_encoder(feat_input)
        dec_features = self.transformer_decoder(feat_input, enc_features)
        features = self.linear1(features.flatten(1))
        features = self.sigmoid_layer(features)

        return features

In [33]:
new_model = custom_transformer()
# new_model.to(device)
print("Total trainable params:", torch.nn.utils.parameters_to_vector([p for p in new_model.parameters() if p.requires_grad]).numel())

Total trainable params: 37828608




In [34]:
new_model

custom_transformer(
  (encoder): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (linear1): Linear(in_features=512, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=512, bias=True)
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-4): 5 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
    

In [20]:
# Keeping Stop Words

In [9]:
from torch.utils.data import Dataset, DataLoader

class dataset_loader(Dataset):
    
    def __init__(self, root_dir, corpus_dir, word2vec_dir):
        self.root_dir = root_dir
        self.corpus_dir = corpus_dir
        self.word2vec_all = json.load(self.word2vec_dir)
        self.text = pd.read_csv(self.corpus_dir + 'gen_data.csv')['text']
        self.label = pd.read_csv(self.corpus_dir + 'gen_data.csv')['label']
        
    def __len__(self):
        return 0
    
    def __getitem__(self, idx):
        row_text = self.text[idx].lower().replace('\n', ' ').split()
        row_label = self.label
        word2vec_matrix = []
        for i in row_text:
            word2vec_matrix.append(self.word2vec_all[i])
        for i in range(20000 - len(word2vec_matrix)):
            word2vec_matrix.append(np.zeros(100))
        for i in row_label:
            word2vec_matrix.append(self.word2vec_all[i])
        for i in range(20500 - len(word2vec_matrix)):
            word2vec_matrix.append(np.zeros(100))
            

In [4]:
data = pd.read_excel('/home/abhijeet/Desktop/TRIZ/All_data/CPC Data/generated_data.xlsx')

In [37]:
data['labels']

0         ['A44B19/00', 'A41D10/00', 'A44B19/26', 'A41D1...
1                           ['A41D19/01517', 'A41D19/0062']
2                                 ['A42B3/12', 'A42B3/324']
3         ['E03D1/32', 'E03D5/10', 'B61D35/005', 'E03D5/...
4                            ['E03C1/0401', 'Y10T137/6977']
                                ...                        
175500                                                  NaN
175501                                                  NaN
175502                                                  NaN
175503                                                  NaN
175504                                                  NaN
Name: labels, Length: 175505, dtype: object

In [5]:
text = data['text']
label = data['labels']

In [56]:
eval(label[85410])

['C11D3/14',
 'B05D7/532',
 'C11D17/0013',
 'C11D3/3776',
 'C11D11/0058',
 'B05D7/14',
 'Y10T428/25',
 'C02F2001/422',
 'C11D11/0023',
 'B05D2601/20',
 'B08B3/026',
 'C08K3/346',
 'C02F1/42',
 'B05D3/02',
 'C08K3/22',
 'C09D1/00',
 'B01J39/05',
 'C11D3/1253',
 'C11D3/1213',
 'C02F2001/425',
 'C09D7/68',
 'C11D3/1266',
 'C11D11/0064',
 'C09D7/61',
 'C09D7/67',
 'B05D1/04']

In [57]:
max_len = 0
all_len = 0
for i in label:
#     print(i)
    try:
        i = eval(i)
        if len(i) > max_len:
            max_len = len(i)
        all_len += len(i)
#     break
    except:
        pass
print(max_len)
print(all_len/len(label))

333
5.747141107090966


In [19]:
text[0].lower().replace('\n', ' ').split()

['background',
 '1.',
 'field',
 'of',
 'the',
 'invention',
 'the',
 'present',
 'invention',
 'relates',
 'generally',
 'to',
 'methods',
 'and',
 'systems',
 'for',
 'transforming,',
 'via',
 'a',
 'fastening',
 'mechanism,',
 'a',
 'volume',
 'or',
 'compartment',
 'of',
 'material',
 'into',
 'varied',
 'shapes',
 'or',
 'configurations.',
 'more',
 'particularly,',
 'the',
 'present',
 'invention',
 'relates',
 'to',
 'a',
 'transformable',
 'volume',
 'of',
 'material',
 'that',
 'can',
 'be',
 'selectively',
 'divided',
 'by',
 'a',
 'slider',
 'that',
 'connects',
 'two',
 'sides',
 'of',
 'material',
 'while',
 'simultaneously',
 'disconnecting',
 'two',
 'other',
 'sides',
 'of',
 'material.',
 '2.',
 'background',
 'information',
 'many',
 'existing',
 'systems',
 'that',
 'provide',
 'for',
 'alternative',
 'configurations',
 'of',
 'volumes',
 'of',
 'material',
 'involve',
 'the',
 'use',
 'of',
 'fasteners.',
 'one',
 'popular',
 'type',
 'of',
 'fastener',
 'often',
 '