In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import shutil
import sys
import re
import html
import string
import matplotlib.pyplot as plt
from tqdm import tqdm 

In [2]:
train = pd.read_csv("/kaggle/input/kriti-private/test.csv")
train.head()

Unnamed: 0,Id,Title,Abstract
0,30332,Pricing FX Options under Intermediate Currency,We suggest an intermediate currency approach t...
1,50337,A Multicore Processor based Real-Time System f...,In this paper we propose an Intelligent Manage...
2,66515,Perceptual Quality Improvement in Videoconfere...,"In the latest years, videoconferencing has tak..."
3,57464,Hundred-Kilobyte Lookup Tables for Efficient S...,Conventional super-resolution (SR) schemes mak...
4,43169,Efficient Sequence Labeling with Actor-Critic ...,Neural approaches to sequence labeling often u...


In [3]:
labels = ['math.AT', 'stat.AP', 'cs.AR', 'math.QA', 'q-bio.MN', 'eess.AS',
       'eess.IV', 'stat.ME', 'econ.GN', 'eess.SP', 'q-fin.RM', 'cs.LG',
       'cs.CR', 'q-bio.BM', 'q-fin.GN', 'q-fin.MF', 'q-fin.PR', 'math.CV',
       'cs.LO', 'econ.TH', 'math.CO', 'cs.AI', 'math.AC', 'q-bio.CB',
       'q-fin.CP', 'cs.CL', 'cs.DC', 'math.LO', 'math.NT', 'cs.SD', 'q-fin.TR',
       'cs.CV', 'stat.ML', 'q-fin.EC', 'econ.EM', 'cs.CE', 'stat.CO',
       'math.PR', 'q-bio.NC', 'math.AP', 'cs.OS', 'cs.NI', 'cs.IT', 'cs.PL',
       'cs.GT', 'cs.DM', 'math.IT', 'cs.SE', 'cs.RO', 'stat.TH', 'cs.DB',
       'math.ST', 'q-bio.GN', 'q-fin.PM', 'q-bio.TO', 'math.GR', 'cs.IR']

In [4]:
def to_lowercase(text):
    return text.lower()

def remove_punctuation(text):
    """Remove punctuation from list of tokenized words"""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def replace_numbers(text):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    return re.sub(r'\d+', '', text)

def remove_whitespaces(text):
    return text.strip()

def clean_text( text):
    #text = remove_special_chars(text)
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = replace_numbers(text)
    
    return ''.join(text)

In [5]:
train['Abstract'] = train['Abstract'].apply(lambda x: clean_text(x))

In [6]:
!pip install transformers



In [7]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel, AutoConfig

In [8]:
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/tokenizer/kaggle/working/')

In [9]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [10]:
class ImprovedBERTClass(nn.Module):
    def __init__(self, num_classes=57, dropout_rate=0.1):
        super(ImprovedBERTClass, self).__init__()
        self.bert_model =  AutoModel.from_pretrained('/kaggle/input/tokenizer/kaggle/working/')
        self.dropout = nn.Dropout(dropout_rate)
        self.linear1 = nn.Linear(768, 512)
        self.linear2 = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert_model(input_ids, attention_mask, token_type_ids)
        output = output.pooler_output
        output = self.dropout(output)
        output = self.linear1(output)
        output = nn.functional.relu(output)
        output = self.dropout(output)
        output = self.linear2(output)
        return output

model = ImprovedBERTClass()
model.to(device)   

ImprovedBERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [11]:
state_dict = torch.load('/kaggle/input/weights/weights.pth')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [12]:
def testing(text):
    inputs = tokenizer.encode_plus(
         text,
         None,
         add_special_tokens=True,
         max_length=512,
         padding='max_length',
         return_token_type_ids=True,
         truncation=True,
         return_attention_mask=True,
         return_tensors='pt'
     )

    model.eval()
    with torch.no_grad():
        input_ids = inputs['input_ids'].to(device, dtype=torch.long)
        attention_mask = inputs['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = inputs['token_type_ids'].to(device, dtype=torch.long)
        output = model(input_ids, attention_mask, token_type_ids)
        final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
        
    return final_output

In [13]:
output = []
for i in range(len(train['Abstract'])):
    output.append(testing(train['Abstract'].loc[i]))

In [14]:
len(output)

10974

In [15]:
for i in range(len(output)):
    output[i][0] = [1 if j>0.5 else 0 for j in output[i][0]]

In [16]:
df = pd.DataFrame(columns = labels)
df['Id'] = train['Id']

In [17]:
for i in range(len(df)):
    if i%1000==0:
        print(i)
    df.loc[i,labels] = output[i][0]

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [18]:
df

Unnamed: 0,math.AT,stat.AP,cs.AR,math.QA,q-bio.MN,eess.AS,eess.IV,stat.ME,econ.GN,eess.SP,...,cs.RO,stat.TH,cs.DB,math.ST,q-bio.GN,q-fin.PM,q-bio.TO,math.GR,cs.IR,Id
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,30332
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,50337
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,66515
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,57464
4,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,43169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10969,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,41708
10970,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,38843
10971,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,57571
10972,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,31964


In [19]:
df.to_csv('/kaggle/working/Disang_boys.csv',index=False)