In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/software-classification/train.csv
/kaggle/input/software-classification/test.csv


In [2]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from transformers import (AutoModel,AutoModelForMaskedLM, AutoTokenizer, LineByLineTextDataset,
                         DataCollatorForLanguageModeling,Trainer, TrainingArguments,)

In [3]:
# Read the train and test splits of the software-classification dataset.
# NOTE(review): the rendered previews of train_data and test_data in this
# notebook look identical, and both frames end up with 510 rows after
# filtering -- confirm test.csv is not a copy of train.csv before trusting
# any "test" metric reported further down.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/software-classification/train.csv',
        '../input/software-classification/test.csv',
    )
)

In [4]:
train_data.subcategory.unique()

array(['EMEA_SAP BW/BO', 'EMEA_PL-Local Server', 'EMEA_eSpace',
       'EMEA_FR-GC-Mix', 'EMEA_SAP MAN'], dtype=object)

In [5]:
# Subcategories kept for this experiment. The unfiltered training data also
# contains 'EMEA_SAP BW/BO' and 'EMEA_FR-GC-Mix' (see the unique() output);
# rows with those labels are dropped by the isin(category) filter.
category = ['EMEA_PL-Local Server', 'EMEA_eSpace', 'EMEA_SAP MAN']

In [6]:
# Keep only the rows whose subcategory is in `category`, then renumber the
# rows 0..n-1 so that positional and label-based indexing agree downstream.
keep_train = train_data.subcategory.isin(category)
keep_test = test_data.subcategory.isin(category)
train_data = train_data[keep_train].reset_index(drop=True)
test_data = test_data[keep_test].reset_index(drop=True)

In [7]:
train_data

Unnamed: 0,text,subcategory
0,proxy,EMEA_PL-Local Server
1,mails sent to oneservice service now com are f...,EMEA_eSpace
2,damaged cable charging xperia xz,EMEA_PL-Local Server
3,this issue is related to authorizations hello ...,EMEA_eSpace
4,leave request section doesn t work,EMEA_eSpace
...,...,...
505,issue hardware software connection launch aure...,EMEA_PL-Local Server
506,user lcorosov can t access espace portal,EMEA_eSpace
507,user has critical error trying to access vacat...,EMEA_eSpace
508,issue aktywacja karty dla cok,EMEA_PL-Local Server


In [8]:
test_data

Unnamed: 0,text,subcategory
0,proxy,EMEA_PL-Local Server
1,mails sent to oneservice service now com are f...,EMEA_eSpace
2,damaged cable charging xperia xz,EMEA_PL-Local Server
3,this issue is related to authorizations hello ...,EMEA_eSpace
4,leave request section doesn t work,EMEA_eSpace
...,...,...
505,issue hardware software connection launch aure...,EMEA_PL-Local Server
506,user lcorosov can t access espace portal,EMEA_eSpace
507,user has critical error trying to access vacat...,EMEA_eSpace
508,issue aktywacja karty dla cok,EMEA_PL-Local Server


In [None]:
# Build the corpus for masked-language-model fine-tuning: one ticket text per
# line, train texts followed by test texts (labels are not used here).
text  = '\n'.join(train_data.text.tolist() + test_data.text.tolist())

# Write with an explicit encoding instead of the platform default: the
# tickets contain non-English text, and the file is read back as UTF-8 by
# LineByLineTextDataset, so the writer must not depend on the locale.
with open('text.txt', 'w', encoding='utf-8') as f:
    f.write(text)

In [None]:
# Checkpoint used as the starting point for masked-language-model
# fine-tuning; the tokenizer and the MLM model are loaded from the same name.
model_name = 'roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

In [None]:
# One training example per line of the corpus file, limited to `block_size`
# tokens.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./text.txt",  # corpus file written by the earlier cell
    block_size=128,
)

# Collator configured for masked-language modelling with a 15% masking
# probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# No evaluation dataset is passed to the Trainer below, so the options that
# only make sense with periodic evaluation (eval_steps, metric_for_best_model,
# greater_is_better, load_best_model_at_end) are intentionally not set: with
# nothing to evaluate there is no "best" checkpoint to reload, and newer
# transformers releases reject load_best_model_at_end when the evaluation and
# save strategies differ.
training_args = TrainingArguments(
    output_dir="./Output_DIR",  # later cells load the model from this path
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    prediction_loss_only=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Run the masked-language-model fine-tuning with the Trainer built in the
# previous cell (3 epochs over the ticket corpus).
trainer.train()

In [None]:
# Persist the fine-tuned MLM model; the classification cells later load
# their starting weights from this same './Output_DIR' path.
trainer.save_model('./Output_DIR')

In [9]:
# Roberta + SVM: https://www.kaggle.com/maunish/clrp-roberta-svm
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer,
                          get_linear_schedule_with_warmup,
                          get_cosine_schedule_with_warmup)

from colorama import Fore, Back, Style
# Short aliases for colorama foreground colours and the style-reset code.
y_, r_, g_ = Fore.YELLOW, Fore.RED, Fore.GREEN
b_, m_, c_ = Fore.BLUE, Fore.MAGENTA, Fore.CYAN
sr_ = Style.RESET_ALL

In [10]:
train_data

Unnamed: 0,text,subcategory
0,proxy,EMEA_PL-Local Server
1,mails sent to oneservice service now com are f...,EMEA_eSpace
2,damaged cable charging xperia xz,EMEA_PL-Local Server
3,this issue is related to authorizations hello ...,EMEA_eSpace
4,leave request section doesn t work,EMEA_eSpace
...,...,...
505,issue hardware software connection launch aure...,EMEA_PL-Local Server
506,user lcorosov can t access espace portal,EMEA_eSpace
507,user has critical error trying to access vacat...,EMEA_eSpace
508,issue aktywacja karty dla cok,EMEA_PL-Local Server


In [11]:
from sklearn import preprocessing
# Fit the label encoder on the training subcategories only; the resulting
# integer ids follow the sorted order shown by `le.classes_`.
le = preprocessing.LabelEncoder()
le.fit(train_data['subcategory'])

LabelEncoder()

In [12]:
le.classes_

array(['EMEA_PL-Local Server', 'EMEA_SAP MAN', 'EMEA_eSpace'],
      dtype=object)

In [13]:
# Map the subcategory strings of both splits to integer class ids using the
# encoder fitted on the training labels.
encoded_label_train, encoded_label_test = (
    le.transform(frame['subcategory']) for frame in (train_data, test_data)
)

In [14]:
# Store the encoded class ids next to the raw labels in each frame.
train_data.loc[:, 'target'] = encoded_label_train
test_data.loc[:, 'target'] = encoded_label_test

In [15]:
train_data

Unnamed: 0,text,subcategory,target
0,proxy,EMEA_PL-Local Server,0
1,mails sent to oneservice service now com are f...,EMEA_eSpace,2
2,damaged cable charging xperia xz,EMEA_PL-Local Server,0
3,this issue is related to authorizations hello ...,EMEA_eSpace,2
4,leave request section doesn t work,EMEA_eSpace,2
...,...,...,...
505,issue hardware software connection launch aure...,EMEA_PL-Local Server,0
506,user lcorosov can t access espace portal,EMEA_eSpace,2
507,user has critical error trying to access vacat...,EMEA_eSpace,2
508,issue aktywacja karty dla cok,EMEA_PL-Local Server,0


In [16]:
# Bin count from 1 + log2(n), floored. The targets are the class ids 0/1/2,
# so cutting them into equal-width bins simply places each class in its own
# bin (0, 4 and 8 in the frames displayed below); the bins are used only as
# the stratification key for the fold split.
num_bins = int(np.floor(np.log2(len(train_data)) + 1))
train_data.loc[:, 'bins'] = pd.cut(train_data.target, bins=num_bins, labels=False)

target = train_data['target'].to_numpy()
bins = train_data['bins'].to_numpy()


In [17]:
# Hyper-parameters and run settings shared by the cells below.
config = dict(
    lr=2e-5,
    wd=0.01,
    batch_size=16,
    valid_step=10,
    max_len=200,
    epochs=3,
    nfolds=5,
    seed=42,
)

# Create one output directory per fold (model0 ... model{nfolds-1}).
for fold_id in range(config['nfolds']):
    os.makedirs(f'model{fold_id}', exist_ok=True)

def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable is PYTHONHASHSEED (the original spelling 'PYTHONASSEED' set
    # an unrelated name). It only affects interpreters started after this
    # point, e.g. worker subprocesses.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which undermines the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=config['seed'])

# Assign every training row to one of `nfolds` validation folds, stratified
# on `bins` so each fold has a similar class mix. -1 marks "not yet assigned".
train_data['Fold'] = -1
fold_col = train_data.columns.get_loc('Fold')
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=bins)):
    # split() yields positional row indices, so write with .iloc; label-based
    # .loc would only be correct while the index happens to be 0..n-1.
    train_data.iloc[valid_idx, fold_col] = k


In [18]:
class CLRPDataset(Dataset):
    """Dataset pairing each text in `df['text']` with its id in `df['target']`.

    Args:
        df: Frame providing a 'text' column and a 'target' column.
        tokenizer: Callable invoked as
            `tokenizer(text, return_tensors='pt', max_length=..., padding='max_length', truncation=True)`.
        max_len: Value forwarded to the tokenizer as `max_length`.

    Each item is a tuple `(encode, target)` where `encode` is whatever the
    tokenizer returns and `target` is a scalar `torch.long` tensor.
    """

    def __init__(self,df,tokenizer,max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['text'].to_numpy()
        self.targets = df['target'].to_numpy()

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Tokenize the text at `idx` and return it with its label tensor."""
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        # NOTE(review): tensors requested per single text presumably carry a
        # leading dimension of 1, which would be why the training and
        # evaluation loops reshape each batch -- confirm.
        encode = self.tokenizer(self.excerpt[idx], **tokenizer_kwargs)
        label = self.targets[idx]
        return encode, torch.tensor(label, dtype=torch.long)

In [19]:
class AttentionHead(nn.Module):
    """Two-layer scoring head: Linear -> tanh -> Linear.

    Despite the name, this module does not pool over the sequence: it returns
    the raw scores produced by the second linear layer. The linear layers act
    on the last dimension, so every leading dimension of the input is kept.

    Args:
        in_features: Size of the last dimension of the input features.
        hidden_dim: Width of the intermediate tanh layer.
        num_classes: Number of scores produced per feature vector. Defaults
            to 3, the value that was previously hard-coded.
    """

    def __init__(self, in_features, hidden_dim, num_classes=3):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, num_classes)

    def forward(self, features):
        """Return scores of shape `features.shape[:-1] + (num_classes,)`."""
        att = torch.tanh(self.W(features))
        score = self.V(att)
        return score


In [20]:
class Model(nn.Module):
    """Pretrained backbone followed by an `AttentionHead` scoring head.

    Args:
        path: Name or directory passed to `AutoModel.from_pretrained`.

    The head is built for 1024 input features, so the backbone must emit
    1024-wide features (the width shown for roberta-large in this notebook).
    NOTE(review): the head is applied to the first backbone output as-is,
    presumably one score vector per token rather than per sequence -- confirm
    before training with a per-sequence loss. This class is not instantiated
    in the visible cells.
    """

    def __init__(self,path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(path)
        self.head = AttentionHead(1024,1024)

    def forward(self,**xb):
        """Run the backbone on the keyword inputs and score its first output."""
        backbone_outputs = self.roberta(**xb)
        features = backbone_outputs[0]
        return self.head(features)

In [21]:
# Fold 0 is held out for validation; all other folds form the training split.
x_valid = train_data.query("Fold == 0")
x_train = train_data.query("Fold != 0")
print(x_train.shape)
print(x_valid.shape)

MODEL_PATH = './Output_DIR'  # NOTE(review): not referenced by any visible cell
tokenizer = AutoTokenizer.from_pretrained('roberta-large')

# Options shared by both loaders; only `shuffle` differs between them.
_loader_options = dict(
    batch_size=config["batch_size"],
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)

train_ds = CLRPDataset(x_train, tokenizer, config['max_len'])
train_dl = DataLoader(train_ds, shuffle=True, **_loader_options)

valid_ds = CLRPDataset(x_valid, tokenizer, config['max_len'])
valid_dl = DataLoader(valid_ds, shuffle=False, **_loader_options)


(408, 5)
(102, 5)


In [22]:
x_train

Unnamed: 0,text,subcategory,target,bins,Fold
0,proxy,EMEA_PL-Local Server,0,0,4
1,mails sent to oneservice service now com are f...,EMEA_eSpace,2,8,4
2,damaged cable charging xperia xz,EMEA_PL-Local Server,0,0,1
3,this issue is related to authorizations hello ...,EMEA_eSpace,2,8,3
4,leave request section doesn t work,EMEA_eSpace,2,8,2
...,...,...,...,...,...
504,can t logon to sap after migration to windows ...,EMEA_SAP MAN,1,4,1
505,issue hardware software connection launch aure...,EMEA_PL-Local Server,0,0,2
507,user has critical error trying to access vacat...,EMEA_eSpace,2,8,3
508,issue aktywacja karty dla cok,EMEA_PL-Local Server,0,0,2


In [23]:
x_valid

Unnamed: 0,text,subcategory,target,bins,Fold
5,ess leave requst incorrect supervisor,EMEA_eSpace,2,8,0
7,work oder list has no work oders at all,EMEA_SAP MAN,1,4,0
24,request separate bazalt dolomit rooms,EMEA_PL-Local Server,0,0,0
25,user unable to access their ess,EMEA_eSpace,2,8,0
36,in approval,EMEA_SAP MAN,1,4,0
...,...,...,...,...,...
491,management decided to transfer maintenance of ...,EMEA_SAP MAN,1,4,0
493,capacity displayed in shift updater is differe...,EMEA_SAP MAN,1,4,0
495,add permissions directory,EMEA_PL-Local Server,0,0,0
503,problem mail operation phone,EMEA_PL-Local Server,0,0,0


In [24]:
# Wrap the test frame in the same dataset/loader pipeline as the validation
# split: fixed order, nothing dropped.
print(test_data.shape)
test_ds = CLRPDataset(test_data, tokenizer, config['max_len'])
_test_loader_options = dict(
    batch_size=config["batch_size"],
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    drop_last=False,
)
test_dl = DataLoader(test_ds, **_test_loader_options)

(510, 3)


In [25]:
train_data.shape

(510, 5)

In [26]:
import torch
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler

# Start the 3-class classifier from the MLM-fine-tuned weights in
# ./Output_DIR; the load log below reports the classification head as newly
# initialized.
model = AutoModelForSequenceClassification.from_pretrained("./Output_DIR", num_labels=3)

# NOTE(review): the learning rate and epoch count are set here directly and
# do not read config['lr'] / config['epochs'] -- confirm which is intended.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Linear decay of the learning rate over every optimizer step, no warm-up.
num_epochs = 3
num_training_steps = len(train_dl) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of the model checkpoint at ./Output_DIR were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./Output_DIR and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_pr

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
         

In [27]:
# !pip install -q datasets

In [28]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# The loss module is stateless, so build it once instead of once per batch.
criterion = nn.CrossEntropyLoss()

model.train()
for epoch in range(num_epochs):
    for batch, labels in train_dl:
        # Collapse everything after the batch dimension of each tokenizer
        # tensor and move it to the training device.
        batch = {
            name: tensor.reshape(tensor.shape[0], -1).to(device)
            for name, tensor in batch.items()
        }
        outputs = model(**batch)

        # outputs[0] holds the logits because no labels are passed to the model.
        loss = criterion(outputs[0].to(device), labels.to(device))
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/78 [00:00<?, ?it/s]

In [29]:
from datasets import load_metric       

# Accuracy of the fine-tuned classifier on the held-out validation fold.
metric = load_metric("accuracy")
model.eval()
with torch.no_grad():
    for batch, labels in valid_dl:
        # Same flattening as in training: keep the batch dimension, collapse
        # the rest, and move the tensors to the model's device.
        batch = {
            name: tensor.reshape(tensor.shape[0], -1).to(device)
            for name, tensor in batch.items()
        }
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=labels)

metric.compute()

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

{'accuracy': 0.9411764705882353}

In [30]:
predictions

tensor([1, 1, 2, 0, 0, 2], device='cuda:0')

In [32]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-4.9551,  4.7453, -0.2262],
        [-2.6653,  5.0066, -2.3833],
        [-4.6228,  1.3558,  2.8627],
        [ 4.8663, -1.8834, -2.4694],
        [ 4.9933, -2.5770, -1.8472],
        [-2.6894, -2.9468,  5.0155]], device='cuda:0'), hidden_states=None, attentions=None)

In [33]:
logits

tensor([[-4.9551,  4.7453, -0.2262],
        [-2.6653,  5.0066, -2.3833],
        [-4.6228,  1.3558,  2.8627],
        [ 4.8663, -1.8834, -2.4694],
        [ 4.9933, -2.5770, -1.8472],
        [-2.6894, -2.9468,  5.0155]], device='cuda:0')

In [34]:
from datasets import load_metric       

# Accuracy of the fine-tuned classifier on the test loader.
metric = load_metric("accuracy")
model.eval()
with torch.no_grad():
    for batch, labels in test_dl:
        # Keep the batch dimension, collapse the rest, and move the tensors
        # to the model's device.
        batch = {
            name: tensor.reshape(tensor.shape[0], -1).to(device)
            for name, tensor in batch.items()
        }
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=labels)

metric.compute()

{'accuracy': 0.984313725490196}

In [35]:
model?

[0;31mSignature:[0m      [0mmodel[0m[0;34m([0m[0;34m*[0m[0minput[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           RobertaForSequenceClassification
[0;31mString form:[0m   
RobertaForSequenceClassification(
           (roberta): RobertaModel(
           (embeddings): RobertaEmbeddings( <...> t(p=0.1, inplace=False)
           (out_proj): Linear(in_features=1024, out_features=3, bias=True)
           )
           )
[0;31mFile:[0m           /opt/conda/lib/python3.7/site-packages/transformers/models/roberta/modeling_roberta.py
[0;31mDocstring:[0m     
RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.


This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddi

In [38]:
type(model)

transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification

In [37]:
# Save only the model's parameters (its state_dict) to a .pt file inside
# ./Output_DIR; loading it back requires first rebuilding a model with the
# same architecture.
# NOTE(review): this cell's execution count (37) is lower than that of the
# cell before it (38), so the notebook was last run out of order.
torch.save(model.state_dict(), "./Output_DIR/saved_txt_model.pt")

In [42]:
# Rebuild the classifier architecture from ./Output_DIR (its classification
# head is freshly initialized, as the load log reports), then overwrite every
# parameter with the fine-tuned weights saved above.
loaded_model = AutoModelForSequenceClassification.from_pretrained("./Output_DIR", num_labels=3)
# map_location="cpu" lets a checkpoint saved from a CUDA model load on a
# CPU-only runtime too; the model is moved to `device` in a later cell.
# Call loaded_model.eval() before running inference.
loaded_model.load_state_dict(torch.load("./Output_DIR/saved_txt_model.pt", map_location="cpu"))

Some weights of the model checkpoint at ./Output_DIR were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./Output_DIR and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_pr

<All keys matched successfully>

In [43]:
loaded_model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
         

In [46]:
loaded_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
         

In [47]:
from datasets import load_metric       

# Accuracy of the reloaded classifier on the test loader; it should match the
# figure obtained from the in-memory model.
metric = load_metric("accuracy")
loaded_model.eval()
with torch.no_grad():
    for batch, labels in test_dl:
        # Keep the batch dimension, collapse the rest, and move the tensors
        # to the model's device.
        batch = {
            name: tensor.reshape(tensor.shape[0], -1).to(device)
            for name, tensor in batch.items()
        }
        outputs = loaded_model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=labels)

metric.compute()

{'accuracy': 0.984313725490196}