In [None]:
import pandas as pd
import numpy as np
import csv
import os
import random
import torch
import torch
torch.backends.cudnn.benchmark = True
torch.autograd.set_detect_anomaly(False)
torch.autograd.profiler.profile(False)
torch.autograd.profiler.emit_nvtx(False)
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
import sklearn.model_selection as model_selection
from transformers import BertForSequenceClassification, BertTokenizerFast
import time
import copy
from tqdm.notebook import tqdm

In [None]:
def set_seed(seed=42):
    """Seed every random-number source this notebook uses.

    Writes ``PYTHONHASHSEED`` into the process environment and then seeds
    Python's ``random`` module, NumPy's global generator and PyTorch, in
    that order.

    Args:
        seed: Integer seed applied to all of the above (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

In [None]:
# Load the training data. quoting=csv.QUOTE_NONE makes the parser treat quote
# characters as ordinary text, and a backslash is used as the escape character.
df=pd.read_csv("../input/amazon-ml-challenge-2021-hackerearth/train.csv", escapechar = "\\", quoting = csv.QUOTE_NONE)

In [None]:
num_labels=df["BROWSE_NODE_ID"].nunique()

In [None]:
# NOTE: the variable names are the reverse of what the dictionaries hold.
#   id2lbl: raw BROWSE_NODE_ID value -> contiguous index (order of first appearance)
#   lbl2id: contiguous index         -> raw BROWSE_NODE_ID value
id2lbl={lbl: idx for idx,lbl in enumerate(list(df["BROWSE_NODE_ID"].unique()))}
lbl2id={lbl:idx for idx,lbl in id2lbl.items()}

In [None]:
set_seed()

In [None]:
# Replace every raw BROWSE_NODE_ID with its contiguous class index (id2lbl maps
# raw value -> index). This overwrites the column in place, so running the cell
# a second time would map the already-converted values again.
df["BROWSE_NODE_ID"]=df["BROWSE_NODE_ID"].map(id2lbl)

In [None]:
def create_folds(data, num_splits, label_col="BROWSE_NODE_ID"):
    """Shuffle ``data`` and tag every row with a stratified fold index.

    Args:
        data: DataFrame that contains the label column ``label_col``.
        num_splits: Number of folds handed to ``StratifiedKFold``.
        label_col: Name of the column to stratify on.

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an integer
        ``kfold`` column holding the fold in which each row is a validation row.
    """
    # sample() returns a new frame, so the caller's DataFrame is not modified.
    data = data.sample(frac=1).reset_index(drop=True)
    data["kfold"] = -1
    # The labels must come from the *shuffled* frame. Reading them from the
    # notebook-level ``df`` would pair every row with another row's label and
    # the folds would no longer be stratified.
    y = data[label_col].values
    kf = model_selection.StratifiedKFold(n_splits=num_splits)
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=y)):
        data.loc[valid_idx, 'kfold'] = fold
    return data

In [None]:
df=create_folds(df, 5)

In [None]:
# Keep only the rows assigned to folds 0 and 2 and renumber the index.
# NOTE(review): presumably done to shrink the training set - confirm.
df=df.loc[df.kfold.isin([0,2])]
df=df.reset_index(drop=True)
df.head()

In [None]:
df.loc[df['BROWSE_NODE_ID']==1045].head()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
temp=df.dropna(subset=['TITLE'])
temp=temp.reset_index(drop=True)

In [None]:
temp.isnull().sum()

In [None]:
# Total number of whitespace-separated words over all titles.
total_words = sum(len(title.split()) for title in temp['TITLE'])

In [None]:
total_words/len(temp)

In [None]:
# Average number of whitespace-separated words per non-missing description.
temp = df.dropna(subset=['DESCRIPTION'])
total_words = sum(len(text.split()) for text in temp['DESCRIPTION'])
total_words/len(temp)

In [None]:
# Start again from df: drop rows without a TITLE, then replace every remaining
# missing value (in any column) with a single space so that the string
# concatenation further down does not hit NaN.
temp=df.dropna(subset=['TITLE'])
temp=temp.fillna(" ")

In [None]:
temp.head()

In [None]:
# Preview the bullet-point clean-up on the first row: commas become spaces and
# the first and last characters of the result are dropped.
# .iloc[0] is used because temp keeps df's index labels after dropna, so a row
# labelled 0 is not guaranteed to exist.
" ".join(temp["BULLET_POINTS"].iloc[0].split(","))[:-1][1:]

In [None]:
# Build one text field per row from the first three columns of temp: the first
# two are concatenated as-is, and the third has its commas replaced by spaces
# and its first and last characters dropped.
# NOTE(review): the first three columns are presumably TITLE, DESCRIPTION and
# BULLET_POINTS - confirm against the CSV header.
# Columns are accessed positionally through .iloc (integer keys on a
# string-labelled row are deprecated in pandas), and zipping the columns avoids
# the per-row Series that iterrows() builds.
wholeSentence = []
for pos, (first, second, bullets) in enumerate(
        zip(temp.iloc[:, 0], temp.iloc[:, 1], temp.iloc[:, 2])):
    if pos % 100000 == 0:
        print(f"{pos} Done")
    wholeSentence.append(first + second + " ".join(bullets.split(","))[:-1][1:])

In [None]:
temp["WHOLE SENTENCE"]=wholeSentence

In [None]:
temp=temp.reset_index(drop=True)

In [None]:
temp.head()

# BERT

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# later .to(dev) calls do not fail on machines without CUDA.
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
temp["BROWSE_NODE_ID"].value_counts()

In [None]:
# Hold out 5% of the rows for validation. No random_state or stratify argument
# is passed, so the split is a plain random split driven by NumPy's global
# random state.
train_text, val_text, train_labels, val_labels = train_test_split(temp['WHOLE SENTENCE'], temp['BROWSE_NODE_ID'],
                                                                    test_size=0.05)

In [None]:
# Give all four splits a fresh 0..n-1 index: amazonDataset looks items up with
# integer keys (self.sentence[idx] / self.label[idx]).
train_text, train_labels, val_text, val_labels = [
    split.reset_index(drop=True)
    for split in (train_text, train_labels, val_text, val_labels)
]

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
seq_len = [len(i.split()) for i in train_text]

pd.Series(seq_len).hist(bins = 30)

In [None]:
max_seq_len = 64

In [None]:
class amazonDataset(Dataset):
  """Dataset that tokenises one text per item and pairs it with its label.

  Args:
    text: Indexable collection of strings; must support integer keys 0..len-1.
    label: Indexable collection of integer class indices, aligned with ``text``.
    tokenizer: Object exposing ``encode_plus`` whose result has ``input_ids``,
      ``attention_mask`` and ``token_type_ids`` attributes.
    max_length: Length every sequence is padded/truncated to. When ``None``
      (default) the notebook-level ``max_seq_len`` is used.
  """

  def __init__(self, text, label, tokenizer, max_length=None):
    self.sentence = text
    self.label = label
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.sentence)

  def __getitem__(self, idx):
    # Resolved lazily so that, without an explicit value, the max_seq_len in
    # effect when the item is fetched is the one that applies.
    max_length = max_seq_len if self.max_length is None else self.max_length
    encoded = self.tokenizer.encode_plus(self.sentence[idx],
                                         padding="max_length",
                                         add_special_tokens=True,
                                         max_length=max_length,
                                         truncation=True)

    return {
        "input_ids": torch.tensor(encoded.input_ids, dtype=torch.long),
        "input_attention_mask": torch.tensor(encoded.attention_mask, dtype=torch.long),
        "input_type_ids": torch.tensor(encoded.token_type_ids, dtype=torch.long),
        # Class indices are integers; emitting them as long avoids a
        # round-trip through float before the loss is computed.
        "labels": torch.tensor(self.label[idx], dtype=torch.long)
    }

In [None]:
train_dataset = amazonDataset(train_text, train_labels, tokenizer)
val_dataset = amazonDataset(val_text, val_labels, tokenizer)

In [None]:
# Settings shared by both loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=128, num_workers=2, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
dataloaders={'train':train_dataloader, 'eval':val_dataloader }
dataset_sizes={'train':len(train_dataset), 'eval':len(val_dataset)}

In [None]:
# class BERTBaseUncased(nn.Module):
#     def __init__(self):
#         super(BERTBaseUncased, self).__init__()
#         self.bert=AutoModel.from_pretrained('bert-base-uncased')
#         self.dropout = nn.Dropout(0.1)
#         self.relu =  nn.ReLU()
#         self.fc1 = nn.Linear(768,9919)
        
#     def forward(self,ids,mask,token_type_ids):
#         a, o2 = self.bert(
#             ids,
#             attention_mask=mask,
#             token_type_ids=token_type_ids)
#         bo=self.dropout(o2)
#         output=self.fc1(bo)
#         return output

In [None]:
# Fully pickled model from an earlier run (same file name that the final
# torch.save cell of this notebook writes).
CHECKPOINT_PATH = "../input/amazon-ml-models/BERTBaseBaselineNoset.pth"

if os.path.exists(CHECKPOINT_PATH):
    # Resume from the checkpoint. map_location places the tensors on the
    # device in use even if the file was saved from a different one.
    # NOTE(review): this unpickles an entire module - only load trusted files.
    model = torch.load(CHECKPOINT_PATH, map_location=dev)
else:
    # No checkpoint yet: start from the pretrained encoder. Building this model
    # only when needed avoids allocating one that is thrown away immediately.
    # NOTE(review): 9919 is presumably the number of distinct BROWSE_NODE_ID
    # values (num_labels) - confirm.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=9919)
print(model)
model.to(dev)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
# With step_size=7 the learning rate only decays after 7 scheduler steps, so it
# stays constant for shorter runs.
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
def train_fn(model,loss_fn,optimizer,scheduler,num_epochs=1):
    """Fine-tune ``model`` and return it with its best validation weights loaded.

    Each epoch runs a 'train' pass followed by an 'eval' pass over the
    notebook-level ``dataloaders`` dict. Tensors are moved to the notebook-level
    ``dev`` device and per-sample averages use ``dataset_sizes``.

    Args:
        model: Model that returns an object with ``loss`` and ``logits`` when
            called with ``input_ids``, ``attention_mask``, ``token_type_ids``
            and ``labels``.
        loss_fn: Unused; the loss comes from ``outputs.loss``. Kept so existing
            calls keep working.
        optimizer: Optimizer stepped after every training batch.
        scheduler: LR scheduler stepped once after every training pass.
        num_epochs: Number of train+eval epochs.

    Returns:
        ``model`` with the weights of the epoch that had the lowest eval loss.
    """
    since = time.time()
    best_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')
    # Defined up front so the summary below cannot hit an unbound name when no
    # epoch improves on best_loss (e.g. NaN loss or num_epochs=0).
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch:{epoch}/{num_epochs}')
        print('-'*10)

        for mode in ['train','eval']:
            is_train = mode == 'train'
            # train(True) enables training mode; train(False) equals eval().
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0

            for data in tqdm(dataloaders[mode]):
                input_ids = data["input_ids"].to(dev, dtype=torch.long)
                labels = data['labels'].to(dev, dtype=torch.long)
                mask = data["input_attention_mask"].to(dev, dtype=torch.long)
                token_type_ids = data['input_type_ids'].to(dev, dtype=torch.long)

                optimizer.zero_grad()
                with torch.set_grad_enabled(is_train):
                    outputs = model(
                        input_ids=input_ids,
                        attention_mask=mask,
                        token_type_ids=token_type_ids,
                        labels=labels
                    )
                    loss, logits = outputs.loss, outputs.logits
                    _, preds = torch.max(logits, 1)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean, so weight it by the batch size
                # before summing; dividing by the dataset size afterwards then
                # gives a true per-sample average.
                running_loss += loss.item() * labels.size(0)
                running_corrects += torch.sum(preds == labels).item()

            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[mode]
            epoch_accuracy = running_corrects / dataset_sizes[mode]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                mode, epoch_loss, epoch_accuracy))

            if mode == 'eval' and epoch_loss < best_loss:
                best_wts = copy.deepcopy(model.state_dict())
                best_acc = epoch_accuracy
                best_loss = epoch_loss

            print()

    # Summary and weight restore happen once, after the last epoch. Doing this
    # inside the epoch loop would roll the model back to the best weights after
    # every epoch while the optimizer state kept moving forward.
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val loss: {:4f}'.format(best_loss))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_wts)
    return model

In [None]:
model = train_fn(model, 
               criterion, 
               optimizer, 
               exp_lr_scheduler,
               num_epochs=5)

In [None]:
# Pickles the entire model object (not only its state_dict). The model-loading
# cell of this notebook reads a file with this name back via torch.load.
torch.save(model,"BERTBaseBaselineNoset.pth")