In [None]:
import os
import gc
import sys
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel,AutoConfig,
                          AutoTokenizer,get_cosine_schedule_with_warmup)

In [None]:
ROBERTA_PATH = 'roberta-base'


class LitModel(nn.Module):
    """Encoder wrapper that returns concatenated mean- and max-pooled hidden states.

    The encoder is loaded from ``ROBERTA_PATH`` with hidden-state output
    enabled, hidden dropout set to 0.0 and ``layer_norm_eps`` set to 1e-7.
    """

    def __init__(self):
        super().__init__()

        encoder_config = AutoConfig.from_pretrained(ROBERTA_PATH)
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        }
        encoder_config.update(overrides)
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=encoder_config)

        # This head is never called in forward().
        # NOTE(review): presumably kept so the module's parameter names match
        # the checkpoint that is loaded into it -- confirm before removing.
        self.regressor = nn.Sequential(nn.Linear(768 * 2, 2530))

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and pool the last hidden-state layer over dim 1.

        Args:
            input_ids: Passed to the encoder as ``input_ids``.
            attention_mask: Passed to the encoder as ``attention_mask``; it is
                not applied again during pooling.

        Returns:
            The mean over dim 1 and the max over dim 1 of the last entry of
            ``hidden_states``, concatenated along dim 1.
        """
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        final_hidden = encoder_out.hidden_states[-1]

        pooled_mean = final_hidden.mean(dim=1)
        pooled_max = final_hidden.max(dim=1).values
        return torch.cat((pooled_mean, pooled_max), dim=1)

In [None]:
# Instantiate the network; LitModel.__init__ loads the encoder from ROBERTA_PATH.
model = LitModel()

In [None]:
# Deserialize the checkpoint onto the CPU so it loads on machines without CUDA
# and does not occupy GPU memory; the model is moved to its target device later.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
state_dict = torch.load('../input/amzonchaleenge/title_weights/modellast.pth',
                        map_location='cpu')

In [None]:
# Copy the weights stored under the checkpoint's 'model' key into the network.
model.load_state_dict(state_dict['model'])

In [None]:
# Load the tokenizer from ROBERTA_PATH (the same identifier the encoder uses)
# so the two cannot silently drift apart if the checkpoint name is changed.
TOKENIZER = tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)

In [None]:
def preprocess1(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed.
    """
    # str.split() with no argument already discards leading/trailing whitespace,
    # so the re-joined string needs no further stripping.
    return ' '.join(text.split())

In [None]:
class CLRPDataset(Dataset):
    """Dataset that tokenizes the ``TITLE`` column of a DataFrame on access.

    Args:
        df: DataFrame with a ``TITLE`` column.
        tokenizer: Callable tokenizer applied to each cleaned title.
        max_len: Value passed to the tokenizer as ``max_length``; titles are
            padded with ``padding='max_length'`` and truncated.
    """

    def __init__(self,df,tokenizer = TOKENIZER,max_len=150):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['TITLE'].to_numpy()

    def __len__(self):
        """Return the number of titles."""
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Return the tokenizer output for the whitespace-normalized title at ``idx``."""
        cleaned_title = preprocess1(self.excerpt[idx])
        return self.tokenizer(
            cleaned_title,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_attention_mask=True,
            return_tensors='pt',
        )

In [None]:
import csv

In [None]:
# Read only the TITLE column (quoting disabled, backslash as escape character)
# and discard rows whose title is missing.
testdf = pd.read_csv(
    '../input/amzonchaleenge/raw_data/train.csv',
    usecols=['TITLE'],
    escapechar='\\',
    quoting=csv.QUOTE_NONE,
).dropna()

In [None]:
# Bounds of the row slice of testdf processed by this run (used as testdf[start:end]).
start, end = 2_600_000, 2_700_000

In [None]:
# Wrap the rows testdf[start:end] in a dataset, using CLRPDataset's default tokenizer and max_len.
testdataset = CLRPDataset(testdf[start:end])

In [None]:
# Unshuffled loader that keeps the final partial batch, so outputs stay in the
# same order as the rows of testdataset.
valid_dl = DataLoader(
    testdataset,
    batch_size=16,
    shuffle=False,
    drop_last=False,
    num_workers=4,
    pin_memory=True,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def valid_loop(valid_loader,model,device = DEVICE):
    """Run ``model`` over every batch of ``valid_loader`` and stack the outputs.

    Args:
        valid_loader: Sized iterable of batches. Each batch must provide
            ``.items()`` yielding ``(name, tensor)`` pairs, which are passed to
            ``model`` as keyword arguments.
        model: Module to evaluate; it is moved to ``device`` and switched to
            eval mode.
        device: Device the model and every batch tensor are moved to.

    Returns:
        A NumPy array with the per-batch outputs stacked along axis 0, in
        loader order, or ``None`` when the loader yields no batches.
    """
    model.to(device)
    model.eval()

    # Collect per-batch arrays and stack once at the end; stacking onto a
    # growing array inside the loop re-copies all previous results every batch.
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(valid_loader, total=len(valid_loader)):
            # Flatten every dimension after the first so each tensor is 2-D
            # before it is handed to the model.
            batch = {key: val.reshape(val.shape[0], -1).to(device)
                     for key, val in batch.items()}
            batch_outputs.append(model(**batch).detach().cpu().numpy())

    if not batch_outputs:
        return None
    return np.vstack(batch_outputs)

In [None]:
# Run the model over valid_dl on the default device and collect the stacked outputs.
predictions = valid_loop(valid_dl,model)

In [None]:
# Write the outputs to an .npy file whose name records the processed row range.
np.save(f'predictions_{start}_{end}.npy',predictions)