In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

In [2]:
# Read the two CSV inputs: the unlabelled rows to score and the labelled
# rows (the latter feed the vocabulary built further down).
data_test = pd.read_csv(filepath_or_buffer='/kaggle/input/lstm-50-inference/test_data.csv')
data = pd.read_csv(filepath_or_buffer='/kaggle/input/lstm-50-inference/old_training.csv')

In [3]:
# Keep only the rows whose label reads as '0' or '1', then store the label as int.
# - Comparing on the string form keeps this cell re-runnable: once the column
#   has been converted to int, a plain == '0' comparison would drop every row.
# - .copy() detaches the filtered frame from the original so the column
#   assignment below does not raise SettingWithCopyWarning.
data = data[data['IS_SARCASTIC'].astype(str).isin(['0', '1'])].copy()
data['IS_SARCASTIC'] = data['IS_SARCASTIC'].astype(int)

In [4]:
# Narrow the test frame to the two columns used below and discard any row
# where either of them is missing.
data_test = data_test.loc[:, ['ID', 'text']].dropna(axis=0, how='any')

In [5]:
# reset_index returns a new frame, so the result has to be assigned back;
# otherwise data_test keeps the gaps left by dropna() and later lookups by
# position 0..len-1 (as a DataLoader produces) can miss or mismatch rows.
data_test = data_test.reset_index(drop=True)
data_test

Unnamed: 0,ID,text
0,27927,states slow to shut down weak teacher educatio...
1,1660,drone places fresh kill on steps of white house
2,96,report: majority of instances of people gettin...
3,6237,"sole remaining lung filled with rich, satisfyi..."
4,6650,the gop's stockholm syndrome
...,...,...
5719,10004,donald trump jr.'s thanksgiving conversation s...
5720,9698,hellmann's heir's conduct unbefitting a mayonn...
5721,5001,maryland teen allegedly sexually abused child ...
5722,27749,ryan coogler would love to see a women of waka...


In [6]:
# Narrow the training frame to text + label and discard incomplete rows.
# The text column key includes a trailing space ('TEXT ').
data = data.loc[:, ['TEXT ', 'IS_SARCASTIC']].dropna(axis=0, how='any')

In [7]:
# reset_index returns a new frame; assign it back so the index is contiguous
# again after the label filter and dropna() above removed rows.
data = data.reset_index(drop=True)
data

Unnamed: 0,TEXT,IS_SARCASTIC
0,it's another ho-ho-horowitz christmas!,0
1,quiz: where should you live abroad?,0
2,praise the lord or praise the person?,0
3,microwave-resistant potato alarms scientists,1
4,rude guy unfortunately says something funny,1
...,...,...
25677,u.s. changes motto to 'america... we're gonna ...,1
25678,why this healing expert doesn't believe in 'cl...,0
25679,man just going to assume apartment has functio...,1
25680,5 signs that you should end your relationship,0


In [8]:
# Prepare dataset
class SarcasmDataset(Dataset):
    """Map-style dataset that turns raw text into fixed-length token-index tensors.

    The item layout depends on whether labels were supplied:

    * ``labels`` given    -> ``(tokens, label)``: ``tokens`` is a long tensor,
      ``label`` a float tensor.
    * ``labels is None``  -> ``(id, tokens)``: ``id`` is a long tensor,
      ``tokens`` an int tensor.

    Args:
        ids: Sequence of integer identifiers; only read when ``labels`` is None.
        texts: Sequence of raw strings.
        labels: Sequence of numeric labels, or None for unlabelled data.
        tokenizer: Callable mapping a string to a list of string tokens.
        vocab: Mapping from token to integer index; must contain ``'<pad>'``.
        max_len: Length of every returned token sequence. Shorter inputs are
            right-padded with ``'<pad>'``; longer inputs are cut to ``max_len``.
    """

    def __init__(self, ids, texts, labels, tokenizer, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    @staticmethod
    def _item_at(seq, idx):
        """Return the element at *position* ``idx`` of a pandas object or plain sequence."""
        # Samplers hand out positions 0..len-1. Indexing a pandas Series with []
        # looks up by index *label*, which fails (or returns the wrong row) once
        # rows were filtered/dropped without resetting the index, so use .iloc
        # whenever the container offers it.
        return seq.iloc[idx] if hasattr(seq, 'iloc') else seq[idx]

    def __getitem__(self, idx):
        """Tokenise, numericalise and pad/truncate the text at position ``idx``."""
        text = self._item_at(self.texts, idx)
        tokens = [self.vocab[token] for token in self.tokenizer(text)]

        # Enforce a fixed length: truncate first so every item has exactly
        # max_len entries (required for batching), then right-pad.
        tokens = tokens[:self.max_len]
        tokens += [self.vocab['<pad>']] * (self.max_len - len(tokens))

        if self.labels is not None:
            label = self._item_at(self.labels, idx)
            return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.float)

        ids = self._item_at(self.ids, idx)
        return torch.tensor(ids, dtype=torch.long), torch.tensor(tokens, dtype=torch.int)

In [9]:
# Tokenizer + vocabulary
tokenizer = get_tokenizer('basic_english')


def yield_tokens(data_iter):
    """Lazily yield the token list produced by ``tokenizer`` for each text."""
    yield from map(tokenizer, data_iter)


# The vocabulary is built from the training text column, with two special
# tokens registered alongside the corpus tokens.
vocab = build_vocab_from_iterator(yield_tokens(data['TEXT ']), specials=["<unk>", "<pad>"])
# Tokens that are not in the vocabulary resolve to the <unk> index.
vocab.set_default_index(vocab["<unk>"])

In [10]:
# Sequence length handed to SarcasmDataset; shorter token lists are padded up to it.
max_len = 100

In [11]:
# Wrap the test columns in a dataset without labels: passing labels=None makes
# each item an (ID, token indices) pair instead of (token indices, label).
test_ids, test_texts = data_test['ID'], data_test['text']
test_dataset = SarcasmDataset(
    ids=test_ids,
    texts=test_texts,
    labels=None,
    tokenizer=tokenizer,
    vocab=vocab,
    max_len=max_len,
)

In [12]:
# Sanity check: show the first item, an (ID tensor, padded token-index tensor) pair.
test_dataset[0]

(tensor(27927),
 tensor([ 436, 3157,    3, 1234,   87, 2784,  353,  521, 2907,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1], dtype=torch.int32))

In [13]:
# Iterate the test set in its stored order, one example per batch.
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1)

In [14]:
# Model
class BiLSTM(nn.Module):
    """Embedding -> (bi)directional LSTM -> linear -> sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of sigmoid outputs per example.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence right-to-left.
        dropout: Dropout probability between stacked LSTM layers.

    ``forward`` takes a ``(batch, seq_len)`` tensor of token indices
    (the LSTM is ``batch_first``) and returns ``(batch, output_dim)`` values
    in ``[0, 1]``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers, bidirectional, dropout):
        super().__init__()
        # Plain attribute (not a parameter/buffer), so state_dict keys are unaffected.
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            # Inter-layer dropout is only applied between stacked layers;
            # passing it with a single layer just triggers a warning.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.act = nn.Sigmoid()

    def forward(self, text):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        if self.bidirectional:
            # Last layer's forward (-2) and backward (-1) final states, side by side.
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            # Single direction: only the last layer's final state matches fc's
            # input width (hidden_dim).
            hidden = hidden[-1]
        return self.act(self.fc(hidden))

In [15]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [16]:
# Instantiate the network and move it to the selected device. The
# hyper-parameters are spelled out by name so each value is self-describing.
model = BiLSTM(
    vocab_size=len(vocab),
    embed_dim=100,
    hidden_dim=256,
    output_dim=1,
    num_layers=2,
    bidirectional=True,
    dropout=0.5,
).to(device)

In [17]:
# Restore the saved weights into the model.
# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint written on a GPU machine also loads on a CPU-only run.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
model.load_state_dict(
    torch.load('/kaggle/input/lstm-50-inference/lstm_model_50epochs.pth', map_location=device)
)

<All keys matched successfully>

In [18]:
# Score every test example and collect one 0/1 label per test ID.
# A prediction counts as positive when the model's sigmoid output is strictly
# above THRESHOLD; the same rule feeds both result_dict and fin_outputs.
THRESHOLD = 0.5

result_dict = {'ID': [], 'label': []}

model.eval()  # inference mode: disables dropout
with torch.no_grad():
    for ids, texts in test_loader:
        texts = texts.to(device)
        # squeeze(1) drops the size-1 output dimension -> one probability per
        # example. Working on whole batches (instead of .item()) keeps this
        # loop valid for any DataLoader batch size, including 1.
        probabilities = model(texts).squeeze(1).cpu().tolist()
        result_dict['ID'].extend(ids.tolist())
        result_dict['label'].extend(int(p > THRESHOLD) for p in probabilities)

# Integer array view of the predicted labels, in test-set order.
fin_outputs = np.array(result_dict['label'], dtype=int)

In [19]:
# Number of IDs collected by the inference loop.
len(result_dict['ID'])

5724

In [20]:
# Number of labels collected by the inference loop (one is appended per ID).
len(result_dict['label'])

5724

In [22]:
# Name of the output file, and the ID/label table that will be written to it.
csv_file = "submission.csv"
df = pd.DataFrame(data=result_dict)

In [23]:
# Write the table without the DataFrame index column.
df.to_csv(path_or_buf=csv_file, index=False)