In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader,random_split,Dataset

In [2]:
df = pd.read_csv('train.csv');
df.head()

Unnamed: 0,id,review,sentiment
0,1861,A masterpiece! Highly recommend it.,1
1,354,The characters were flat and uninteresting.,0
2,1334,Awful visuals and horrible sound quality.,0
3,906,"What a fantastic experience, I enjoyed every m...",1
4,1290,I couldn't stop smiling while watching.,1


In [3]:
texts = df['review'].tolist();
Y = df['sentiment'].tolist();

In [8]:
len(texts)

1600

In [6]:
texts[344]

'Brilliant acting and a touching story.'

In [9]:
max(len(t.split()) for t in texts)

9

In [10]:
texts[3]

'What a fantastic experience, I enjoyed every moment.'

In [11]:
import re
texts = [re.sub(r"[^a-zA-Z\s\']",'',t).lower() for t in texts]
texts[3]

'what a fantastic experience i enjoyed every moment'

In [19]:
from collections import Counter
class vocab :
  def __init__(self, texts,stopwords = set(),min_freq = 5,min_length=2):
    sentences = [t.split() for t in texts];
    self.vocab = set();
    self.vocab.add('<unk>'); self.vocab.add('<pad>');
    self.word2idx = {}; self.idx2word = {};
    self.word2idx['<unk>'] =1;
    self.word2idx['<pad>'] =0;
    self.idx2word[0] = '<pad>';
    self.idx2word[1] = '<unk>';
    index = 2;
    cnt = Counter(word for sentence in sentences for word in sentence);
    sentences = [[word for word in sentence if cnt[word]>=min_freq and word not in stopwords and len(word)>min_length]
                 for sentence in sentences];
    for sentence in sentences :
      for word in sentence :
        if (word not in self.vocab):
          self.vocab.add(word);
          self.word2idx[word] = index;
          self.idx2word[index] = word;
          index+=1;
  def encode(self,text):
    return [self.word2idx[t] for t in text.split()];
  def decode(self, ids):
    return [self.idx2word[t] for t in ids];


In [20]:
v = vocab(texts,min_freq = 0,min_length=0);
len(v.vocab)

87

In [27]:
class Text(Dataset):
  def __init__(self, texts,labels):
    super().__init__();
    self.texts = texts;
    self.labels = labels;
  def __len__(self): return len(self.texts);
  def __getitem__(self, index):
    text = self.texts[index];
    label = torch.tensor(self.labels[index],dtype = torch.int64);
    encoded_text = torch.tensor(v.encode(text),dtype = torch.int64);
    return encoded_text,label

In [28]:
dataset = Text(texts,Y);
dataset[0]

(tensor([2, 3, 4, 5, 6]), tensor(1))

In [33]:
def padding(ids,max_length) :
  ids = ids.tolist();
  ids = ids + [0] * (max_length-len(ids));

  return ids

In [34]:
def collate_fn(batch):
  x,y = zip(*batch);
  mx = max(len(xi) for xi in x);
  x = torch.tensor([padding(xi,mx) for xi in x],dtype = torch.int64);
  return x, y

In [35]:
train_dataset,val_dataset = random_split(dataset,[1300,300]);
train_loader = DataLoader(train_dataset,batch_size=64,shuffle=True,collate_fn=collate_fn);
val_loader = DataLoader(val_dataset,batch_size=64,shuffle=False,collate_fn=collate_fn);

In [40]:
class RNN(nn.Module):
  def __init__(self, embedding_size,hidden_size,vocab_size):
    super().__init__();
    self.embedding = nn.Embedding(vocab_size,embedding_size,padding_idx=0);
    self.rnn = nn.LSTM(embedding_size,hidden_size,batch_first = True,num_layers = 4);
    self.fc = nn.Linear(hidden_size,2);
  def forward(self, x):
    embed = self.embedding(x);
    _,(hidden,cell) = self.rnn(embed); #(batch * num_layers....);
    f = self.fc(hidden); #(batch*2);
    return f;

In [41]:
model = RNN(embedding_size=64,hidden_size=128,vocab_size=len(v.word2idx));
model

RNN(
  (embedding): Embedding(87, 64, padding_idx=0)
  (rnn): LSTM(64, 128, num_layers=4, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [42]:
device = 'cuda';
model.to(device);

In [43]:
optimizer = torch.optim.Adam(lr = 2e-3,params = model.parameters());
loss_function = nn.CrossEntropyLoss(ignore_index = v.word2idx['<pad>']);

In [59]:
b = 0;
for i in range(50):
  model.train();
  for x, y in train_loader :
    x = x.to(device);
    y = torch.tensor(y,dtype = torch.int64).to(device);
    optimizer.zero_grad();
    out = model(x).permute(1,0,2);
    out = out.reshape(out.shape[0],-1)
    loss = loss_function(out,y);
    loss.backward();
    optimizer.step()
    b+=1;
  print(f"Epoch {i+1}: ")
  print("Loss:",round(loss.item(),10))

Epoch 1: 
Loss: 7.1525e-06
Epoch 2: 
Loss: 7.0333e-06
Epoch 3: 
Loss: 6.9141e-06
Epoch 4: 
Loss: 6.7949e-06
Epoch 5: 
Loss: 6.6757e-06
Epoch 6: 
Loss: 6.7591e-06
Epoch 7: 
Loss: 6.4264e-06
Epoch 8: 
Loss: 6.2982e-06
Epoch 9: 
Loss: 6.4903e-06
Epoch 10: 
Loss: 6.0797e-06
Epoch 11: 
Loss: 5.9604e-06
Epoch 12: 
Loss: 5.722e-06
Epoch 13: 
Loss: 5.6028e-06
Epoch 14: 
Loss: 5.5194e-06
Epoch 15: 
Loss: 5.4836e-06
Epoch 16: 
Loss: 5.3644e-06
Epoch 17: 
Loss: 5.2963e-06
Epoch 18: 
Loss: 5.2214e-06
Epoch 19: 
Loss: 5.126e-06
Epoch 20: 
Loss: 5.0366e-06
Epoch 21: 
Loss: 4.8967e-06
Epoch 22: 
Loss: 4.9008e-06
Epoch 23: 
Loss: 4.7684e-06
Epoch 24: 
Loss: 4.725e-06
Epoch 25: 
Loss: 4.5895e-06
Epoch 26: 
Loss: 4.5001e-06
Epoch 27: 
Loss: 4.2915e-06
Epoch 28: 
Loss: 4.2273e-06
Epoch 29: 
Loss: 4.1723e-06
Epoch 30: 
Loss: 4.0531e-06
Epoch 31: 
Loss: 4.0531e-06
Epoch 32: 
Loss: 3.9447e-06
Epoch 33: 
Loss: 3.9339e-06
Epoch 34: 
Loss: 3.8317e-06
Epoch 35: 
Loss: 3.8239e-06
Epoch 36: 
Loss: 3.7054e-06
Epoc

In [62]:
model.eval();
c = 0;
with torch.no_grad():
  for x,y in val_loader :
    x = x.to(device);
    y = torch.tensor(y,dtype = torch.int64).to(device);
    predict = model(x);
    predict = predict.permute(1,0,2);
    predict = predict.reshape(predict.shape[0],-1).argmax(1);
    c+=(predict==y).sum().item();
print(c)

152


torch.Size([64])

In [51]:
out.shape

torch.Size([4, 64, 2])

In [None]:
########## with transformers ###########

In [63]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer

In [64]:
model_name = 'bert-base-uncased';
tokenizer = AutoTokenizer.from_pretrained(model_name);

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [81]:
class Text2(Dataset):
  def __init__(self,texts,labels):
    super().__init__()
    self.texts = texts;
    self.labels = labels;
  def __len__(self):return len(self.texts);
  def __getitem__(self, index):
    text = self.texts[index];
    label = self.labels[index];
    tokens = tokenizer(text,truncation=True,padding='max_length',max_length=20,
                       return_tensors='pt');
    label = torch.tensor(label,dtype = torch.int64);
    item = {key:val.squeeze(0) for key,val in tokens.items()};
    item['labels'] = label
    return item

In [83]:
dataset = Text2(texts,Y);
dataset[0]

{'input_ids': tensor([  101,  1037, 17743,  3811, 16755,  2009,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor(1)}

In [84]:
model2 = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=2);
model2

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [85]:
train_dataset,val_dataset = random_split(dataset,[1300,300]);
train_loader = DataLoader(train_dataset,batch_size=128,shuffle=True);
val_loader = DataLoader(val_dataset,batch_size=128,shuffle=False);

In [86]:
optimizer = torch.optim.Adam(lr=2e-5,params = model2.parameters());

In [89]:
model2.to(device);

In [90]:
for i in range(100):
  model2.train();
  for x in train_loader :
    optimizer.zero_grad()
    attention_mask = x['attention_mask'].to(device);
    labels = x['labels'].to(device);
    input_ids = x['input_ids'].to(device);
    out = model2(attention_mask = attention_mask,input_ids=input_ids,labels=labels);
    loss = out.loss;
    loss.backward();
    optimizer.step();
  if ((i+1)%5==0):
    print(f"Epoch {i+1} ---> Loss : {round(loss.item(),4)}");

Epoch 5 ---> Loss : 0.0053
Epoch 10 ---> Loss : 0.0017
Epoch 15 ---> Loss : 0.001
Epoch 20 ---> Loss : 0.0006
Epoch 25 ---> Loss : 0.0005
Epoch 30 ---> Loss : 0.0003
Epoch 35 ---> Loss : 0.0003
Epoch 40 ---> Loss : 0.0002
Epoch 45 ---> Loss : 0.0002
Epoch 50 ---> Loss : 0.0002
Epoch 55 ---> Loss : 0.0001
Epoch 60 ---> Loss : 0.0001
Epoch 65 ---> Loss : 0.0001
Epoch 70 ---> Loss : 0.0001
Epoch 75 ---> Loss : 0.0001
Epoch 80 ---> Loss : 0.0001
Epoch 85 ---> Loss : 0.0001
Epoch 90 ---> Loss : 0.0001
Epoch 95 ---> Loss : 0.0001
Epoch 100 ---> Loss : 0.0001


In [91]:
model.eval();
c = 0;
with torch.no_grad():
  for x in val_loader :
    attention_mask = x['attention_mask'].to(device);
    labels = x['labels'].to(device);
    input_ids = x['input_ids'].to(device);
    out = model2(attention_mask = attention_mask,input_ids=input_ids,labels=labels);
    logits = out.logits;
    predict = logits.argmax(1);
    c += (predict==labels).sum().item();
print(c);

300


In [92]:
test_df = pd.read_csv('test.csv');
test_df.head()

Unnamed: 0,id,review,usage
0,5000,Complete waste of time.,Private
1,5001,Disappointing from start to end.,Public
2,5002,Terribly boring and predictable.,Private
3,5003,Complete waste of time.,Public
4,5004,Complete waste of time.,Private


In [94]:
Id = test_df['id'].to_numpy();
test_texts = test_df['review'].tolist();
Y_test = [0 for _ in range(len(test_texts))];

In [95]:
test_dataset = Text2(test_texts,Y_test);
test_loader= DataLoader(test_dataset,batch_size=128,shuffle=False);

In [97]:
model2.eval();
predicts = []
c = 0;
with torch.no_grad():
  for x in test_loader :
    attention_mask = x['attention_mask'].to(device);
    labels = x['labels'].to(device);
    input_ids = x['input_ids'].to(device);
    out = model2(attention_mask = attention_mask,input_ids=input_ids,labels=labels);
    logits = out.logits;
    predict = logits.argmax(1);
    predicts.extend(predict.tolist())

In [98]:
predicts = np.array(predicts);
predicts

array([0, 0, 0, ..., 1, 0, 1])

In [99]:
sub = pd.DataFrame({'id':Id,'sentiment':predicts})
sub.head(10)

Unnamed: 0,id,sentiment
0,5000,0
1,5001,0
2,5002,0
3,5003,0
4,5004,0
5,5005,1
6,5006,1
7,5007,0
8,5008,0
9,5009,1


In [101]:
sub.to_csv('sub1.csv',index=False)