In [1]:
import pandas as pd

df = pd.read_csv('train.tsv', sep='\t')

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

df = df [:1000]
vectorizer = CountVectorizer(binary=True)
text_data = df["Phrase"]
text_data = vectorizer.fit_transform(text_data)

import torch

text_tensor = torch.from_numpy(text_data.todense()).float()

#ajout du cas où on ne connait pas le mot
voca = vectorizer.get_feature_names() + list(['<unk>'])
#ajout d'un zéro à la fin de chaque tensor
unk = torch.zeros([text_tensor.shape[0],1], dtype=torch.float)
text_tensor = torch.cat((text_tensor, unk), 1).unsqueeze(1)

# label_one_hot_vector = torch.tensor(pd.get_dummies(df["Sentiment"]).values)
label_tensor = torch.tensor(df["Sentiment"])

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, embedding_size)

        self.rnn = nn.RNN(input_size = input_size, hidden_size = hidden_size, batch_first = True)
        self.h2o = nn.Linear(hidden_size, output_size)       
        
    def forward(self, input, hidden):
        
        hidden = hidden.to(device)
        output, hidden = self.rnn(input, hidden)
        output = self.h2o(hidden)
        
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, dtype=torch.float)

In [5]:
input_dim = len(voca)
embedding_size = 100
hidden_dim = 100
output_dim = 5

rnn = RNN(input_dim, embedding_size, hidden_dim, output_dim)

import torch.optim as optim

optimizer = optim.SGD(rnn.parameters(), lr=1e-3)

def train(category_tensor, text_tensor, num_epoch, batch_size):
    
    category_tensor = category_tensor.to(device)
    text_tensor = text_tensor.to(device)
    
    hidden = rnn.initHidden(batch_size)   
    criterion = nn.CrossEntropyLoss()    
    rnn.zero_grad()
    
    for epoch in range(num_epoch):
        nb_batch = len(text_tensor) / batch_size
        epoch_loss = 0
        epoch_acc = 0
        i = 0
        while i < text_tensor.size(0):
            tmp = i
            i += batch_size
            input = text_tensor[tmp:i]
            target = category_tensor[tmp:i]
            
            output, hidden = rnn(input, hidden)
            
            loss = criterion(output.squeeze(0), target)
            loss.backward(retain_graph=True)
            optimizer.step() 
            
            predicted = torch.argmax(output.data, dim=2)
            correct = (predicted == target).sum().item()
            
            epoch_loss += loss.item()
            epoch_acc += correct / batch_size            
                
        print(epoch, "loss :", epoch_loss / nb_batch, "/ acc :", epoch_acc / nb_batch)
        
    print('Fini !')

In [6]:
batch_size = 50
nb_epoch = 10

train(label_tensor, text_tensor, nb_epoch, batch_size)

0 loss : 1.5639868259429932 / acc : 0.325
1 loss : 1.1988994508981705 / acc : 0.67
2 loss : 1.1519758701324463 / acc : 0.67
3 loss : 1.093334424495697 / acc : 0.67
4 loss : 1.1421825975179671 / acc : 0.6659999999999999
5 loss : 0.9999149113893508 / acc : 0.67
6 loss : 1.0983599483966828 / acc : 0.67
7 loss : 1.0409906804561615 / acc : 0.664
8 loss : 1.6780536353588105 / acc : 0.493
9 loss : 2.3030121266841888 / acc : 0.42800000000000005
Fini !


In [9]:
predicted = torch.tensor([1,2,3,4])
target = torch.tensor([1,1,3,4])
correct = (predicted == target).sum().item()
print(correct)

3
