In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('train.tsv', sep='\t')

df = df [:1021]

train, test = train_test_split(df, test_size=0.20)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import torch

vectorizer = CountVectorizer(binary=True)
text_data = train["Phrase"]
text_data = vectorizer.fit_transform(text_data)

text_tensor_train = torch.from_numpy(text_data.todense()).float()

text_tensor_train = text_tensor_train.unsqueeze(1)

#ajout du cas où on ne connait pas le mot
voca = vectorizer.get_feature_names() #+ list(['<unk>'])
#ajout d'un zéro à la fin de chaque tensor
# unk = torch.zeros([text_tensor_train.shape[0],1], dtype=torch.float)
# text_tensor_train = torch.cat((text_tensor_train, unk), 1).unsqueeze(1)

def labelToVec(label):
    # label_one_hot_vector = torch.tensor(pd.get_dummies(df["Sentiment"]).values)
    label_tensor = torch.tensor(label["Sentiment"].values)
    return label_tensor

label_tensor_train = labelToVec(train)

In [3]:
from nltk import word_tokenize
# nltk.download('punkt')

def vocaToUnk(sentence, voca):
    s = []
    for w in word_tokenize(sentence):
        if(w not in voca):
            w = '<unk>'
        s.append(w)
    return " ".join(s)         
                
text_test = test.Phrase.apply(lambda x : vocaToUnk(x, voca))
#Ne prend pas en compte <unk>
text_test = vectorizer.transform(text_test)

text_tensor_test = torch.from_numpy(text_test.todense()).float()
label_tensor_test = labelToVec(test)

text_tensor_test = text_tensor_test.unsqueeze(1)

print(text_tensor_test.shape, text_tensor_train.shape)

torch.Size([205, 1, 361]) torch.Size([816, 1, 361])


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding(input_size, embedding_size)

        self.rnn = nn.RNN(input_size = input_size, hidden_size = hidden_size, batch_first = True)
        self.h2o = nn.Linear(hidden_size, output_size)       
        
    def forward(self, input, hidden):
        
        hidden = hidden.to(device)
        output, hidden = self.rnn(input, hidden)
        output = self.h2o(hidden)
        
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, dtype=torch.float)

In [10]:
input_dim = len(voca)
embedding_size = 100
hidden_dim = 100
output_dim = 5

rnn = RNN(input_dim, embedding_size, hidden_dim, output_dim)

import torch.optim as optim

optimizer = optim.SGD(rnn.parameters(), lr=1e-3)

def train(train_category, train_text, test_category, test_text, num_epoch, batch_size):
    
    train_category = train_category.to(device)
    train_text = train_text.to(device)
    test_category = test_category.to(device)
    test_text = test_text.to(device)
    
    size_train = train_text.size(0)
    size_test = test_text.size(0)
    
    hidden = rnn.initHidden(batch_size)   
    criterion = nn.CrossEntropyLoss()    
    rnn.zero_grad()
    
    for epoch in range(num_epoch):
        nb_batch_train = len(train_text) / batch_size
        nb_batch_test = len(test_text) / batch_size
        train_loss = 0
        train_acc = 0
        i = 0
        while (i + batch_size) <= size_train:
            tmp = i
            i += batch_size
            input = train_text[tmp:i]
            target = train_category[tmp:i]
            output, hidden = rnn(input, hidden)
            
            loss = criterion(output.squeeze(0), target)
            loss.backward(retain_graph=True)
            optimizer.step() 
            
            predicted = torch.argmax(output.data, dim=2)
            correct = (predicted == target).sum().item()
            
            train_loss += loss.item()
            train_acc += correct / batch_size
        
        i = 0
        test_loss = 0
        test_acc = 0        
        
        #j'aime faire des boucles presque pareilles
        while (i + batch_size) <= size_test:
            tmp = i
            i += batch_size
            input = test_text[tmp:i]
            target = test_category[tmp:i]
            
            output, _ = rnn(input, hidden)
            loss = criterion(output.squeeze(0), target)   
            
            predicted = torch.argmax(output.data, dim=2)
            correct = (predicted == target).sum().item()
            
            test_loss += loss.item()
            test_acc += correct / batch_size
                
        print(epoch, "loss :", train_loss / nb_batch_train, "/ acc :", train_acc / nb_batch_train)
        print("Test loss :", test_loss / nb_batch_test, "/ acc :", test_acc / nb_batch_test)

        
    print('Fini !')

In [11]:
batch_size = 50
nb_epoch = 10

train(label_tensor_train, text_tensor_train, label_tensor_test, text_tensor_test, nb_epoch, batch_size)

0 loss : 1.4904430903056087 / acc : 0.5710784313725489
Test loss : 1.4113151736375764 / acc : 0.6390243902439026
1 loss : 1.3114212701718013 / acc : 0.6495098039215685
Test loss : 1.171937076056876 / acc : 0.6390243902439026
2 loss : 1.0847953202969887 / acc : 0.6495098039215685
Test loss : 1.039134700123857 / acc : 0.6390243902439026
3 loss : 1.1092096019317121 / acc : 0.6495098039215685
Test loss : 1.1865225943123423 / acc : 0.6390243902439026
4 loss : 1.1097256678576564 / acc : 0.6495098039215685
Test loss : 0.9892009089632733 / acc : 0.6390243902439026
5 loss : 1.0165864674776208 / acc : 0.6519607843137255
Test loss : 1.0761971880749959 / acc : 0.6292682926829269
6 loss : 1.0967105919239568 / acc : 0.6580882352941176
Test loss : 1.0402085432192176 / acc : 0.6341463414634146
7 loss : 0.978087126186081 / acc : 0.6507352941176469
Test loss : 0.9752777291507256 / acc : 0.6390243902439026
8 loss : 0.9850570165059146 / acc : 0.6495098039215685
Test loss : 1.0633199534765105 / acc : 0.639