In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data
import numpy as np
import re
from tensorboardX import SummaryWriter

In [2]:
import glob
import unicodedata
import tqdm
from sklearn.model_selection import train_test_split
import gensim
from time import strftime, gmtime
import copy

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
neg_list = glob.glob('txt_sentoken/neg/*.txt')
pos_list = glob.glob('txt_sentoken/pos/*.txt')

In [5]:
pos_sentences = []
neg_sentences = []

In [6]:
for pos_txt in pos_list:
    with open(pos_txt, 'r', encoding='utf-8') as f:
        pos_sentences += f.readlines()

In [7]:
for neg_txt in neg_list:
    with open(neg_txt, 'r', encoding='utf-8') as f:
        neg_sentences += f.readlines()

In [8]:
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [9]:
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [10]:
def clean_txt(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

In [11]:
def preprocess(sentences):
    output = []
    for s in sentences:
        s = unicodeToAscii(s)
        #s = normalizeString(s)
        s = clean_txt(s)
        output.append(s)
    return output

In [12]:
pos_sentences = preprocess(pos_sentences)
neg_sentences = preprocess(neg_sentences)

In [13]:
def create_pairs(sentences, positive=True):
    pairs = []
    for s in sentences:
        if positive:
            pairs.append((s, 1))
        else:
            pairs.append((s, 0))
    return pairs

In [14]:
pos_pairs = create_pairs(pos_sentences, True)
neg_pairs = create_pairs(neg_sentences, False)

In [15]:
whole_pairs = pos_pairs + neg_pairs

In [16]:
train_split, test_split = train_test_split(whole_pairs, test_size=0.2)

In [17]:
def sen2tensor(sentence, dictionary, eval=False, fixed_len=70):
    sen = sentence.split(' ')
    output = []
    if len(sen) < fixed_len:
        while len(sen) < fixed_len:
            sen.append('<PAD>')
    else:
        sen = sen[:fixed_len]

    for word in sen:
        if eval and word not in dictionary.word2ix:
            output.append(dictionary.word2ix['<UNK>'])
        else:
            output.append(dictionary.word2ix[word])
    output = torch.LongTensor(output)
    
    return output

In [18]:
class Dataset(data.Dataset):
    
    def __init__(self, pairs, dictionary, fixed_length=70):
        #self.pos_sen = pos_pair
        #self.neg_sen = neg_pair
        self.pairs = pairs
        self.dictionary = dictionary
        #self.dataset = pos_pair + neg_pair
        #self.pos_len = len(pos_pair)
        #self.neg_len = len(neg_pair)
        self.fixed_len = fixed_length
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, ix):
        out = sen2tensor(self.pairs[ix][0], self.dictionary, fixed_len=self.fixed_len)
        if self.pairs[ix][1] == 1:
            label = torch.tensor(1)
        else:
            label = torch.tensor(0)
        
        return out, label

In [19]:
class Dict(object):
    def __init__(self, sentences):
        self.word2ix = {'<PAD>':0, '<UNK>':1}
        self.ix2word = {0 : '<PAD>', 1 : '<UNK>'}
        self.n_words = 2
    
        for sentence in sentences:
            sentence = sentence[0].split(' ')
            for word in sentence:
                if word not in self.word2ix:
                    self.word2ix[word] = self.n_words
                    self.ix2word[self.n_words] = word
                    self.n_words += 1
                else:
                     continue
        print(self.n_words, 'counted')

In [20]:
dictionary = Dict(whole_pairs)

40696 counted


In [21]:
trainset = Dataset(train_split, dictionary)
testset = Dataset(test_split, dictionary)

In [22]:
def read_pretrained_embed(path, dictionary):
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    
    vector = []
    for word in dictionary.word2ix:
        if word in word2vec.vocab:
            vector.append(word2vec[word])
        else:
            vector.append(np.random.uniform(-0.01, 0.01, 300))
    return np.array(vector)

In [23]:
pretrained = read_pretrained_embed('GoogleNews-vectors-negative300.bin', dictionary)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [24]:
class Convnet(nn.Module):
    
    def __init__(self, dictionary, embed_dim, kernel_sizes, vector=None, pretrained=True, num_features=100, dropout=0.5, output_size=2):
        super(Convnet, self).__init__()
        self.embedding = nn.Embedding(dictionary.n_words, embed_dim)
        if pretrained:
            self.embedding.weight.data.copy_(torch.from_numpy(vector))
        
        self.kernel_sizes = kernel_sizes
        self.num_features = num_features
        self.dropout = dropout
                
        for size in kernel_sizes:
            setattr(self, 'conv_'+str(size), nn.Conv1d(1, num_features, embed_dim * size, stride=embed_dim))
        
        self.linear1 = nn.Linear(len(kernel_sizes) * num_features, 128)
        self.linear2 = nn.Linear(128, output_size)
        
        
    def forward(self, x):
        batch_size, seq_len = x.shape
        
        x = self.embedding(x).view(batch_size, 1, -1)
                   
        conv_outputs = [F.max_pool1d(F.relu(getattr(self, 'conv_'+str(filter_size))(x)), seq_len - filter_size +1).view(-1, self.num_features) 
                        for filter_size in self.kernel_sizes]
        
        out = torch.cat(conv_outputs, 1)
        out = F.dropout(out, self.dropout)
        out = F.dropout(F.relu(self.linear1(out)), self.dropout)
        out = self.linear2(out)
        
        return out

In [37]:
net = Convnet(dictionary, 300, [3,4,5], pretrained, True).to(device)

In [38]:
net.modules

<bound method Module.modules of Convnet(
  (embedding): Embedding(40696, 300)
  (conv_3): Conv1d(1, 100, kernel_size=(900,), stride=(300,))
  (conv_4): Conv1d(1, 100, kernel_size=(1200,), stride=(300,))
  (conv_5): Conv1d(1, 100, kernel_size=(1500,), stride=(300,))
  (linear1): Linear(in_features=300, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=2, bias=True)
)>

In [39]:
trainloader = data.DataLoader(trainset, batch_size=50, shuffle=True)
testloader = data.DataLoader(testset, batch_size=50, shuffle=False)
criterion = nn.CrossEntropyLoss()
#criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(params=net.parameters(), lr=1e-1)

In [44]:
def train():
    
    writer = SummaryWriter(log_dir='log/')# + strftime('%H:%M:%S', gmtime()))
    
    epoch_loss = 0.0
    for epoch in tqdm.tnrange(10):
        net.train()
        
        acc = 0.0
        total = 0.0
        training_loss = 0.0
        max_test_acc = 0.0
        
        for i, (data, target) in enumerate(tqdm.tqdm_notebook(trainloader)):
            
            data, target = data.to(device), target.to(device)#.unsqueeze(1).float()
            output = net(data)
            loss = criterion(output, target)
            training_loss += loss.item()
            epoch_loss += loss.item()
            
            _, max = output.max(dim=1)
            acc += (max == target).sum().item()
            total += output.shape[0]
            
            
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), max_norm=3.0)
            optimizer.step()
            
            if (i+1) % 1000 == 0:
                print('epoch %d | step %d | loss %0.4f | train accuracy %0.2f %%' %(epoch+1, i+1, training_loss/1000, 100*acc/total))
                training_loss = 0.0
        
        test_loss, test_acc = test()
        writer.add_scalar('loss/train', epoch_loss/total, epoch)
        writer.add_scalar('acc/train', acc/total, epoch)
        writer.add_scalar('loss/test', test_loss, epoch)
        writer.add_scalar('acc/test', test_acc, epoch)
        
        if test_acc > max_test_acc:
            max_test_acc = test_acc
            best_model = copy.deepcopy(net)
            best_epoch = epoch+1
    writer.close()
    
    torch.save(best_model, 'best_model_epoch_{}.pt'.format(best_epoch))

In [45]:
def test():
    
    correct = 0.0
    total = 0.0
    test_loss = 0.0
    
    with torch.no_grad():
        for i, (data, label) in enumerate(testloader):
            data, label = data.to(device), label.to(device)#.unsqueeze(1)

            out = net(data)
            loss = criterion(out, label)
            test_loss += loss.item()
            topk, topi = out.max(dim=1)
            correct += (label == topi).sum().item()
            total += out.shape[0]
    
    print('Test accuracy : %0.2f %%' %(100 * correct / total))
    
    return test_loss / total, correct / total

In [46]:
train()

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 1 | step 1000 | loss 0.5622 | train accuracy 70.87 %
Test accuracy : 64.35 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 2 | step 1000 | loss 0.5305 | train accuracy 73.49 %
Test accuracy : 65.35 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 3 | step 1000 | loss 0.4958 | train accuracy 75.81 %
Test accuracy : 64.92 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 4 | step 1000 | loss 0.4606 | train accuracy 77.97 %
Test accuracy : 63.64 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 5 | step 1000 | loss 0.4278 | train accuracy 80.17 %
Test accuracy : 64.76 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 6 | step 1000 | loss 0.3941 | train accuracy 81.81 %
Test accuracy : 65.43 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 7 | step 1000 | loss 0.3566 | train accuracy 83.94 %
Test accuracy : 64.97 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 8 | step 1000 | loss 0.3355 | train accuracy 85.19 %
Test accuracy : 65.66 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 9 | step 1000 | loss 0.3067 | train accuracy 86.76 %
Test accuracy : 64.75 %


HBox(children=(IntProgress(value=0, max=1036), HTML(value='')))

epoch 10 | step 1000 | loss 0.2857 | train accuracy 87.51 %
Test accuracy : 64.12 %


In [270]:
def evaluate(sentence, dictionary):
    
    data = sen2tensor(sentence, dictionary, eval=True).unsqueeze(0).to(device)
    output = net(data)
    print(torch.softmax(output, 1))