In [1]:
import re
import os
import random
import tarfile
import urllib
from torchtext import data


class TarDataset(data.Dataset):

  

    @classmethod

    def download_or_unzip(cls, root):

        path = os.path.join(root, cls.dirname)

        if not os.path.isdir(path):

            tpath = os.path.join(root, cls.filename)

            if not os.path.isfile(tpath):

                print('downloading')

                urllib.request.urlretrieve(cls.url, tpath)

            with tarfile.open(tpath, 'r') as tfile:

                print('extracting')

                tfile.extractall(root)

        return os.path.join(path, '')





class MR(TarDataset):



    url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'

    filename = 'rt-polaritydata.tar'

    dirname = 'rt-polaritydata'



    @staticmethod

    def sort_key(ex):

        return len(ex.text)



    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):

        

        def clean_str(string):


            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

            string = re.sub(r"\'s", " \'s", string)

            string = re.sub(r"\'ve", " \'ve", string)

            string = re.sub(r"n\'t", " n\'t", string)

            string = re.sub(r"\'re", " \'re", string)

            string = re.sub(r"\'d", " \'d", string)

            string = re.sub(r"\'ll", " \'ll", string)

            string = re.sub(r",", " , ", string)

            string = re.sub(r"!", " ! ", string)

            string = re.sub(r"\(", " \( ", string)

            string = re.sub(r"\)", " \) ", string)

            string = re.sub(r"\?", " \? ", string)

            string = re.sub(r"\s{2,}", " ", string)

            return string.strip()



        text_field.preprocessing = data.Pipeline(clean_str)

        fields = [('text', text_field), ('label', label_field)]



        if examples is None:

            path = self.dirname if path is None else path

            examples = []

            with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:

                examples += [

                    data.Example.fromlist([line, 'negative'], fields) for line in f]

            with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:

                examples += [

                    data.Example.fromlist([line, 'positive'], fields) for line in f]

        super(MR, self).__init__(examples, fields, **kwargs)



    @classmethod

    def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs):

        

        path = cls.download_or_unzip(root)

        examples = cls(text_field, label_field, path=path, **kwargs).examples

        if shuffle: random.shuffle(examples)

        dev_index = -1 * int(dev_ratio*len(examples))



        return (cls(text_field, label_field, examples=examples[:dev_index]),

                cls(text_field, label_field, examples=examples[dev_index:]))

In [2]:
import torch

import torch.nn as nn

import torch.nn.functional as F

from torch.autograd import Variable





class CNN_Text(nn.Module):

    

    def __init__(self, args):

        super(CNN_Text, self).__init__()

        self.args = args

        V = args.embed_num

        D = args.embed_dim

        C = args.class_num

        Ci = 1

        Co = args.kernel_num

        Ks = args.kernel_sizes
        

       
        self.embed = nn.Embedding(V, D)

        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])

        self.dropout = nn.Dropout(args.dropout)

        self.fc1 = nn.Linear(len(Ks)*Co*args.kmax, 256)
        
        self.fc2 = nn.Linear(256, C)
                
        
    def kmax_pooling(self, x, dim, k):
      
        index = x.topk(k, dim = dim)[1].sort(dim = dim)[0]
      
        return x.gather(dim, index)


    def forward(self, x):

        x = self.embed(x)  # (N, W, D)

        if self.args.static:

            x = Variable(x)

        x = x.unsqueeze(1)  # (N, Ci, W, D)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        
        x = [self.kmax_pooling(i, 2, args.kmax).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
       
        x = torch.cat(x, 1)
        
        x = x.view(x.size(0),-1)

        x = self.dropout(x)  # (N, len(Ks)*Co)

        x = self.fc1(x)  # (N, C)
        
        x = F.relu(x)
        
        x = self.dropout(x)  # (N, len(Ks)*Co)

        logit = self.fc2(x)

        return logit

In [3]:
import os

import sys

import torch

import torch.autograd as autograd

import torch.nn.functional as F





def train(train_iter, dev_iter, model, args):

#    if args.cuda:

 #       model.cuda()



    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)



    steps = 0

    best_acc = 0

    last_step = 0

    model.train()

    for epoch in range(1, args.epochs+1):

        for batch in train_iter:

            feature, target = batch.text, batch.label

            feature.data.t_(), target.data.sub_(1)  # batch first, index align

            

            feature, target = feature.cuda(), target.cuda()



            optimizer.zero_grad()

            logit = model(feature)



     
            loss = F.cross_entropy(logit, target)

            loss.backward()

            optimizer.step()



            steps += 1

            if steps % args.log_interval == 0:

                corrects = (torch.max(logit, 1)[1].view(target.size()).data == target.data).sum()

                accuracy = 100.0 * corrects/batch.batch_size

                sys.stdout.write(

                    '\rBatch[{}] - loss: {:.6f}  acc: {:.4f}%({}/{})'.format(steps, 

                                                                             loss.data[0], 

                                                                             accuracy,

                                                                             corrects,

                                                                             batch.batch_size))

            if steps % args.test_interval == 0:

                dev_acc = eval(dev_iter, model, args)

                if dev_acc > best_acc:

                    best_acc = dev_acc

                    last_step = steps

                    if args.save_best:

                        save(model, args.save_dir, 'best', steps)

                else:

                    if steps - last_step >= args.early_stop:

                        print('early stop by {} steps.'.format(args.early_stop))

            elif steps % args.save_interval == 0:

                save(model, args.save_dir, 'snapshot', steps)





def eval(data_iter, model, args):

    model.eval()

    corrects, avg_loss = 0, 0

    for batch in data_iter:

        feature, target = batch.text, batch.label

        feature.data.t_(), target.data.sub_(1)  # batch first, index align

        feature, target = feature.cuda(), target.cuda()



        logit = model(feature)

        loss = F.cross_entropy(logit, target, size_average=False)



        avg_loss += loss.data[0]

        corrects += (torch.max(logit, 1)

                     [1].view(target.size()).data == target.data).sum()



    size = len(data_iter.dataset)

    avg_loss /= size

    accuracy = 100.0 * corrects/size

    print('\nEvaluation - loss: {:.6f}  acc: {:.4f}%({}/{}) \n'.format(avg_loss, 

                                                                       accuracy, 

                                                                       corrects, 

                                                                       size))

    return accuracy





def predict(text, model, text_field, label_feild, cuda_flag):

    assert isinstance(text, str)

    model.eval()


    text = text_field.preprocess(text)

    text = [[text_field.vocab.stoi[x] for x in text]]

    x = text_field.tensor_type(text)

    x = autograd.Variable(x, volatile=True)

    if cuda_flag:

        x = x.cuda()

    print(x)

    output = model(x)

    _, predicted = torch.max(output, 1)

    return label_feild.vocab.itos[predicted.data[0][0]+1]





def save(model, save_dir, save_prefix, steps):

    if not os.path.isdir(save_dir):

        os.makedirs(save_dir)

    save_prefix = os.path.join(save_dir, save_prefix)

    save_path = '{}_steps_{}.pt'.format(save_prefix, steps)

    torch.save(model.state_dict(), save_path)

In [21]:
import os

import argparse

import datetime

import torch

import torchtext.data as data

import torchtext.datasets as datasets

import torchtext


class args():
    lr=0.001
    epochs=200
    batch_size=128
    log_interval=1
    test_interval=100
    save_interval=500
    save_dir='snapshot'
    early_stop=1000
    save_best=True
    shuffle=False
    dropout=0.6
    max_norm=3.0
    embed_dim=256
    kernel_num=150
    kernel_sizes='1,2,3,4,5'
    static=False
    device=1
    no_cuda=False
    snapshot=None
    predict=None
    test=False
    kmax=3
    gru_l=1
    g_out=100

# load MR dataset

def mr(text_field, label_field, **kargs):

    train_data, dev_data = MR.splits(text_field, label_field)

    text_field.build_vocab(train_data, dev_data)

    label_field.build_vocab(train_data, dev_data)
    
    #text_field.build_vocab(train_data, dev_data ,torchtext.vocab.GloVe(name='6B',dim=200))

    #label_field.build_vocab(train_data, dev_data ,torchtext.vocab.GloVe(name='6B',dim=200))

    train_iter, dev_iter = data.Iterator.splits(

                                (train_data, dev_data), 

                                batch_sizes=(args.batch_size, len(dev_data)),

                                **kargs)

    return train_iter, dev_iter





# load data

print("\nLoading data...")

text_field = data.Field(lower=True)

label_field = data.Field(sequential=False)

train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)






# update args and print

args.embed_num = len(text_field.vocab)

args.class_num = len(label_field.vocab) - 1


args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))



print("\nParameters:")

for attr, value in sorted(args.__dict__.items()):

    print("\t{}={}".format(attr.upper(), value))





# model

cnn = CNN_Text(args).cuda()

#init = text_field.vocab

#cnn.embed.weight.data=init.vectors


if args.snapshot is not None:

    print('\nLoading model from {}...'.format(args.snapshot))

    cnn.load_state_dict(torch.load(args.snapshot))



# train or predict

if args.predict is not None:

    label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)

    print('\n[Text]  {}\n[Label] {}\n'.format(args.predict, label))

elif args.test:

    try:

        eval(test_iter, cnn, args) 

    except Exception as e:

        print("\nSorry. The test dataset doesn't  exist.\n")

else:

    print()

    try:

        train(train_iter, dev_iter, cnn, args)

    except KeyboardInterrupt:

        print('\n' + '-' * 89)

        print('Exiting from training early')


Loading data...

Parameters:
	__DICT__=<attribute '__dict__' of 'args' objects>
	__DOC__=None
	__MODULE__=__main__
	__WEAKREF__=<attribute '__weakref__' of 'args' objects>
	BATCH_SIZE=128
	CLASS_NUM=2
	DEVICE=1
	DROPOUT=0.6
	EARLY_STOP=1000
	EMBED_DIM=256
	EMBED_NUM=19792
	EPOCHS=200
	G_OUT=100
	GRU_L=1
	KERNEL_NUM=150
	KERNEL_SIZES=[1, 2, 3, 4, 5]
	KMAX=3
	LOG_INTERVAL=1
	LR=0.001
	MAX_NORM=3.0
	NO_CUDA=False
	PREDICT=None
	SAVE_BEST=True
	SAVE_DIR=snapshot/2018-03-06_13-31-30
	SAVE_INTERVAL=500
	SHUFFLE=False
	SNAPSHOT=None
	STATIC=False
	TEST=False
	TEST_INTERVAL=100

Batch[71] - loss: 0.693117  acc: 57.0312%(73/128)



Batch[100] - loss: 0.672384  acc: 53.9062%(69/128)
Evaluation - loss: 0.674918  acc: 56.9755%(535/939) 





Batch[200] - loss: 0.579256  acc: 70.3125%(90/128)
Evaluation - loss: 0.621235  acc: 64.2173%(603/939) 

Batch[300] - loss: 0.267434  acc: 88.2812%(113/128)
Evaluation - loss: 0.649011  acc: 73.0564%(686/939) 

Batch[400] - loss: 0.054941  acc: 99.2188%(127/128))
Evaluation - loss: 0.734123  acc: 76.4643%(718/939) 

Batch[500] - loss: 0.005936  acc: 100.0000%(128/128)
Evaluation - loss: 0.842968  acc: 75.0799%(705/939) 

Batch[600] - loss: 0.001910  acc: 100.0000%(128/128)
Evaluation - loss: 0.902478  acc: 75.6124%(710/939) 

Batch[700] - loss: 0.001319  acc: 100.0000%(128/128)
Evaluation - loss: 0.956990  acc: 75.9318%(713/939) 

Batch[800] - loss: 0.000965  acc: 100.0000%(128/128)
Evaluation - loss: 1.023293  acc: 75.9318%(713/939) 

Batch[900] - loss: 0.000477  acc: 100.0000%(128/128)
Evaluation - loss: 1.109879  acc: 76.4643%(718/939) 

Batch[1000] - loss: 0.000143  acc: 100.0000%(128/128)
Evaluation - loss: 1.201274  acc: 76.2513%(716/939) 

Batch[1100] - loss: 0.000167  acc: 100.