In [64]:
import easydict
import pickle
from sklearn.feature_extraction import FeatureHasher
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

import numpy as np
import time
import copy
import argparse

In [34]:
with open("data.pickle", 'rb') as fr: # open file
    data = pickle.load(fr, encoding='cp949')

In [14]:
def make_dict(tokens):
    doc = dict()
    
    for token in tokens:
        if token in doc:
            doc[token] += 1
        else:
            doc[token] = 1
    
    return doc

In [20]:
dic_test_x = [make_dict(token) for token in data['test_X']]
dic_train_x = [make_dict(token) for token in data['train_X']]

In [24]:
VOCAB_SIZE = min(len(dic_test_x),10000000)

In [58]:
Hasher = FeatureHasher(n_features=VOCAB_SIZE).fit(dic_train_x) 

In [43]:
class TrainDataset(Dataset):
    
    def __init__(self):

        self.H = Hasher.transform(dic_train_x)
        self.y_data = torch.from_numpy(np.array(data['train_Y']))
        
    def __getitem__(self,index):
        return torch.from_numpy(self.H[index].toarray()[0]), self.y_data[index]
    
    def __len__(self):
        return self.H.shape[0]

In [44]:
class TestDataset(Dataset):
    
    def __init__(self):

        self.H = Hasher.transform(dic_test_x)
        self.y_data = torch.from_numpy(np.array(data['test_Y'])).type(torch.long)
        
    def __getitem__(self,index):
        return torch.from_numpy(self.H[index].toarray()[0]), self.y_data[index]
    
    def __len__(self):
        return self.H.shape[0]  

In [45]:
def split_Data(dataset,val_split,batch_size):

    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(val_split * dataset_size))
    random_seed = 42
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=2, sampler = train_sampler)
    valid_loader = DataLoader(dataset=dataset, batch_size=batch_size, num_workers=2, sampler = valid_sampler)

    dataloaders= {'train':train_loader,'val':valid_loader}

    return dataloaders

In [48]:
class TextClassifier(nn.Module):
    def __init__(self,vocab_size,hidden_dim,num_class):
        super(TextClassifier,self).__init__()
        #self.embedding = nn.EmbeddingBag(vocab_size, hidden_dim, sparse=True)
        self.input_layer = nn.Linear(vocab_size,hidden_dim)
        self.hidden_layer = nn.Linear(hidden_dim, num_class)
        self.output_layer = nn.Softmax(dim=1)
        
    def forward(self,text):
        embedded = self.input_layer(text)
        output = self.hidden_layer(embedded)
        return self.output_layer(output)


In [49]:
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0 

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        for phase in ['train','val']:
            if phase == 'train':
                
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0
            count = 0

            for inputs,labels in dataloaders[phase]:
                count += len(inputs)
                inputs,labels = inputs.to(device),labels.to(device)
                inputs,labels = Variable(inputs.float()), Variable(labels)
                
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs.float())
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    
                    if phase =='train':
                        loss.backward()
                        optimizer.step()
                        
                
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                            
            epoch_loss = running_loss / count
            epoch_acc = running_corrects.double() / count
            
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            
            #scheduler.step()

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                
        time_elapsed = time.time() - since
        print('Training Epoch complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print()

    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model

In [54]:
args = easydict.EasyDict({
        "hidden_dim": 10,
        "num_epoch": 20,
        "learning_rate": 1e-2,
        "batch_size": 32,
        "momentum": 0.9,
        "val_split": 0.2
})

In [56]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

HIDDEN_DIM = args.hidden_dim
NUM_CLASS = len(set(data['test_Y']))
NUM_EPOCH = args.num_epoch
LEARN_RATE = args.learning_rate
BATCH_SIZE = args.batch_size
MOMENTUM = args.momentum
VAL_SPLIT = args.val_split

In [67]:
dataloaders= split_Data(TrainDataset(),VAL_SPLIT,BATCH_SIZE)

model = TextClassifier(VOCAB_SIZE,HIDDEN_DIM,NUM_CLASS).to(device)
criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARN_RATE, momentum = MOMENTUM)
scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

best_model = train_model(model, criterion, optimizer, scheduler, NUM_EPOCH)

Epoch 1/20
----------


BrokenPipeError: [Errno 32] Broken pipe