In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['imdb_master.csv']


In [2]:
import nltk
from collections import Counter
import itertools

In [3]:
import torch

Рассмотрим преобразование текстов в удобоваримый для нейронной сети вид.<br>
А именно:<br>
    - Текст разбивается на слова (токенизация, знаки препинания считаются словами)<br>
    - Слова подсчитываются для формирования ограниченного словаря. Каждому слову сопоставляется определеннный номер в словаре. 
    Редким словам назначается специальный номер (эквивалентно замене редких слов на спец. слово <UNK> (неизвестное слово)). 
    - Последовательности слов преобразуются в последовательности номеров слов. Добавляются спец. слова для обозначения начала и конца текста. 
    - Полученные последовательности выравниваются по заданной максимальной длине через обрезание или дополнение номером спец.символа <PAD>


Класс для хранения текста в виде последовательности номеров и его закодированной метки

In [4]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, label_id):
        self.input_ids = input_ids
        self.label_id = label_id

Класс словаря. Метод word2id возвращает номер слова, id2word - наоборот, восстанавливает слово.

In [5]:
class Vocab:
    def __init__(self, itos, unk_index):
        self._itos = itos
        self._stoi = {word:i for i, word in enumerate(itos)}
        self._unk_index = unk_index
        
    def __len__(self):
        return len(self._itos)
    
    def word2id(self, word):
        idx = self._stoi.get(word)
        if idx is not None:
            return idx
        return self._unk_index
    
    def id2word(self, idx):
        return self._itos[idx]

In [6]:
from tqdm import tqdm_notebook

Интерфейс объекта, преобразующего тексты в последовательности номеров.
transform выполняет преобразование при помощи словаря.
fit_transform выучивает словарь из текста и возвращает такое же преобразование при помощи свежеполученного словаря.

In [7]:
class TextToIdsTransformer:
    def transform():
        raise NotImplementedError()
        
    def fit_transform():
        raise NotImplementedError()



Простая реализация данного интерфейса. Разбиение на слова производится с помощью библиотеки NLTK.
В словаре содержатся несколько спец. слов.
После токенизации, к полученной последовательности слов добавляются слева и справа спец. слова для начала и конца текста.

In [8]:
class SimpleTextTransformer(TextToIdsTransformer):
    def __init__(self, max_vocab_size):
        self.special_words = ['<PAD>', '</UNK>', '<S>', '</S>']
        self.unk_index = 1
        self.pad_index = 0
        self.vocab = None
        self.max_vocab_size = max_vocab_size
        
    def tokenize(self, text):
        return nltk.tokenize.word_tokenize(text.lower())
        
    def build_vocab(self, tokens):
        itos = []
        itos.extend(self.special_words)
        
        token_counts = Counter(tokens)
        for word, _ in token_counts.most_common(self.max_vocab_size - len(self.special_words)):
            itos.append(word)
            
        self.vocab = Vocab(itos, self.unk_index)
    
    def transform(self, texts):
        result = []
        for text in texts:
            tokens = ['<S>'] + self.tokenize(text) + ['</S>']
            ids = [self.vocab.word2id(token) for token in tokens]
            result.append(ids)
        return result
    
    def fit_transform(self, texts):
        result = []
        tokenized_texts = [self.tokenize(text) for text in texts]
        self.build_vocab(itertools.chain(*tokenized_texts))
        for tokens in tokenized_texts:
            tokens = ['<S>'] + tokens + ['</S>']
            ids = [self.vocab.word2id(token) for token in tokens]
            result.append(ids)
        return result

Строим экземпляр входных данных. Обеспечиваем длину последовательности номеров равной max_seq_len. 

In [9]:
def build_features(token_ids, label, max_seq_len, pad_index, label_encoding):
    if len(token_ids) >= max_seq_len:
        ids = token_ids[:max_seq_len]
    else:
        ids = token_ids + [pad_index for _ in range(max_seq_len - len(token_ids))]
    return InputFeatures(ids, label_encoding[label])
        

Собираем экземпляры в тензоры

In [10]:
def features_to_tensor(list_of_features):
    text_tensor = torch.tensor([example.input_ids for example in list_of_features], dtype=torch.long)
    labels_tensor = torch.tensor([example.label_id for example in list_of_features], dtype=torch.long)
    return text_tensor, labels_tensor

In [11]:
from sklearn import model_selection

In [12]:
imdb_df = pd.read_csv('../input/imdb_master.csv', encoding='latin-1')
dev_df = imdb_df[(imdb_df.type == 'train') & (imdb_df.label != 'unsup')]
test_df = imdb_df[(imdb_df.type == 'test')]
train_df, val_df = model_selection.train_test_split(dev_df, test_size=0.05, stratify=dev_df.label)

In [13]:
max_seq_len=200
classes = {'neg': 0, 'pos' : 1}

In [14]:
text2id = SimpleTextTransformer(10000)

train_ids = text2id.fit_transform(train_df['review'])
val_ids = text2id.transform(val_df['review'])
test_ids = text2id.transform(test_df['review'])

In [15]:
print(train_df.review.iloc[0][:160])
print(train_ids[0][:30])

I won't mention any of the plot, because, although it would be highly predictable anyway, the one notable plot twist is given away everywhere, in the movie comm
[2, 18, 513, 33, 748, 117, 9, 4, 136, 5, 104, 5, 269, 16, 70, 41, 542, 723, 575, 5, 4, 42, 2754, 136, 998, 11, 354, 261, 2629, 5]


In [16]:
train_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(train_ids, train_df['label'])]

val_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(val_ids, val_df['label'])]

test_features = [build_features(token_ids, label,max_seq_len, text2id.pad_index, classes) 
                  for token_ids, label in zip(test_ids, test_df['label'])]

In [17]:
train_tensor, train_labels = features_to_tensor(train_features)
val_tensor, val_labels = features_to_tensor(val_features)
test_tensor, test_labels = features_to_tensor(test_features)

In [18]:
print(train_tensor.size())
print(len(text2id.vocab))

torch.Size([23750, 200])
10000


In [19]:
from torch.utils.data import TensorDataset,DataLoader
train_loader = DataLoader(TensorDataset(train_tensor,train_labels),64)
val_loader = DataLoader(TensorDataset(val_tensor,val_labels),64)
test_loader = DataLoader(TensorDataset(test_tensor,test_labels),64)

In [20]:
for xx,yy in train_loader:
    print(xx)
    print(yy)
    break

tensor([[   2,   18,  513,  ...,    0,    0,    0],
        [   2,    4,  202,  ...,    0,    0,    0],
        [   2,    7,   19,  ..., 9160,    9,  545],
        ...,
        [   2,   66,   18,  ...,   11,  996,  102],
        [   2,   19,  244,  ...,  733,  848,    4],
        [   2, 8180, 5109,  ...,    0,    0,    0]])
tensor([0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
        0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0])


In [114]:
import torch.nn.functional as F
import torch.nn as nn
class intel(nn.Module):
    def __init__(self):
        super(intel, self).__init__()
        self.channel = 100
        self.embedded = nn.Embedding(10000,100)
        self.conv1 = nn.Conv1d(100, self.channel, 3)
        self.pool1 = nn.MaxPool1d(1750)
        self.norm = nn.BatchNorm1d(self.channel)
        
        self.classifier1 = nn.Linear(self.channel, 1)
        
    def forward(self,x):
        x = self.embedded(x)
        x = x.transpose(2,1)
        
        #x = self.norm(x)
        a = self.conv1(x)
        a = self.pool1(a)
        a = a.relu()
    
        e = a.view(-1, self.channel) #225*64
        e = self.classifier1(e)
        e = e.sigmoid()
        return e
    def convweight(self):
        return self.conv1.weight

In [115]:
torch.manual_seed(1488)
model = intel()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0025)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)

cuda:0


In [116]:
def train(model,train_ds,val_ds,optimizer, epochs, tolerance):
    running_tolerance = tolerance
    val_loss_best = 555
    criterion = nn.BCELoss()
    for i in range(epochs):
        model.train()
        epoch_loss = 0
        val_loss = 0
        for xx,yy in train_ds:
            xx, yy = xx.cuda(), yy.cuda()
            batchsize = xx.size(0)
            optimizer.zero_grad()
            y = model.forward(xx).view(-1)
            yy = yy.float().view(-1)
            loss = criterion(y,yy)
            epoch_loss += loss
            loss.backward()
            optimizer.step()
        epoch_loss /= len(train_loader)
        with torch.no_grad():
            model.eval()
            for xx,yy in val_ds:
                xx, yy = xx.cuda(), yy.cuda()
                batchsize = xx.size(0)
                y = model.forward(xx).view(-1)
                yy = yy.float()
                loss = criterion(y,yy)
                val_loss += loss
            val_loss /= len(val_loader)
            status = "epoch=%d, loss=%f, val_loss=%f, best_loss=%f" % (i,epoch_loss,val_loss,val_loss_best)
            print(status)
            if val_loss<val_loss_best:
                torch.save(model.state_dict(), "../best_model.md")
                val_loss_best = val_loss
                running_tolerance = tolerance
            else:
                running_tolerance -=1
                if running_tolerance==0:
                    print("Stop training")   
                    break
                print("Running tolerance is ", str(running_tolerance), "best is ",str(val_loss_best))
            
    model.load_state_dict(torch.load("../best_model.md"))    
    model.eval()
    model.cpu()


In [117]:
a = model.convweight()
print(a)

Parameter containing:
tensor([[[-0.0523,  0.0548, -0.0291],
         [-0.0404, -0.0102, -0.0328],
         [ 0.0242,  0.0216, -0.0276],
         ...,
         [ 0.0167, -0.0573,  0.0436],
         [ 0.0571,  0.0190,  0.0017],
         [ 0.0238, -0.0290,  0.0169]],

        [[-0.0458,  0.0513,  0.0389],
         [ 0.0116, -0.0003, -0.0275],
         [-0.0516, -0.0180,  0.0542],
         ...,
         [-0.0459, -0.0187,  0.0119],
         [-0.0074, -0.0068,  0.0219],
         [ 0.0215, -0.0362, -0.0192]],

        [[ 0.0218, -0.0429, -0.0443],
         [-0.0507,  0.0438, -0.0022],
         [ 0.0569,  0.0416,  0.0398],
         ...,
         [-0.0116, -0.0199, -0.0502],
         [ 0.0196,  0.0075, -0.0160],
         [ 0.0120,  0.0514, -0.0421]],

        ...,

        [[-0.0061,  0.0454, -0.0533],
         [ 0.0382, -0.0172,  0.0371],
         [ 0.0430,  0.0574, -0.0027],
         ...,
         [-0.0004, -0.0110, -0.0112],
         [ 0.0298, -0.0568, -0.0572],
         [-0.0160,  0.0010, 

In [118]:
train(model,train_loader,val_loader,optimizer,10,tolerance=5)

epoch=0, loss=0.497908, val_loss=0.411067, best_loss=555.000000
epoch=1, loss=0.329436, val_loss=0.453545, best_loss=0.411067
Running tolerance is  4 best is  tensor(0.4111, device='cuda:0')
epoch=2, loss=0.221745, val_loss=0.604020, best_loss=0.411067
Running tolerance is  3 best is  tensor(0.4111, device='cuda:0')
epoch=3, loss=0.170913, val_loss=0.362664, best_loss=0.411067
epoch=4, loss=0.101298, val_loss=0.417206, best_loss=0.362664
Running tolerance is  4 best is  tensor(0.3627, device='cuda:0')
epoch=5, loss=0.059894, val_loss=0.428038, best_loss=0.362664
Running tolerance is  3 best is  tensor(0.3627, device='cuda:0')
epoch=6, loss=0.031902, val_loss=0.634125, best_loss=0.362664
Running tolerance is  2 best is  tensor(0.3627, device='cuda:0')
epoch=7, loss=0.013461, val_loss=0.857701, best_loss=0.362664
Running tolerance is  1 best is  tensor(0.3627, device='cuda:0')
epoch=8, loss=0.005415, val_loss=0.479222, best_loss=0.362664
Stop training


In [119]:
a = model.convweight()
print(a)

Parameter containing:
tensor([[[ 4.1411e-02,  4.9257e-02, -1.0855e-01],
         [ 1.4669e-01, -8.9354e-02, -7.4858e-02],
         [-3.9532e-03,  6.1721e-02,  5.2131e-02],
         ...,
         [-1.5316e-01,  2.1955e-02,  1.2843e-01],
         [ 1.8534e-01,  9.7612e-02, -2.2576e-01],
         [-2.8751e-02, -1.4800e-01,  6.4884e-02]],

        [[-2.0434e-01, -1.1605e-02,  8.3507e-02],
         [ 6.9467e-02, -8.8488e-03, -1.5448e-01],
         [-1.4281e-03, -1.6088e-01,  2.3736e-01],
         ...,
         [-1.9737e-01,  7.8212e-02,  7.0181e-03],
         [-8.8084e-02, -4.4708e-02,  1.0634e-01],
         [-7.9531e-02, -4.2765e-02, -1.4516e-01]],

        [[ 2.7406e-02,  3.7796e-02, -1.2133e-01],
         [-7.3388e-02, -1.1821e-01, -7.0801e-02],
         [ 2.4078e-01,  5.2167e-02,  1.0919e-01],
         ...,
         [-6.9856e-02, -4.6379e-02, -3.2054e-01],
         [ 2.2162e-01,  1.5694e-01, -3.5315e-02],
         [ 7.6887e-03,  5.4617e-02,  2.9532e-02]],

        ...,

        [[-5.196

In [120]:
from sklearn.metrics import classification_report

model.eval()
all_preds = []
correct_preds = []
for xx,yy in test_loader:
    xx = xx.cuda()
    model.cuda()
    y_pred = model.forward(xx)
    all_preds.extend([i[0]>0.5 for i in y_pred.tolist()])
    correct_preds.extend(yy.tolist())
print(classification_report(correct_preds,all_preds))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85     12500
           1       0.86      0.83      0.84     12500

   micro avg       0.85      0.85      0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

