In [23]:
import torch
import os
import json
import time
from tqdm import tqdm_notebook as tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
with open("data_discr_ganre_arr_word2int","r") as f:
    dataset=json.load(f)
with open("gange","r") as f:
    gange = json.load(f)
with open("vocab","r") as f:
    vocab = json.load(f)

In [25]:
import torch.nn as nn
import torch.nn.functional as F
class TextGanre(nn.Module):
    def __init__(self,batch_size, vocab_size, embedding_dim, num_class, hidden_size=128):
        super().__init__()
        self.embedding_dim=embedding_dim
        self.batch_size = batch_size
        self.input_dim = vocab_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.basic_rnn = nn.LSTM(embedding_dim, hidden_size)
        self.dropout = nn.Dropout(0.3)
        
        self.clasifiter = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, num_class),
            nn.Sigmoid()
        )

        self.init_weights()
        
    def init_hidden(self):
        # (num_layers, batch_size, n_neurons)
        self.c_0 = torch.zeros(1,self.batch_size, self.hidden_size)
        self.h_0 = torch.zeros(1,self.batch_size, self.hidden_size)

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)

    def forward(self, text):
        embedded = self.embedding(text)
        output, (self.h_0 , self.c_0) = self.basic_rnn(embedded, (self.h_0 , self.c_0))
        
        output = self.clasifiter(output)
        
        return output

In [26]:
EMBED_DIM = 16
NUN_CLASS = len(gange)
NGRAMS = 2
BATCH_SIZE = 16
VOCAB_SIZE=len(vocab)
N_EPOCHS =5 

model = TextGanre(BATCH_SIZE, VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [27]:
def generate_batch(batch):
    d=[b["ganre"] for b in batch]
    label = torch.tensor(d).to(torch.float)
    text = [torch.tensor(b["discr"]).to(torch.int64) for b in batch]
    
    text=torch.nn.utils.rnn.pad_sequence( text , batch_first=True)
    return text.view(-1,len(batch)), label

In [28]:
def acur(pred, result):

    result=result.tolist()
    s = 0
    c = 1
    
    for i,p in enumerate(pred[-1]):    
        p=p.tolist()
        count = sum(result[i])
        
        top = sorted(p)[-int(count):]
        for t in top:
            s+=result[i][p.index(t)]
        c+=count
            
    return s/c

In [32]:
from torch.utils.data import DataLoader

def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    
    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()        
        model.init_hidden() 
        
        text, cls = text.to(device), cls.to(device)
        try:
            output = model(text)

            loss = criterion(output, cls)
            train_loss += loss.detach().item()
            loss.backward()
            optimizer.step()

            train_acc += 1 - abs((output-cls).mean())

       
        except:
            print("Error")
            break

    # Adjust the learning rate
    scheduler.step()

    return train_loss / i, train_acc / i

In [33]:
from torch.utils.data.dataset import random_split

train_len = int(len(dataset) * 0.95)
sub_train_, sub_valid_ = random_split(dataset, [train_len, len(dataset) - train_len])

In [34]:
model

TextGanre(
  (embedding): Embedding(69127, 16)
  (basic_rnn): LSTM(16, 128)
  (dropout): Dropout(p=0.3, inplace=False)
  (clasifiter): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=26, bias=True)
    (3): Sigmoid()
  )
)

In [35]:
import torch.optim as optim

criterion = torch.nn.MultiLabelSoftMarginLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train_func(sub_train_)
#     valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60
    

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
#     print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Error
Epoch: 1  | time in 7 minutes, 20 seconds
	Loss: 0.6960(train)	|	Acc: 91.3%(train)
Error
Epoch: 2  | time in 6 minutes, 24 seconds
	Loss: 0.6931(train)	|	Acc: 91.5%(train)
Error
Epoch: 3  | time in 679 minutes, 50 seconds
	Loss: 0.6931(train)	|	Acc: 91.5%(train)
Error
Epoch: 4  | time in 5 minutes, 53 seconds
	Loss: 0.6931(train)	|	Acc: 91.5%(train)
Error
Epoch: 5  | time in 5 minutes, 58 seconds
	Loss: 0.6931(train)	|	Acc: 91.5%(train)


In [36]:
import re

class Porter:
	PERFECTIVEGROUND =  re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
	REFLEXIVE = re.compile(u"(с[яь])$")
	ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
	PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
	VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
	NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
	RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
	DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
	DER = re.compile(u"ость?$")
	SUPERLATIVE = re.compile(u"(ейше|ейш)$")
	I = re.compile(u"и$")
	P = re.compile(u"ь$")
	NN = re.compile(u"нн$")

	def stem(word):
		word = word.lower()
		word = word.replace(u'ё', u'е')
		m = re.match(Porter.RVRE, word)
		if m.groups():
			pre = m.group(1)
			rv = m.group(2)
			temp = Porter.PERFECTIVEGROUND.sub('', rv, 1)
			if temp == rv:
				rv = Porter.REFLEXIVE.sub('', rv, 1)
				temp = Porter.ADJECTIVE.sub('', rv, 1)
				if temp != rv:
					rv = temp
					rv = Porter.PARTICIPLE.sub('', rv, 1)
				else:
					temp = Porter.VERB.sub('', rv, 1)
					if temp == rv:
						rv = Porter.NOUN.sub('', rv, 1)
					else:
						rv = temp
			else:
				rv = temp
			
			rv = Porter.I.sub('', rv, 1)

			if re.match(Porter.DERIVATIONAL, rv):
				rv = Porter.DER.sub('', rv, 1)

			temp = Porter.P.sub('', rv, 1)
			if temp == rv:
				rv = Porter.SUPERLATIVE.sub('', rv, 1)
				rv = Porter.NN.sub(u'н', rv, 1)
			else:
				rv = temp
			word = pre+rv
		return word
	stem=staticmethod(stem)

In [37]:
def text_preproc(text):
    reg = re.compile('[^а-яА-Я ]')
    text = (reg.sub('', text))
    text=re.compile("\s+").sub(" ",text).lower()
    
    res=[]
    for w in text.split(" "):
        try:
            if len(w)>4:
                try:
                    res.append(vocab[Porter.stem(w)])
                except:
                    pass
            else:
                res.append(vocab[w])
        except:
            pass
    
    return res

In [38]:
text_preproc("Очень страшный фильм")

[16206, 62696, 23914]

In [39]:
def predict(text):
    text, _ = generate_batch([{
        "discr":text_preproc(text),
        "ganre":[0]
    }])
    return text

test = predict("Там, где круглый год лежат снега, а небо озаряет северное сияние, живет Ледяная принцесса Лилли с голубыми, как древний лед, волосами. Однажды Лилли и её друг, полярный медведь Лимбо, встречают юного дракона, который не умеет извергать пламя. Лилли обещает помочь ему. Но путь, который их ждет, будет не из легких, впереди новых друзей ждет немало приключений и опасных испытаний.")

In [51]:
newmodel.h_0 = torch.zeros(1,1, 128)
newmodel.c_0 = torch.zeros(1,1, 128)

pred=newmodel(test)
pred[0][-1][0]

tensor([-0.9580,  0.9682, -0.0804,  0.9751,  0.9522, -0.9875,  0.9740, -0.9696,
        -0.0681, -0.9734,  0.9715, -0.9622,  0.9677,  0.9236,  0.9750, -0.9862,
         0.9149, -0.9823, -0.9770,  0.2799, -0.9873, -0.9828,  0.9732, -0.9641,
         0.9777,  0.9688, -0.9783,  0.9482, -0.9466, -0.9739, -0.9594, -0.9714,
        -0.9650,  0.9774, -0.9713,  0.9609,  0.9692,  0.9677, -0.9722, -0.9776,
        -0.9711, -0.9756, -0.9769, -0.9545, -0.9815,  0.9394,  0.9857,  0.9662,
        -0.9363,  0.9793,  0.9779,  0.9407,  0.9431,  0.9775,  0.6681,  0.9719,
        -0.9808, -0.9688,  0.9632, -0.9782,  0.8912,  0.9567, -0.9486,  0.9992,
        -0.9813,  0.9238, -0.9760, -0.9483, -0.9815, -0.9691,  0.9688,  0.9533,
         0.9745,  0.9714, -0.9805, -0.9824,  0.9893,  0.9746,  0.9785, -0.9628,
        -0.9949,  0.9448, -0.9620, -0.9608, -0.9812,  0.9766,  0.9863, -0.9704,
         0.9885,  0.9213,  0.9692, -0.8247, -0.9814, -0.9842,  0.9234,  0.9443,
         0.9683,  0.9837, -0.9554,  0.98

In [21]:
top = sorted(pred)[-5:]
for t in top:
    print(gange[pred.index(t)])

Боевик


In [None]:
model.eval()

In [45]:
newmodel = torch.nn.Sequential(*(list(model.children())[:-2]))

In [46]:
torch.save(newmodel, "model_lstm_short")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
