### Install required libraries.

In [None]:
from IPython.display import clear_output

# Install runtime dependencies. urllib3 and transformers are pinned to exact versions;
# NOTE(review): tensorflow_text is installed unpinned — consider pinning it for reproducibility.
!pip3 install tensorflow_text
!pip3 install urllib3==1.25.4
!pip3 install transformers==2.8.0
!pip install --upgrade pip
# Fetch the ru-gpts helper script that a later cell star-imports as `generate_transformers`.
# wget -N only re-downloads when the remote copy is newer than the local one.
!wget -N https://raw.githubusercontent.com/sberbank-ai/ru-gpts/master/generate_transformers.py

# Hide the installer output produced above.
clear_output()

In [None]:
import pandas as pd

# !wget https://www.dropbox.com/s/ei5vw6cbd9fragp/dataset_200.xlsx

# data = pd.read_excel('dataset_200.xlsx')
# data.dropna(inplace=True)

import json
import numpy as np
from sklearn.model_selection import train_test_split

# delimiter = '>>>'

# sentences = data.apply(lambda x: x[0] + f' {delimiter} ' + x[1], axis=1)

### Arguments for generation

In [None]:
from generate_transformers import *

class Args:
    """Plain settings holder for text generation (stands in for parsed CLI arguments).

    Every instance starts with the same defaults; callers adjust attributes
    afterwards (e.g. ``args.k = 3``).

    Attributes:
        model_type: Key used to look up the model/tokenizer classes.
        model_name_or_path: Hub identifier or local directory of the checkpoint.
        prompt: Default prompt text (empty).
        length: Number added to the prompt's token count to form ``max_length``
            in ``generate_sequences``.
        stop_token: Marker at which decoded text is cut off.
        k: Passed as ``top_k`` to ``model.generate``.
        p: Passed as ``top_p`` to ``model.generate``.
        temperature: Sampling temperature.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        num_return_sequences: How many samples to draw per prompt.
        device: Device the model and encoded prompt are moved to.
        seed: Random seed value (not consumed by any code visible in this notebook).
    """

    def __init__(self):
        defaults = {
            # model selection
            'model_type': 'gpt2',
            'model_name_or_path': 'sberbank-ai/rugpt3large_based_on_gpt2',
            # prompt handling
            'prompt': '',
            'length': 50,
            'stop_token': '</s>',
            # sampling
            'k': 5,
            'p': 0.95,
            'temperature': 1,
            'repetition_penalty': 1,
            'num_return_sequences': 1,
            # runtime
            'device': 'cuda',
            'seed': 42,
        }
        # Assign in declaration order so the instance's attribute order is stable.
        for attribute, value in defaults.items():
            setattr(self, attribute, value)

### Useful functions

In [None]:
import tensorflow_text
import tensorflow_hub as hub

def generate_sequences(prompt_text, args, delimiter='>>>'):
    """Sample continuations of a prompt and return only the newly generated text.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt_text: The prompt itself, or a path ending in ``.txt`` whose
            contents are read and used as the prompt.
        args: Settings object providing ``device``, ``length``, ``temperature``,
            ``k``, ``p``, ``repetition_penalty``, ``num_return_sequences`` and
            ``stop_token``. Its ``prompt_text`` attribute is overwritten with
            the value passed in (before any file is read).
        delimiter: Separator between examples. If it occurs in a continuation,
            everything from its first occurrence on is discarded; otherwise
            only the first line of the continuation is kept.

    Returns:
        A list with one string per generated sequence: the continuation with
        the prompt removed, cut at the stop token / delimiter / first newline,
        and right-stripped.
    """
    args.prompt_text = prompt_text

    if prompt_text.endswith('.txt'):
        # Read explicitly as UTF-8 so non-ASCII prompts do not depend on the
        # platform's default encoding.
        with open(prompt_text, 'r', encoding='utf-8') as f:
            prompt_text = f.read()

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    encoded_prompt = encoded_prompt.to(args.device)

    output_sequences = model.generate(
        input_ids=encoded_prompt,
        max_length=args.length + len(encoded_prompt[0]),
        temperature=args.temperature,
        top_k=args.k,
        top_p=args.p,
        repetition_penalty=args.repetition_penalty,
        do_sample=True,
        num_return_sequences=args.num_return_sequences,
    )

    # Drop singleton dimensions if the model returned more than two axes.
    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    # The prompt is removed by character count of its decoded form; this is
    # the same for every sequence, so compute it once.
    prompt_length = len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True))

    generated_sequences = []
    for generated_sequence in output_sequences:
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Cut at the stop token only when it is actually present: str.find
        # returns -1 when it is missing, and slicing with -1 would silently
        # drop the last character of the text.
        if args.stop_token:
            stop_index = text.find(args.stop_token)
            if stop_index != -1:
                text = text[:stop_index]

        text = text[prompt_length:]

        if delimiter in text:
            text = text.split(delimiter)[0].rstrip()
        else:
            text = text.split('\n')[0].rstrip()

        generated_sequences.append(text)

    return generated_sequences


def compute_use(target_comment, generated_comments):
    """Score each generated comment against the target via embedding inner product.

    Uses the module-level ``embed`` callable (the sentence encoder).

    Args:
        target_comment: Reference sentence (a single string).
        generated_comments: Iterable of candidates. Each item is normally a
            string; an item that is already a list of strings is passed to
            ``embed`` unchanged, as before.

    Returns:
        A list with one score per candidate, in input order: the inner product
        of the target embedding with the candidate's (first) embedding.
    """
    target_embedding = embed([target_comment])

    scores = []
    for comment in generated_comments:
        # Feed the encoder a list of sentences, exactly as is done for the
        # target above, instead of a bare string.
        batch = [comment] if isinstance(comment, str) else comment
        scores.append(np.inner(target_embedding, embed(batch))[0][0])
    return scores


def compare_results(source_comment, target_comment, generated_comments, scores):
    """Print the candidates ranked by score and return the lowest-scored one.

    Args:
        source_comment: Original comment, printed under the label "Toxic".
        target_comment: Reference comment, printed under the label "Polite".
        generated_comments: Candidate comments, indexable by position.
        scores: One score per candidate, aligned with ``generated_comments``.

    Returns:
        A tuple ``(comment, index)`` for the candidate with the smallest score.
        NOTE(review): if higher scores mean "better", callers may expect the
        highest-scored candidate instead — confirm the intended choice.
    """
    print(f'Toxic : {source_comment}')
    print(f'Polite: {target_comment}\n')
    print('Score  Generated Comment')

    # Ascending order: lowest score first, highest score last.
    ranking = np.argsort(scores)
    for position in ranking:
        print(np.round(scores[position], 3), generated_comments[position])

    lowest = ranking[0]
    return generated_comments[lowest], lowest


# Load the multilingual Universal Sentence Encoder (version 3) from TF Hub.
# compute_use() reads this module-level `embed` to turn sentences into vectors.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

06/16/2021 06:36:35 - INFO - absl -   Using /tmp/tfhub_modules to cache modules.
06/16/2021 06:36:35 - INFO - absl -   Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'.
06/16/2021 06:36:38 - INFO - absl -   Downloaded https://tfhub.dev/google/universal-sentence-encoder-multilingual/3, Total size: 266.88MB
06/16/2021 06:36:38 - INFO - absl -   Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'.


### Download weights & unzip 

In [None]:
!gdown --id 1RYUku5_MWXZF2xlIpOTZmi9_DH-SG0lz && mkdir rugpt3_large_200 && unzip rugpt3_large_200.zip -d rugpt3_large_200

Downloading...
From: https://drive.google.com/uc?id=1RYUku5_MWXZF2xlIpOTZmi9_DH-SG0lz
To: /content/rugpt3_large_200.zip
1.83GB [00:15, 118MB/s] 
Archive:  rugpt3_large_200.zip
 extracting: rugpt3_large_200/training_args.bin  
 extracting: rugpt3_large_200/config.json  
 extracting: rugpt3_large_200/tokenizer_config.json  
 extracting: rugpt3_large_200/merges.txt  
 extracting: rugpt3_large_200/pytorch_model.bin  
 extracting: rugpt3_large_200/special_tokens_map.json  
 extracting: rugpt3_large_200/vocab.json  
   creating: rugpt3_large_200/.ipynb_checkpoints/


### Initialize model

In [None]:
# Start from the default settings, then point at the locally unpacked checkpoint directory.
args = Args()
args.model_name_or_path = 'rugpt3_large_200'
# MODEL_CLASSES is presumably provided by the star-import of generate_transformers;
# the unpacking order implies each entry is (model class, tokenizer class) — TODO confirm.
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
# `tokenizer` and `model` are module-level names that generate_sequences() relies on.
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
model = model_class.from_pretrained(args.model_name_or_path)
# Move the model to the configured device ('cuda' unless args.device was changed).
model.to(args.device)
# Hide the loading output produced above.
clear_output()

In [None]:
# Input sentence; the generation cell below appends ' >>> ' to it and uses it as the prompt.
text = 'Ты дурак и ничего не понимаешь. Что значит по-твоему построить дорогу?'

In [None]:
from tqdm import tqdm 
import re

# One list of generated candidates is appended per prompt.
results = []

# parameters
# These assignments overwrite the defaults on the shared `args` object in place.
args.num_return_sequences = 10
args.k = 3
args.p = .5
args.temperature = 10
# here text stands for your sentence
# NOTE(review): len(text) counts characters, whereas generate_sequences adds
# args.length to the prompt's token count to form max_length — confirm the intended unit.
args.length = len(text) + 5


# Prompt = the sentence followed by the ' >>> ' separator (generate_sequences' default delimiter is '>>>').
generated_sequences = generate_sequences(text + ' >>> ', args)
# Remove literal '<pad>' markers from each candidate before storing it.
results.append([re.sub('<pad>', '', x) for x in generated_sequences])





In [None]:
# Show the candidate at index 2 for the first prompt, cut to at most args.length characters.
print(results[0][2][:args.length])

 Вы глупец и ничего не понимаете. Что значит по-вашему построить дорогу?
