# Main model run

### Basic setups

In [1]:
!pip install transformers
!pip install spacy
!python -m spacy download en_core_web_md
!pip install annoy
!pip install sentence_transformers
!pip install evaluate

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
# visualization and data-handling libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

# pytorch library
import torch # the main pytorch library
import torch.nn.functional as f # the sub-library containing different functions for manipulating with tensors

# huggingface's transformers library
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

from annoy import AnnoyIndex

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pandas as pd
import zipfile
from urllib.request import urlretrieve

from nltk.tokenize import word_tokenize
import string
import tqdm
import pickle
import gc

In [3]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string

import nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
from transformers import T5ForConditionalGeneration,T5Tokenizer
import sentencepiece

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from transformers import  AdamW

#to avoid warnings
import warnings
warnings.filterwarnings('ignore')


In [6]:
def set_seed(seed):
  """Seed every random number generator used in this notebook.

  Applies ``seed`` to Python's ``random`` module, NumPy's global generator
  and PyTorch; when CUDA is available all CUDA devices are seeded as well.
  """
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)


### Download data


In [10]:
# Loading the zip file and extracting a zip object
with zipfile.ZipFile("main_model_train.zip", 'r') as zip_file:
    zip_file.extract("main_model_train/main_model_train.csv")

# with zipfile.ZipFile("word_embeddings.zip", 'r') as zip_file:
#     zip_file.extract_all("word_embeddings/word_embeddings.pkl")

In [11]:
# Read files
train_df = pd.read_csv("main_model_train/main_model_train.csv", index_col=0)
eval_df = pd.read_csv("main_model_eval.csv", index_col=0)
test_df = pd.read_csv("main_model_test.csv", index_col=0)

train_df

Unnamed: 0,input_text,target_text
0,"I knew mushrooms, and I don't give a shit. """,I didn't know and I don't much care.”
1,Keep on lying like that. You're liable to get ...,you can keep lying like that and you'll be abl...
2,I was probably worth killing then.,I probably stood up for the assassination then.
3,It's bad enough we scared the shit out of them...,we already scared them off with the T-shirt wi...
4,"wake up, you corpses!",Heave and wake the dead!
...,...,...
462216,"Look, suck it, dude.","look, bite it, man."
462217,And if I have to look at that stupid picture o...,and if I have to take another look at the phot...
462218,I miss working with her because she's so godda...,I miss working with her because she does damn ...
462219,"Wow, this place is fuckin' incredible.","wow, this is incredible."


In [12]:
# # Load words embeddings
# with open('word_embeddings.pkl', "rb") as fIn:
#     word_embeddings = pickle.load(fIn)

# words = word_embeddings['words']
# embeddings = word_embeddings['embedding']

# # Free memory
# word_embeddings = None
# gc.collect()

## Synonyms finding

Given a word, `get_synonyms` embeds it and returns the top-k closest candidate synonyms (ranked by cosine similarity of the embeddings). Candidates are retrieved from an AnnoyIndex built over the embeddings of the most frequent words.

In [None]:
def get_vector_index(embeddings, start=0, n_trees=37, index_path='annoy_index.ann'):
    """Build, save and return an Annoy index over ``embeddings``.

    Args:
        embeddings: 2-D array-like with a ``shape`` attribute; row ``j`` is
            stored under the item id ``start + j``.
        start: item id assigned to the first row.
        n_trees: number of trees passed to ``AnnoyIndex.build``.
        index_path: file the built index is saved to.

    Returns:
        The built ``AnnoyIndex`` (angular metric).

    All-zero rows are not indexed because zero embeddings are given to
    "unknown" words and phrases. Rows that cannot be added are skipped
    (best effort), but the number of skipped rows is reported instead of
    being silently swallowed.
    """
    # Initialize index
    annoy3 = AnnoyIndex(embeddings.shape[1], 'angular')

    skipped = 0
    first_error = None
    for i, embedding in enumerate(tqdm.tqdm(embeddings), start=start):
        try:
            # Add only non-zero embeddings to the index
            if np.sum(np.abs(embedding)) != 0:
                annoy3.add_item(i, embedding)
        except Exception as err:  # keep best-effort behaviour, but do not hide it
            skipped += 1
            if first_error is None:
                first_error = err

    if skipped:
        print(f"Skipped {skipped} embeddings that could not be indexed (first error: {first_error!r})")

    annoy3.build(n_trees)
    print("Index is constructed")
    annoy3.save(index_path)

    # Return resulting index
    return annoy3


def get_kNN_embeddings(embedding, k, index):
    """Look up the ``k`` nearest neighbours of ``embedding`` in ``index``.

    Delegates to ``index.get_nns_by_vector`` and returns its result unchanged.
    """
    nearest_ids = index.get_nns_by_vector(embedding, k)
    return nearest_ids

In [None]:
def preprocess(sent):
    """
    Split a sentence into cleaned words.

    The sentence is tokenized with ``word_tokenize``; every token has its
    punctuation characters removed, is stripped of surrounding whitespace
    and lowercased. Tokens that end up empty (e.g. pure punctuation) are
    dropped. Stop words are kept.

    Return list of preprocessed words from the sent, or an empty list when
    tokenization fails (a message is printed in that case).
    """
    try:
        tokens = word_tokenize(sent)
    except Exception:
        print(f"\nTokenization fails for {sent}")
        return []

    # Built once: deletes every character listed in string.punctuation
    strip_punctuation = str.maketrans("", "", string.punctuation)

    res = []
    for token in tokens:
        # Punctuation must be removed from the token itself; removing it from
        # the whole sentence inside the loop has no effect on the result.
        word = token.translate(strip_punctuation).strip().lower()

        # Ignore tokens that became empty
        if word:
            res.append(word)

    # Return list of preprocessed words from the sent
    return res

In [None]:
model_similarity = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # multi-language model

def embed(text):
    """Encode one text with the notebook-level ``model_similarity`` encoder.

    The text is encoded as a single-element batch (``convert_to_tensor=False``)
    and the first, and only, row of the result is returned.
    """
    batch_vectors = model_similarity.encode([text], convert_to_tensor=False)
    return batch_vectors[0]


In [None]:
def get_synonyms(word, k=1000):
    """Return the ``k`` vocabulary entries closest to ``word`` in embedding space.

    The word is embedded with ``embed``, its nearest neighbours are looked up
    in the notebook-level ``index`` and the matching entries of the
    notebook-level ``words`` collection are returned.
    """
    # NOTE(review): indexing ``words`` with a list of ids presumably relies on
    # ``words`` being a NumPy array - confirm where it is loaded.
    query_vector = embed(word)
    neighbour_ids = get_kNN_embeddings(query_vector, k, index)
    return words[neighbour_ids]


## Paraphraser

In [21]:
paraphraser = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_paraphraser')
paraphrase_tokenizer = T5Tokenizer.from_pretrained('t5-base')
paraphraser = paraphraser.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
def paraphrase_sent(input_sentence, max_len=256):
    """Generate one sampled paraphrase of ``input_sentence``.

    Uses the notebook-level ``paraphrase_tokenizer``, ``paraphraser`` and
    ``device``.

    Args:
        input_sentence: text to paraphrase.
        max_len: upper bound (in tokens) for both the encoded input and the
            generated output.

    Returns:
        The decoded paraphrase as a string (special tokens removed).
    """
    text = "paraphrase: " + input_sentence + " "

    # A single sentence needs no padding; truncate explicitly instead of
    # relying on the deprecated ``pad_to_max_length`` flag.
    encoding = paraphrase_tokenizer(
        text, max_length=max_len, truncation=True, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    outputs = paraphraser.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=max_len,
        top_k=120,
        top_p=0.98,
        early_stopping=True,
        # Only the first sequence is returned, so sample just one
        num_return_sequences=1
    )

    return paraphrase_tokenizer.decode(
        outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [25]:
paraphrase_sent("I'll fucking hate you, stupid nigger")



"I'll fucking hate you, idiot nigger. I'll fucking hate you, I'll Fucking in hate you."

## Sentence Similarity

In [27]:
model_similar = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

Downloading (…)c49cd/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)fc6f7c49cd/README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

Downloading (…)6f7c49cd/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)f7c49cd/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [28]:
def sentence_similarity(sent1, sent2):
    """Cosine similarity between the embeddings of two sentences.

    Both sentences are encoded in one batch with the notebook-level
    ``model_similar`` encoder; the pairwise cosine matrix of that batch is
    computed with ``util.cos_sim`` and its ``[0][1]`` entry (first sentence
    vs. second sentence) is returned as a Python number.
    """
    pair_embeddings = model_similar.encode([sent1, sent2], convert_to_tensor=False)
    similarity_matrix = util.cos_sim(pair_embeddings, pair_embeddings)
    return similarity_matrix[0][1].item()


# Example
sentence_similarity("I'll fucking hate you", "I'll really hate you")

0.9447494745254517

## Toxicity score

In [13]:
def make_set(df):
    """Turn a parallel corpus frame into a shuffled classification set.

    Every ``input_text`` entry is labelled 1.0 and every ``target_text``
    entry is labelled 0.0; the combined rows are shuffled with
    ``DataFrame.sample(frac=1)``.

    Returns:
        A pair ``(texts, labels)`` of NumPy arrays in the same shuffled order.
    """
    positive_texts = list(df['input_text'])
    negative_texts = list(df['target_text'])

    combined = pd.DataFrame({
        "texts": positive_texts + negative_texts,
        "labels": list(np.ones(len(positive_texts))) + list(np.zeros(len(negative_texts))),
    })
    shuffled = combined.sample(frac=1).reset_index(drop=True)

    return shuffled["texts"].to_numpy(), shuffled["labels"].to_numpy()

X_train, y_train = make_set(train_df[:150000])
X_eval, y_eval = make_set(eval_df[:20000])
X_test, y_test = make_set(test_df[:20000])

In [54]:
# test_df.shape[0] * 2 ==  X_test.shape[0]

True

In [14]:
train_df = None
eval_df = None
test_df = None
gc.collect()

0

In [15]:
# Token and Encode Function
def tokenize_and_encode(tokenizer, comments, labels, max_length=128):
    """Encode ``comments`` into fixed-length tensors and convert ``labels``.

    Args:
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors, one row per call.
        comments: iterable of texts to encode.
        labels: label values, converted to a ``float32`` tensor.
        max_length: every comment is truncated or padded to this many tokens.

    Returns:
        ``(input_ids, attention_masks, labels)`` as PyTorch tensors; the first
        two are the per-comment rows concatenated along dimension 0.
    """
    input_ids = []
    attention_masks = []

    for comment in comments:
        encoded_dict = tokenizer.encode_plus(
            comment,
            # Add special tokens like [CLS] and [SEP]
            add_special_tokens=True,
            max_length=max_length,
            # Explicit truncation/padding: ``pad_to_max_length`` is deprecated
            # and leaving truncation implicit triggers a tokenizer warning.
            truncation=True,
            padding='max_length',
            # Return attention mask to mask padded tokens
            return_attention_mask=True,
            # Return PyTorch tensors
            return_tensors='pt'
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Stack the per-comment rows into single tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Labels are kept as float32
    labels = torch.tensor(labels, dtype=torch.float32)

    return input_ids, attention_masks, labels


In [16]:
# Token Initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# Model Initialization
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=1)
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Tokenize and Encode the comments and labels for the training set
input_ids, attention_masks, labels = tokenize_and_encode(
    tokenizer,
    X_train,
    y_train
)

# Tokenize and Encode the comments and labels for the test set
test_input_ids, test_attention_masks, test_labels = tokenize_and_encode(
    tokenizer,
    X_test,
    y_test
)

# Tokenize and Encode the comments and labels for the validation set
val_input_ids, val_attention_masks, val_labels = tokenize_and_encode(
    tokenizer,
    X_eval,
    y_eval
)


print('Training Comments :',X_train.shape)
print('Input Ids         :',input_ids.shape)
print('Attention Mask    :',attention_masks.shape)
print('Labels            :',labels.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training Comments : (300000,)
Input Ids         : torch.Size([300000, 128])
Attention Mask    : torch.Size([300000, 128])
Labels            : torch.Size([300000])


In [18]:
X_train, y_train = None, None
X_eval, y_eval =  None, None
X_test, y_test =  None, None
gc.collect()

0

In [19]:

from torch.utils.data import DataLoader, TensorDataset

# Creating DataLoader for the balanced dataset
batch_size = 16
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# testing set
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# validation set
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


print('Batch Size :',train_loader.batch_size)
Batch =next(iter(train_loader))
print('Each Input ids shape :',Batch[0].shape)
print('Input ids :\n',Batch[0][0])
print('Corresponding Decoded text:\n',tokenizer.decode(Batch[0][0]))
print('Corresponding Attention Mask :\n',Batch[1][0])
print('Corresponding Label:',Batch[2][0])

Batch Size : 16
Each Input ids shape : torch.Size([16, 128])
Input ids :
 tensor([ 101, 3531, 2123, 1005, 1056, 5607, 2033, 1012,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])
Corresponding Decoded text:
 [CLS] please don't shoot me. [SEP] [PAD]

In [20]:
input_ids, attention_masks, labels = None, None, None
test_input_ids, test_attention_masks, test_labels = None, None, None
val_input_ids, val_attention_masks, val_labels = None, None, None
gc.collect()

0

In [21]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [22]:
# Function to Train the Model
def train_model(model, train_loader, optimizer, device, num_epochs, val_loader=None):
    """Train ``model`` and print the mean training/validation loss per epoch.

    Args:
        model: callable as ``model(input_ids, attention_mask=..., labels=...)``
            returning an object with a ``loss`` attribute.
        train_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: optimizer updating ``model``'s parameters.
        device: device every batch tensor is moved to.
        num_epochs: number of passes over ``train_loader``.
        val_loader: optional iterable of validation batches. When omitted, a
            notebook-level ``val_loader`` is used if one exists; otherwise
            validation is skipped.
    """
    if val_loader is None:
        # Backward compatibility: earlier calls relied on the notebook-level
        # ``val_loader`` without passing it explicitly.
        val_loader = globals().get("val_loader")

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for batch in tqdm.tqdm(train_loader):
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            optimizer.zero_grad()

            outputs = model(
                input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader
        train_loss = total_loss / max(len(train_loader), 1)

        if val_loader is None:
            print(f'Epoch {epoch+1}, Training Loss: {train_loss}')
            continue

        model.eval()
        val_loss = 0

        # No gradients are needed while validating
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [
                    t.to(device) for t in batch]

                outputs = model(
                    input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

        print(
            f'Epoch {epoch+1}, Training Loss: {train_loss},Validation loss:{val_loss/max(len(val_loader), 1)}')

# Evaluate the Model
def evaluate_model(model, test_loader, device):
    """Print accuracy, precision and recall of a binary classifier.

    Args:
        model: callable as ``model(input_ids, attention_mask=...)`` returning
            an object with a ``logits`` attribute.
        test_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: device every batch tensor is moved to.

    The logits go through a sigmoid and are thresholded at 0.5. Predictions
    and labels are flattened to 1-D so they line up element by element, and
    precision/recall are computed for the positive class
    (``average='binary'``); with ``average='micro'`` both would simply equal
    the accuracy on a binary task.
    """
    # NOTE(review): the notebook builds the classifier with num_labels=1,
    # which Hugging Face presumably trains as a regression (MSE) head; then
    # sigmoid(logit) > 0.5 thresholds the raw output at 0, not 0.5 - confirm
    # the intended decision rule.
    model.eval()

    true_labels = []
    predicted_probs = []

    with torch.no_grad():
        for batch in tqdm.tqdm(test_loader):
            input_ids, attention_mask, labels = [t.to(device) for t in batch]

            outputs = model(input_ids, attention_mask=attention_mask)
            predicted_probs.append(torch.sigmoid(outputs.logits).cpu().numpy())
            true_labels.append(labels.cpu().numpy())

    # One flat vector each, so (N, 1) scores match (N,) labels
    true_labels = np.concatenate(true_labels, axis=0).reshape(-1).astype(int)
    predicted_probs = np.concatenate(predicted_probs, axis=0).reshape(-1)
    predicted_labels = (predicted_probs > 0.5).astype(int)

    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')


In [None]:
# Call the function to train the model
train_model(model, train_loader, optimizer, device, num_epochs=1)


In [None]:
# Call the function to evaluate the model on the test data
evaluate_model(model, test_loader, device)


In [None]:
# Save the tokenizer and model in the same directory
output_dir = "Saved_model"
# Save model's state dictionary and configuration
model.save_pretrained(output_dir)
# Save tokenizer's configuration and vocabulary
tokenizer.save_pretrained(output_dir)

In [None]:
def toxisity(text):
    """Classify a single text with the notebook-level toxicity ``model``.

    The text is tokenized as a one-element batch with the notebook-level
    ``tokenizer``, run through ``model`` on ``device`` without gradients, and
    the sigmoid of the logits is thresholded at 0.5.

    Returns:
        The first row of the thresholded predictions as an integer NumPy
        array (1 where the score exceeds 0.5, otherwise 0).
    """
    encoded = tokenizer(
        [text], truncation=True, padding=True, return_tensors="pt")
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        scores = torch.sigmoid(model(ids, attention_mask=mask).logits)

    return (scores.cpu().numpy() > 0.5).astype(int)[0]

# Example
print(toxisity("fucking"))

### Iterate sentences

In [None]:
def process_sentence(sent, max_synonyms=3):
    """Replace toxic words in ``sent`` with non-toxic synonyms, then paraphrase.

    Args:
        sent: input sentence.
        max_synonyms: how many non-toxic candidate synonyms are compared for
            each toxic word.

    Returns:
        The output of ``paraphrase_sent`` for the detoxified sentence.

    Each word from ``preprocess`` is scored with ``toxisity``. Non-toxic
    words are kept. For a toxic word, candidates from ``get_synonyms`` are
    scanned in order and the non-toxic one giving the highest
    ``sentence_similarity`` to the original sentence is used. A toxic word
    with no acceptable candidate is dropped rather than replaced by an empty
    string, so the paraphraser input contains no stray spaces.
    """
    # Named ``tokens`` to avoid confusion with the notebook-level ``words``
    # vocabulary that ``get_synonyms`` indexes.
    tokens = preprocess(sent)
    result = []
    for ind, word in enumerate(tokens):
        if toxisity(word) <= 0.5:
            result.append(word)
            continue

        best_synonym = ""
        best_simil_score = -1
        # Number of non-toxic candidates analyzed so far
        counter = 0
        for synonym in get_synonyms(word):
            if toxisity(synonym) <= 0.5:
                # Candidate sentence: the tokens with the toxic word swapped
                potential_sentence = tokens.copy()
                potential_sentence[ind] = synonym
                potential_similarity = sentence_similarity(sent, " ".join(potential_sentence))

                # Better synonym in the context
                if potential_similarity > best_simil_score:
                    best_simil_score = potential_similarity
                    best_synonym = synonym
                    print(f"Better synonym for {word} is {synonym}")

                counter += 1

            if counter >= max_synonyms:
                # Analyze only top max_synonyms
                break

        # Drop the toxic word entirely when nothing suitable was found
        if best_synonym:
            result.append(best_synonym)

    detoxified = " ".join(result)
    print("Before paraphrasing:", detoxified)
    # Paraphrase
    return paraphrase_sent(detoxified)



### Test first results

In [None]:
# test_df was set to None earlier in the notebook to free memory, so it is
# reloaded here before the third example is taken from the test split.
test_df = pd.read_csv("main_model_test.csv", index_col=0)

arrr_s = ["oh shut up, goddamn", "I'll fucking hate you, stupid idiot", test_df["input_text"].iloc[2] ]
for i in arrr_s:
    print(process_sentence(i))
    print()

### Evaluation

In [None]:
!pip install evaluate

In [None]:
from evaluate import load
bertscore = load("bertscore")

In [None]:
# test_df was set to None earlier in the notebook to free memory; reload it.
test_df = pd.read_csv("main_model_test.csv", index_col=0)
eval_rows = test_df[:10000]

final_score = 0
# iterrows() yields (label, row) pairs; iterating the frame itself would only
# yield column names. The loop variable is deliberately not called ``index``
# because get_synonyms reads a notebook-level variable with that name.
for row_id, row in eval_rows.iterrows():
    print("input text: ", row['input_text'])
    print("Target text: ", row['target_text'])
    pred = process_sentence(row['input_text'])
    score = bertscore.compute(predictions=[pred], references=[row['target_text']], lang="en")
    # One prediction per call, so each metric list has a single entry
    final_score += score["f1"][0]
    print("Predictions: ", pred)

# Mean BERTScore F1 over the evaluated rows
final_score /= max(len(eval_rows), 1)
print("Mean BERTScore F1: ", final_score)