In [None]:
import pandas as pd 
import numpy as np
import os
import sys 
import torch
import transformers
import nltk
import random
from tqdm import tqdm
# import spacy
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
def iterative_paraphrasing(data, sentiments, num_beam=2, iterations=2, batch_size=8):
    """Augment ``data`` with paraphrases from the ``tuner007/pegasus_paraphrase`` model.

    The data is processed in batches. For every batch the output receives, in order:

    1. the original sentences,
    2. the result of every chained paraphrasing round except the first one
       (each round paraphrases the previous round's result; the first round's
       result is only used as input for the second),
    3. ``num_beam`` direct paraphrases of each original sentence.

    Args:
        data: sliceable sequence of sentences (a list of ``str`` in this notebook).
        sentiments: labels aligned with ``data``.
        num_beam: beam width and number of returned sequences for the direct
            paraphrases of the original sentences.
        iterations: number of chained paraphrasing rounds.
        batch_size: number of sentences sent to the model per call.

    Returns:
        Tuple ``(texts, labels)`` of two lists holding the originals plus all
        generated paraphrases and their labels.
    """
    from transformers import PegasusForConditionalGeneration, PegasusTokenizer

    checkpoint = 'tuner007/pegasus_paraphrase'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(checkpoint)
    model = PegasusForConditionalGeneration.from_pretrained(checkpoint).to(device)

    def paraphrase(sentences, beams):
        # Tokenise, generate `beams` candidates per sentence and decode them to text.
        encoded = tokenizer(sentences, truncation=True, padding='longest', return_tensors="pt").to(device)
        generated = model.generate(**encoded, num_beams=beams, num_return_sequences=beams, temperature=1.5)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    augmented_texts, augmented_labels = [], []
    total_batches = int(np.ceil(len(data) / batch_size))

    for batch_idx in tqdm(range(total_batches)):
        start, stop = batch_idx * batch_size, (batch_idx + 1) * batch_size
        originals = data[start:stop]
        labels = sentiments[start:stop]

        augmented_texts.extend(originals)
        augmented_labels.extend(labels)

        # Chained rounds: always 3 candidates per sentence, of which the third
        # one (positions 2, 5, 8, ...) is carried over to the next round.
        current = originals
        for round_idx in range(iterations):
            candidates = paraphrase(current, 3)
            current = candidates[2::3]
            if round_idx > 0:
                augmented_texts.extend(current)
                augmented_labels.extend(labels)

        # Direct paraphrases: `num_beam` outputs per original sentence, so each
        # label is repeated `num_beam` times to stay aligned.
        direct = paraphrase(originals, num_beam)
        augmented_labels.extend(
            labels[pos] for pos in range(len(labels)) for _ in range(num_beam)
        )
        augmented_texts.extend(direct)

    return augmented_texts, augmented_labels

In [None]:
def iterative_backtranslation(data, sentiments, iterations=3, batch_size=8):
    """Augment ``data`` by translating English -> German -> English repeatedly.

    Each batch is sent through ``iterations`` round trips using the
    ``Helsinki-NLP/opus-mt-en-de`` and ``Helsinki-NLP/opus-mt-de-en`` models;
    every round starts from the previous round's English output. Only the
    result of the last round is added to the output, after the originals.

    Args:
        data: sliceable sequence of English sentences (a list of ``str`` in this notebook).
        sentiments: labels aligned with ``data``.
        iterations: number of round trips. With a value below 1 nothing is
            generated and the inputs are returned unchanged (as new lists).
        batch_size: number of sentences sent to the models per call.

    Returns:
        Tuple ``(texts, labels)`` of two lists: per batch, the original
        sentences followed by their back-translated versions.
    """
    # Without at least one round trip there is no translated text to append;
    # return early instead of loading two models and failing afterwards.
    if iterations < 1:
        return list(data), list(sentiments)

    from transformers import MarianMTModel, MarianTokenizer

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    en_to_de_model_name = 'Helsinki-NLP/opus-mt-en-de'
    en_to_de_tokenizer = MarianTokenizer.from_pretrained(en_to_de_model_name)
    en_to_de_model = MarianMTModel.from_pretrained(en_to_de_model_name).to(device)

    de_to_en_model_name = 'Helsinki-NLP/opus-mt-de-en'
    de_to_en_tokenizer = MarianTokenizer.from_pretrained(de_to_en_model_name)
    de_to_en_model = MarianMTModel.from_pretrained(de_to_en_model_name).to(device)

    def translate(texts, tokenizer, model):
        # Tokenise, generate and decode one batch with the given model pair.
        encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
        generated = model.generate(**encoded)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    final_data = []
    final_sentiments = []

    num_batches = int(np.ceil(len(data) / batch_size))

    for i in tqdm(range(num_batches)):
        batch = data[i*batch_size: (i+1)*batch_size]
        sentiments_batch = sentiments[i*batch_size: (i+1)*batch_size]

        final_data.extend(batch)
        final_sentiments.extend(sentiments_batch)

        src_text = batch
        for _ in range(iterations):
            german_text = translate(src_text, en_to_de_tokenizer, en_to_de_model)
            src_text = translate(german_text, de_to_en_tokenizer, de_to_en_model)

        # `src_text` now holds the English output of the last round trip.
        final_data.extend(src_text)
        final_sentiments.extend(sentiments_batch)

    return final_data, final_sentiments

In [None]:
def para_to_interative_back(
    data,
    y,
    para_iteration=1,
    num_beam=2,
    back_trans_iter=1,
    batch_size=32,
):
    """Chain both generators: paraphrase first, then back-translate the result.

    Args:
        data: sentences to augment.
        y: labels aligned with ``data``.
        para_iteration: ``iterations`` passed to ``iterative_paraphrasing``.
        num_beam: ``num_beam`` passed to ``iterative_paraphrasing``.
        back_trans_iter: ``iterations`` passed to ``iterative_backtranslation``.
        batch_size: batch size used by both stages.

    Returns:
        Tuple ``(texts, labels)`` as returned by ``iterative_backtranslation``
        when applied to the paraphrasing output.
    """
    paraphrased_texts, paraphrased_labels = iterative_paraphrasing(
        data,
        y,
        num_beam=num_beam,
        iterations=para_iteration,
        batch_size=batch_size,
    )
    augmented_texts, augmented_labels = iterative_backtranslation(
        paraphrased_texts,
        paraphrased_labels,
        iterations=back_trans_iter,
        batch_size=batch_size,
    )

    return augmented_texts, augmented_labels

In [None]:
def add_synonms(data, sentiments, num_words_replaced=2, sentence_repeat=2):
    """Augment sentences by swapping randomly chosen tokens for a WordNet lemma.

    Every sentence is tokenised with NLTK's ``TreebankWordTokenizer`` and always
    kept (re-joined with single spaces). Then, ``sentence_repeat`` times,
    ``num_words_replaced`` token positions are drawn at random (with
    replacement) and each drawn token is replaced by the first lemma of its
    first WordNet synset, provided that lemma differs from the token. A variant
    is only added when at least one token actually changed.

    Args:
        data: iterable of sentences (``str``).
        sentiments: labels aligned with ``data``; must support ``len()``.
        num_words_replaced: number of token positions drawn per variant.
        sentence_repeat: number of variants attempted per sentence.

    Returns:
        Tuple ``(texts, labels)`` of two lists with the originals and the
        generated variants.
    """
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    final_data = []
    final_sentiments = []
    for sentence, sentiment in tqdm(zip(data, sentiments), total=len(sentiments)):
        tokens = tokenizer.tokenize(sentence)
        final_data.append(tokens)
        final_sentiments.append(sentiment)

        # A sentence without tokens has nothing to replace, and
        # np.random.randint(0, 0, ...) would raise ValueError (low >= high).
        if not tokens:
            continue

        for _ in range(sentence_repeat):
            variant = tokens.copy()
            positions = np.random.randint(0, len(tokens), num_words_replaced)
            for idx in positions:
                root_word = tokens[idx]
                synsets = wordnet.synsets(root_word)
                if not synsets:
                    continue
                lemmas = synsets[0].lemmas()
                # NOTE(review): only the very first lemma is considered; when it
                # equals the token itself no replacement happens for this position.
                if lemmas and lemmas[0].name() != root_word:
                    variant[idx] = lemmas[0].name()
            if variant != tokens:
                final_data.append(variant)
                final_sentiments.append(sentiment)

    final_data = [" ".join(tokens) for tokens in final_data]

    return final_data, final_sentiments

In [None]:
def adding_random_words(data, classes):
    """Augment every sentence by inserting one random word seen in its own class.

    A first pass tokenises all sentences with NLTK's ``TreebankWordTokenizer``
    and collects the set of tokens per class. A second pass emits, for each
    sentence, the re-joined original followed by a copy with one word from the
    class vocabulary inserted at a random position.

    Args:
        data: sequence of sentences (``str``); must support ``len()``.
        classes: sequence of hashable class labels aligned with ``data``.

    Returns:
        Tuple ``(texts, labels)`` of two lists. A sentence whose class has an
        empty vocabulary contributes only its original.
    """
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    vocab_per_class = {}
    all_data_tokens = []
    for text, class_ in tqdm(zip(data, classes), total=len(data)):
        tokens = tokenizer.tokenize(text)
        all_data_tokens.append(tokens)
        vocab_per_class.setdefault(class_, set()).update(tokens)

    # Sort so that, for a fixed `random` seed, the drawn words do not depend on
    # set iteration order.
    vocab_per_class = {class_: sorted(words) for class_, words in vocab_per_class.items()}

    final_data = []
    final_classes = []
    for tokens, class_ in tqdm(zip(all_data_tokens, classes), total=len(classes)):
        final_data.append(" ".join(tokens))
        final_classes.append(class_)

        vocabulary = vocab_per_class[class_]
        if not vocabulary:
            continue

        # Draw from the whole class vocabulary. The index must be bounded by the
        # vocabulary size, not by the number of samples in the class.
        new_word = random.choice(vocabulary)
        # randint is inclusive on both ends; len(tokens) means "append at the end".
        insert_idx = random.randint(0, len(tokens))
        augmented = tokens.copy()
        augmented.insert(insert_idx, new_word)

        final_data.append(" ".join(augmented))
        final_classes.append(class_)

    return final_data, final_classes
        

In [None]:
def preprocessing(train_x_, train_y_, preprocessing_type, preprocessing_args):
    """Run one augmentation function on the given training data.

    Args:
        train_x_: training sentences, passed as the first positional argument.
        train_y_: training labels, passed as the second positional argument.
        preprocessing_type: callable ``f(data, labels, *extra)`` returning
            ``(new_data, new_labels)``.
        preprocessing_args: argument template whose first two entries are
            placeholders for data and labels; the remaining entries are
            forwarded unchanged. The list itself is not modified.

    Returns:
        Tuple ``(new_data, new_labels)`` produced by ``preprocessing_type``.
    """
    # Build a fresh argument list from the function's own parameters, so the
    # call neither depends on a global `train_y` nor leaves references to the
    # training data inside the caller's shared template.
    call_args = [train_x_, train_y_, *preprocessing_args[2:]]
    train_x_, train_y_ = preprocessing_type(*call_args)
    return train_x_, train_y_

In [None]:
def training_and_testing(train_x, train_y, test_x, test_y, classifier, preprocessing_name):
    """Fit ``classifier`` on the training split and print its test accuracy.

    Args:
        train_x, train_y: features and labels used for ``classifier.fit``.
        test_x, test_y: features and labels used for evaluation.
        classifier: estimator exposing ``fit`` and ``predict``; it is fitted in place.
        preprocessing_name: label shown in the printed report line.

    Returns:
        None. The accuracy (in percent) is printed, not returned.
    """
    classifier.fit(train_x, train_y)
    predicted_labels = classifier.predict(test_x)
    accuracy_pct = accuracy_score(predicted_labels, test_y)*100
    print(f"Preprocessing Type = {preprocessing_name} | Testing Accuracy = {accuracy_pct}%")
    

In [None]:
def vectorizer(data, labels, train=True):
    """Convert texts and labels to model inputs with the global ``cv`` and ``lb``.

    ``cv`` and ``lb`` are not parameters: they are looked up in the notebook's
    global namespace and must be defined before this function is called.

    Args:
        data: texts passed to ``cv.fit`` / ``cv.transform``.
        labels: labels passed to ``lb.fit`` / ``lb.transform``.
        train: when true, ``cv`` and ``lb`` are fitted on the inputs first;
            otherwise they are used as they are.

    Returns:
        Tuple ``(features, encoded_labels)``.
    """
    if train:
        # Fitting happens only on training data; later calls reuse the fit.
        cv.fit(data)
        lb.fit(labels)
    encoded_labels = lb.transform(labels)
    features = cv.transform(data)
    return features, encoded_labels

In [None]:
def data_extration(path):
    """Load the CSV at ``path`` and return shuffled texts and labels.

    The rows are shuffled (``sample(frac=1)``), rows containing any missing
    value are dropped, and the ``"text"`` column is lower-cased.

    Args:
        path: location of a CSV file with ``"text"`` and ``"sentiment"`` columns.

    Returns:
        Tuple ``(reviews, target)``: a list of lower-cased texts and the list
        of the corresponding ``"sentiment"`` values, in the same (shuffled) order.
    """
    shuffled = pd.read_csv(path).sample(frac=1).reset_index(drop=True)
    complete_rows = shuffled.dropna()

    reviews = [text.lower() for text in complete_rows["text"]]
    target = complete_rows["sentiment"].tolist()

    return reviews, target

In [None]:
# Location of the training CSV; data_extration() reads its "text" and "sentiment" columns.
path = "../input/tweet-sentiment-extraction/train.csv"

In [None]:
# x: shuffled, lower-cased texts; y: the matching sentiment labels.
x, y = data_extration(path)

In [None]:
# Hold out 30% of the samples for testing; stratify=y keeps the label
# proportions the same in both splits. No random_state is passed, so the
# split differs from run to run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, stratify=y)

In [None]:
# Sentences per model call for the paraphrasing and back-translation generators.
batch_size = 32
# Number of EN -> DE -> EN round trips used by iterative_backtranslation.
back_iterations = 3
# Number of chained paraphrasing rounds used by iterative_paraphrasing.
para_iterations = 2

# add_synonms settings: token positions drawn per variant, and variants
# attempted per sentence.
num_words_replaced = 2
sentence_repeat = 5

# Beam width / number of direct paraphrases per sentence in iterative_paraphrasing.
num_beams = 2


# Maps a display name to [augmentation function, positional-argument list].
# The two leading None entries of each argument list are placeholders for the
# training texts and labels; preprocessing() supplies them at call time. The
# remaining entries follow the parameter order of the respective function.
# "No Preprocessing" has no function and is evaluated on the unmodified data.
preprocessing_types = {
    "No Preprocessing": [None, None],
    "Adding Random Words": [adding_random_words, [None, None]],
    "Adding Synonms": [add_synonms, [None, None, num_words_replaced, sentence_repeat]],
    "Iterative Paraphrasing": [iterative_paraphrasing, [None, None, num_beams, para_iterations, batch_size]],
    "Iterative Backtranslation": [iterative_backtranslation, [None, None, back_iterations, batch_size]],
#     "Paraphrase and Back translation": [para_to_interative_back, [None, None, para_iterations, num_beams, back_iterations, batch_size]],
}

In [None]:
# Benchmark: for every augmentation strategy, augment the training split,
# vectorise it, train a fresh classifier and print its accuracy on the
# untouched test split.
for preprocessing_name, (preprocessing_type, preprocessing_args) in preprocessing_types.items():

    if preprocessing_type is not None:
        train_x_, train_y_ = preprocessing(
            train_x,
            train_y,
            preprocessing_type,
            preprocessing_args
        )

    else:
        train_x_, train_y_ = train_x, train_y

    # Shuffle texts and labels together. A dedicated name is used so the
    # global `x` (the dataset texts) is not overwritten, which would break a
    # later re-run of the train/test split cell.
    shuffled_pairs = list(zip(train_x_, train_y_))
    random.shuffle(shuffled_pairs)
    train_x_, train_y_ = zip(*shuffled_pairs)

    # `cv` and `lb` are read as globals by vectorizer(); fresh instances per
    # strategy ensure nothing fitted on a previous strategy is reused.
    cv = TfidfVectorizer()
    lb = LabelEncoder()
    train_x_vec, train_y_vec = vectorizer(train_x_, train_y_, train=True)
    test_x_vec, test_y_vec = vectorizer(test_x, test_y, train=False)

    # The random forest is the model actually evaluated; a LogisticRegression
    # created just before it was overwritten unused and has been removed.
    classifier = RandomForestClassifier()
    training_and_testing(train_x_vec, train_y_vec, test_x_vec, test_y_vec, classifier, preprocessing_name)