# Data Augmentation with Back-translation method on Imbalanced Text Dataset for Korean Vishing Classification

In [None]:
import torch

# Check if CUDA is available
print("CUDA Available:", torch.cuda.is_available())

# If CUDA is available, print the current GPU details
if torch.cuda.is_available():
    # Index and name of the device torch currently has selected.
    print("Current GPU Index:", torch.cuda.current_device())
    print("Current GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))
    # Memory figures as reported by torch.cuda; the line labelled "Cached"
    # prints the value of torch.cuda.memory_reserved().
    print("GPU Memory Allocated:", torch.cuda.memory_allocated())
    print("GPU Memory Cached:", torch.cuda.memory_reserved())

In [None]:
#importing necessary libraries
# General
from time import time
from timeit import default_timer as timer
import numpy as np
import pandas as pd
import re
import os
import io
import pickle
import sys
import subprocess
from collections import Counter

# EDA
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
from wordcloud import WordCloud
from collections import Counter

# NLP 
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize 

# ML
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, fbeta_score, roc_auc_score, matthews_corrcoef, cohen_kappa_score

# Data importation and Analysis

In [None]:
# import the train, validation and the test sets
# (all paths are relative, so the CSV files must be in the working directory)
print('Loading datasets...')
train_set= pd.read_csv('training_set.csv')
val_set = pd.read_csv('validation_set.csv')
test_set = pd.read_csv('test_set.csv')

# import all augmented dataset
# One file per pivot language; the _ch/_en/_ja suffixes follow the file names
# (CH, EN, JA) and are reused for every variable and column derived from them.
train_set_ch = pd.read_csv('vishing_dataset_CH_AUG.csv')
train_set_en = pd.read_csv('vishing_dataset_EN_AUG.csv')
train_set_ja = pd.read_csv('vishing_dataset_JA_AUG.csv')

print('Datasets loaded.')

In [None]:
# Delete all the rows in train_set_en, train_set_ja and train_set_ch that have label 0
# .copy() makes each filtered frame independent of the frame it was selected
# from, so the column assignments done on these frames in later cells do not
# raise pandas' SettingWithCopyWarning.
train_set_ch = train_set_ch[train_set_ch['label'] != 0].copy()
train_set_en = train_set_en[train_set_en['label'] != 0].copy()
train_set_ja = train_set_ja[train_set_ja['label'] != 0].copy()

In [None]:
def plot_class_distribution(data, title):
    """Show a bar chart with the number of rows per ``label`` value.

    Args:
        data: Table handed to ``sns.countplot``; it needs a ``label`` column.
        title: Text used as the chart title.
    """
    sns.set(style="whitegrid")
    axes = sns.countplot(x='label', data=data)
    axes.set_title(title)

    # Print every bar's height slightly above the bar, centred on it.
    for bar in axes.patches:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        axes.annotate(
            f'{bar_height}',
            (bar_centre, bar_height),
            ha='center',
            va='baseline',
            fontsize=11,
            color='black',
            xytext=(0, 5),
            textcoords='offset points',
        )

    plt.show()

In [None]:
#plot the distribution of the datasets
plot_class_distribution(train_set, 'Train Dataset Class Distribution')
plot_class_distribution(val_set, 'Validation Dataset Class Distribution')
plot_class_distribution(test_set, 'Test Dataset Class Distribution')

In [None]:
#plot the distribution of the datasets
plot_class_distribution(train_set_en, '(English Augmented) Train Dataset Class Distribution')
plot_class_distribution(train_set_ja, '(Japanese Augmented) Train Dataset Class Distribution')
plot_class_distribution(train_set_ch, '(Chinese Augmented) Train Dataset Class Distribution')

In [None]:
train_set.info()
val_set.info()
test_set.info()

In [None]:
train_set_en.info()
train_set_ja.info()
train_set_ch.info()

In [None]:
# drop the column we don't need
# (inplace=True modifies the three frames directly; nothing is returned)
train_set.drop(['id'], axis=1, inplace=True)
val_set.drop(['id'], axis=1, inplace=True)
test_set.drop(['id'], axis=1, inplace=True)

#check the dataframes
train_set.info()
val_set.info()
test_set.info()

In [None]:
# drop the columns we don't need from the augmented sets
# Later cells read only `label` and `back_translation` from these frames.
# NOTE(review): re-running this cell raises KeyError because the columns are
# already gone after the first in-place drop.
train_set_en.drop(['id', 'transcript', 'translation', 'processed'], axis=1, inplace=True)
train_set_ja.drop(['id', 'transcript', 'translation', 'processed'], axis=1, inplace=True)
train_set_ch.drop(['id', 'transcript', 'translation', 'processed'], axis=1, inplace=True)

#check the dataframes
train_set_en.info()
train_set_ja.info()
train_set_ch.info()

In [None]:
# rename the column back_translation of train_set_en, train_set_ja, train_set_ch to transcript_en, transcript_ja, transcript_ch
# Each frame gets its own suffixed column name; the cleaning cells further down
# read these suffixed names.
train_set_en.rename(columns={'back_translation':'transcript_en'}, inplace=True)
train_set_ja.rename(columns={'back_translation':'transcript_ja'}, inplace=True)
train_set_ch.rename(columns={'back_translation':'transcript_ch'}, inplace=True)

# display the info of the dataframes
train_set_en.info()
train_set_ja.info()
train_set_ch.info()

 ## Calculating the length of each data sample.

In [None]:
# Store the character count of every raw transcript in a new `length` column
# of train_set, val_set and test_set.
train_set['length'] = train_set['transcript'].apply(len)
val_set['length'] = val_set['transcript'].apply(len)
test_set['length'] = test_set['transcript'].apply(len)

# show the first rows of the training frame
train_set.head()

In [None]:
val_set.head()

In [None]:
test_set.head()

In [None]:
# Store the character count of every back-translated transcript in a new
# `length` column of train_set_en, train_set_ja and train_set_ch.
train_set_en['length'] = train_set_en['transcript_en'].apply(len)
train_set_ja['length'] = train_set_ja['transcript_ja'].apply(len)
train_set_ch['length'] = train_set_ch['transcript_ch'].apply(len)

# show the first rows of the English-augmented frame
train_set_en.head()

In [None]:
train_set_ja.head()

In [None]:
train_set_ch.head()

## Distribution based on transcript length (number of characters)

In [None]:
# Plot how the `length` column is distributed in one dataset (boxplot and histogram)
def plot_length_distribution(data, title):
    """Show a per-label boxplot and an overall histogram of ``length``.

    Args:
        data: Table with ``length`` and ``label`` columns.
        title: Text used as the figure-level title.
    """
    sns.set(style="whitegrid")
    fig, panels = plt.subplots(1, 2, figsize=(15, 5))
    box_ax, hist_ax = panels
    fig.suptitle(title)

    # Left panel: one box per label value.
    sns.boxplot(y='length', x='label', data=data, ax=box_ax)
    box_ax.set_title('Boxplot')

    # Right panel: histogram over all rows, labels not separated.
    sns.histplot(x='length', data=data, ax=hist_ax)
    hist_ax.set_title('Histogram')

    plt.show()

In [None]:
# plot the distribution of the length of the data samples in the train_set, val_set and test_set
plot_length_distribution(train_set, 'Train Dataset Length Distribution')
plot_length_distribution(val_set, 'Validation Dataset Length Distribution')
plot_length_distribution(test_set, 'Test Dataset Length Distribution')

In [None]:
# plot the distribution of the length of the data samples in the train_set_en, train_set_ja and train_set_ch
plot_length_distribution(train_set_en, '(English Augmented) Train Dataset Length Distribution')
plot_length_distribution(train_set_ja, '(Japanese Augmented) Train Dataset Length Distribution')
plot_length_distribution(train_set_ch, '(Chinese Augmented) Train Dataset Length Distribution')

# Morphology Analyzer Installation

## Installation of Mecab-ko-for-GoogleColab

In [None]:
# Check if Mecab-ko-for-GoogleColab is installed. If not, install it.
try:
    from konlpy.tag import Mecab
except ImportError:
    # Only a failed import means "not installed"; any other error (for example
    # a keyboard interrupt) is allowed to surface instead of starting an install.
    print('Mecab-ko-for-GoogleColab is not installed. Installing...')
    # check_call raises CalledProcessError if the install script exits non-zero.
    subprocess.check_call(['bash', './install_mecab-ko_on_colab190912.sh'])
    print('Mecab-ko-for-GoogleColab installed.')

    # Make the freshly installed package visible to the import system, then
    # import again so that `Mecab` is defined for the cells below.
    import importlib
    importlib.invalidate_caches()
    from konlpy.tag import Mecab


# if not os.path.exists('/content/Mecab-ko-for-Google-Colab'):
#     print('Installing Mecab-ko-for-Google-Colab...')
#     !git clone

## Testing MeCab

In [None]:
# Test MeCab and test morphological analysis on a sample sentence in Korean language (한국어)
mecab = Mecab()
print(mecab.morphs('한국어 형태소 분석기 테스트 중 입니다.'))

In [None]:
# test pos tagging
print(mecab.pos('한국어 형태소 분석기 테스트 중 입니다.'))

# Dataset Preprocessing

## Dataset cleaning and purification

In [None]:

# function to perform the cleaning parts
def apply_replacement(src_df, replace_func):
    """Return a copy of ``src_df`` whose ``transcript`` column has been cleaned.

    Args:
        src_df: DataFrame with a ``transcript`` column. It is left unmodified.
        replace_func: Callable applied to every value of ``transcript``.

    Returns:
        A new DataFrame, identical to ``src_df`` except for ``transcript``.
    """
    # Work on a copy: plain assignment would only alias the caller's frame and
    # silently overwrite its ``transcript`` column.
    ret_df = src_df.copy()
    ret_df['transcript'] = ret_df['transcript'].apply(replace_func)
    return ret_df

# strip transcription markers and annotation words from one transcript
def word_replace(x):
    """Apply a fixed list of substring replacements to ``x``, in order.

    The markers ``o/``, ``b/``, ``n/`` and the words ``name``, ``laughing``,
    ``clearing``, ``singing``, ``applauding`` are deleted wherever they occur
    (also inside longer words); every newline becomes a single space.

    Args:
        x: Text to clean.

    Returns:
        The text after all replacements.
    """
    replacements = (
        ('o/', ''),
        ('b/', ''),
        ('n/', ''),
        ('\n', ' '),
        ('name', ''),
        ('laughing', ''),
        ('clearing', ''),
        ('singing', ''),
        ('applauding', ''),
    )
    cleaned = x
    for target, substitute in replacements:
        cleaned = cleaned.replace(target, substitute)
    return cleaned

# delete punctuation and other special symbols from one transcript
def remove_special_sysmbols(sentence):
    """Return ``sentence`` with every character of the symbol class removed.

    The characters are deleted, not replaced by a space, so surrounding
    whitespace is left exactly as it was.
    """
    special_symbols = r"[-~=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]"
    return re.sub(special_symbols, '', sentence)

# delete runs of masking characters from one transcript
def replace_x_o(sentence):
    """Remove every run of two or more of ``o``, ``O``, ``ㅇ``, ``0``, ``x``.

    The characters may be mixed within a run (``xx0o`` is one run). A single
    occurrence is kept. One combined pattern is used instead of a separate
    substitution per character.
    """
    masked_run = r"(o|O|\ㅇ|0|x){2,}"
    return re.sub(masked_run, '', sentence)

# turn line breaks inside one transcript into spaces
def nline_replace(x):
    """Return ``x`` with every newline character replaced by one space."""
    return x.replace('\n', ' ')

# remove extra white space
def remove_extra_white_spaces(text):
    """Drop isolated single Latin letters and normalise whitespace.

    Args:
        text: Text to clean.

    Returns:
        ``text`` where a single ASCII letter surrounded by whitespace has been
        replaced by one space, every run of whitespace has been collapsed to
        one space, and leading/trailing whitespace has been stripped.
    """
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    # The earlier cleaning steps delete symbols without touching the spaces
    # around them, so runs of spaces are left behind; collapse them here.
    return re.sub(r'\s+', ' ', without_sc).strip()

In [None]:
# Clean the train_set transcripts by running the cleaning functions in sequence
train_set['transcript_clean'] = (
    train_set['transcript']
    .apply(word_replace)
    .apply(remove_special_sysmbols)
    .apply(replace_x_o)
    .apply(nline_replace)
    .apply(remove_extra_white_spaces)
)
# character count of the cleaned text
train_set['length_transcript_clean'] = train_set['transcript_clean'].apply(len)

In [None]:
# Clean the val_set transcripts by running the cleaning functions in sequence
val_set['transcript_clean'] = (
    val_set['transcript']
    .apply(word_replace)
    .apply(remove_special_sysmbols)
    .apply(replace_x_o)
    .apply(nline_replace)
    .apply(remove_extra_white_spaces)
)
# character count of the cleaned text
val_set['length_transcript_clean'] = val_set['transcript_clean'].apply(len)

In [None]:
# Clean the test_set transcripts by running the cleaning functions in sequence
test_set['transcript_clean'] = (
    test_set['transcript']
    .apply(word_replace)
    .apply(remove_special_sysmbols)
    .apply(replace_x_o)
    .apply(nline_replace)
    .apply(remove_extra_white_spaces)
)
# character count of the cleaned text
test_set['length_transcript_clean'] = test_set['transcript_clean'].apply(len)

In [None]:
# Clean the train_set_en transcripts by running the cleaning functions in sequence
train_set_en['transcript_clean_en'] = (
    train_set_en['transcript_en']
    .apply(word_replace)
    .apply(remove_special_sysmbols)
    .apply(replace_x_o)
    .apply(nline_replace)
    .apply(remove_extra_white_spaces)
)
# character count of the cleaned text
train_set_en['length_transcript_clean_en'] = train_set_en['transcript_clean_en'].apply(len)

In [None]:
# Clean the train_set_ja transcripts by running the cleaning functions in sequence
train_set_ja['transcript_clean_ja'] = (
    train_set_ja['transcript_ja']
    .apply(word_replace)
    .apply(remove_special_sysmbols)
    .apply(replace_x_o)
    .apply(nline_replace)
    .apply(remove_extra_white_spaces)
)
# character count of the cleaned text
train_set_ja['length_transcript_clean_ja'] = train_set_ja['transcript_clean_ja'].apply(len)

In [None]:
# Clean the train_set_ch transcripts by running the cleaning functions in sequence
train_set_ch['transcript_clean_ch'] = (
    train_set_ch['transcript_ch']
    .apply(word_replace)
    .apply(remove_special_sysmbols)
    .apply(replace_x_o)
    .apply(nline_replace)
    .apply(remove_extra_white_spaces)
)
# character count of the cleaned text
train_set_ch['length_transcript_clean_ch'] = train_set_ch['transcript_clean_ch'].apply(len)


In [None]:
# display the heads of the dataframes
train_set.head()

In [None]:
# display the heads of the dataframes
val_set.head()

In [None]:
# display the heads of the dataframes
test_set.head()

In [None]:
# display the heads of the dataframes
train_set_en.head()

In [None]:
# display the heads of the dataframes
train_set_ja.head()

In [None]:
# display the heads of the dataframes
train_set_ch.head()

## Remove the Korean stop words



In [None]:
# import the Korean stop words
# stop_words = pd.read_csv('korean_stopwords.csv')
# stop_words = stop_words['stopwords'].tolist()

In [None]:
# # remove the Korean stop words from the train_set
# train_set['transcript_clean'] = train_set['transcript_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
# train_set['length_transcript_clean'] = train_set['transcript_clean'].apply(lambda x: len(x))

In [None]:
## remove the stop word
# NOTE: this list rebinds the name `stopwords` that the import cell takes from
# nltk.corpus; from here on `stopwords` is this Korean list.
stopwords = ["을", "를", "이", "가", "ㅡ", "은", "는", "XXX", "xxx", "어요", "아니", "입니다", "에서", "니까", "으로",
             "근데", "습니다", "습니까", "저희", "합니다", "하고", "싶어요", "있는", "있습니다", "싶습니다", "그냥",
             "고요", "에요", "예요", "으시", "그래서"]

# open and read the file containing comprehensive stopwords 
# stopwords_all = open("stopwords-ko.txt").readlines()

# function to remove the stop word from the train and test dataframe
def get_model_input(_words):
    """Yield the tokens of ``_words`` that are not stop words, in input order.

    A token is dropped only when it is exactly equal to an entry of the
    module-level ``stopwords`` list. Every remaining token is yielded,
    including the last one.

    Args:
        _words: Iterable of token strings (for example the output of
            ``mecab.morphs``).

    Yields:
        Each token that is not in ``stopwords``.
    """
    # Compare whole tokens. Testing only the first character, or the token
    # without its last character, never matches multi-character stop words
    # such as "입니다" and wrongly removes words that merely start with a
    # one-character stop word.
    blocked = set(stopwords)
    for token in _words:
        if token not in blocked:
            yield token

In [None]:
def get_corpus(df):
    """Build one space-joined document per token list.

    Args:
        df: Iterable of token lists, one list per transcript.

    Returns:
        A list of strings, one per input list. Each string holds the tokens
        that ``get_model_input`` yields, minus tokens of length 1, joined by
        single spaces.
    """
    return [
        " ".join(
            "{}".format(word)
            for word in get_model_input(tokens)
            if len(word) != 1
        )
        for tokens in df
    ]

In [None]:
from konlpy.tag import Mecab
mecab = Mecab()

# Split every cleaned transcript into morphemes with MeCab and let get_corpus
# turn each token list into one space-joined document.
train_set_corpus = get_corpus([mecab.morphs(text) for text in train_set['transcript_clean']])
val_set_corpus = get_corpus([mecab.morphs(text) for text in val_set['transcript_clean']])
test_set_corpus = get_corpus([mecab.morphs(text) for text in test_set['transcript_clean']])

# Same procedure for the three back-translated training sets.
train_set_en_corpus = get_corpus([mecab.morphs(text) for text in train_set_en['transcript_clean_en']])
train_set_ja_corpus = get_corpus([mecab.morphs(text) for text in train_set_ja['transcript_clean_ja']])
train_set_ch_corpus = get_corpus([mecab.morphs(text) for text in train_set_ch['transcript_clean_ch']])


In [None]:
# adding the corpus to the dataframe
# Each *_corpus list was built by iterating over the matching frame's cleaned
# column, so it has one entry per row, in row order.
train_set['corpus'] = train_set_corpus
val_set['corpus'] = val_set_corpus
test_set['corpus'] = test_set_corpus

# adding the corpus to the dataframe
# The augmented frames use suffixed column names (corpus_en / corpus_ja / corpus_ch).
train_set_en['corpus_en'] = train_set_en_corpus
train_set_ja['corpus_ja'] = train_set_ja_corpus
train_set_ch['corpus_ch'] = train_set_ch_corpus

In [None]:
# character count of each corpus document for the original splits
train_set['length_corpus'] = train_set['corpus'].apply(len)
val_set['length_corpus'] = val_set['corpus'].apply(len)
test_set['length_corpus'] = test_set['corpus'].apply(len)

# character count of each corpus document for the augmented training sets
train_set_en['length_corpus_en'] = train_set_en['corpus_en'].apply(len)
train_set_ja['length_corpus_ja'] = train_set_ja['corpus_ja'].apply(len)
train_set_ch['length_corpus_ch'] = train_set_ch['corpus_ch'].apply(len)


In [None]:
# display the heads of the dataframes
train_set.head()


In [None]:
# display the heads of the dataframes
val_set.head()

In [None]:
# display the heads of the dataframes   
test_set.head()

In [None]:
# display the heads of the dataframes
train_set_en.head()

In [None]:
# display the heads of the dataframes
train_set_ja.head()

In [None]:
# display the heads of the dataframes
train_set_ch.head()

In [None]:
# save the train_set, val_set and test_set to csv files
# index=False leaves the DataFrame index out of the files; the files are
# written to the working directory and overwrite existing files of that name.
train_set.to_csv('train_set_clean.csv', index=False)
val_set.to_csv('val_set_clean.csv', index=False)
test_set.to_csv('test_set_clean.csv', index=False)

# save the train_set_en, train_set_ja and train_set_ch to csv files
train_set_en.to_csv('train_set_en_clean.csv', index=False)
train_set_ja.to_csv('train_set_ja_clean.csv', index=False)
train_set_ch.to_csv('train_set_ch_clean.csv', index=False)

In [None]:
# plot the word cloud of the train_set corpus with Korean font 
# All corpus documents are joined into one string before generation.
# font_path is relative, so NanumGothic.ttf must be in the working directory.
wordcloud = WordCloud(font_path='NanumGothic.ttf', width = 800, height = 800, 
                background_color ='white', 
                min_font_size = 10).generate(' '.join(train_set['corpus']))

# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

# Vectorization of the dataset

In [None]:
def vectorize_dataset(df_train, df_test, df_validation):
    """Turn the ``corpus`` column of each split into dense TF-IDF features.

    The vectorizer is fitted on the training split only and then reused for
    the other two splits, so all three matrices share one vocabulary and one
    column order.

    Args:
        df_train: Training frame with ``corpus`` and ``label`` columns.
        df_test: Test frame with ``corpus`` and ``label`` columns.
        df_validation: Validation frame with ``corpus`` and ``label`` columns.

    Returns:
        ``(X_train, X_validation, X_test, y_train, y_validation, y_test)``.
        Note that validation comes before test here, unlike the parameter
        order.
    """
    # Convert a collection of raw documents to a matrix of TF-IDF features.
    tfidf = TfidfVectorizer(
        ngram_range=(1, 3),   # unigrams, bigrams and trigrams
        max_df=0.5,           # ignore terms with a document frequency above 0.5
        max_features=2000,    # limits the vocabulary size
        sublinear_tf=True,    # replaces tf with 1 + log(tf)
        use_idf=True,
    )

    # toarray() converts each sparse result into a dense array.
    X_train = tfidf.fit_transform(df_train['corpus']).toarray()
    X_test = tfidf.transform(df_test['corpus']).toarray()
    # transform only: fitting again here would learn a vocabulary and IDF
    # weights from the validation split, so its columns would no longer
    # correspond to the features the models are trained on.
    X_validation = tfidf.transform(df_validation['corpus']).toarray()

    y_train, y_validation, y_test = df_train['label'], df_validation['label'], df_test['label']

    return X_train, X_validation, X_test, y_train, y_validation, y_test

In [None]:
# vectorize the dataset
# vectorize_dataset takes (train, test, validation) but returns the matrices in
# (train, validation, test) order, hence the different order on each side.
X_train, X_validation, X_test, y_train, y_validation, y_test = vectorize_dataset(train_set, test_set, val_set)

#set the evaluation set for early stop models
# a list with a single (features, labels) pair built from the validation split
eval_set = [(X_validation, y_validation)]

# Training ML models with imbalanced dataset

## TRAINING CODE 1

In [None]:
# # function to train the ML models
# def train_ml_model(X_train, X_validation, X_test, y_train, y_validation, y_test, model):
#     start = timer()
#     model[1].fit(X_train, y_train)
#     end = timer()
#     training_time = end - start
#     y_pred = model[1].predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred, average='macro')
#     recall = recall_score(y_test, y_pred, average='macro')
#     f1 = f1_score(y_test, y_pred, average='macro')
#     f2 = fbeta_score(y_test, y_pred, beta=2.0, average='macro')
#     roc_auc = roc_auc_score(y_test, y_pred, average='macro')
#     matthews_corrcoef = matthews_corrcoef(y_test, y_pred)
#     cohen_kappa = cohen_kappa_score(y_test, y_pred)
#     cm = confusion_matrix(y_test, y_pred)
#     cr = classification_report(y_test, y_pred)
#     return training_time, accuracy, precision, recall, f1, f2, roc_auc, matthews_corrcoef, cohen_kappa, cm, cr

In [None]:
# def train_ml_model(X_train, X_validation, X_test, y_train, y_validation, y_test, model):
#     # start the timer
#     start = timer()
#     
#     # fit the model
#     model.fit(X_train, y_train)
#     
#     # make predictions for test data
#     y_pred = model.predict(X_test)
#     
#     # stop the timer
#     end = timer()
#     
#     # calculate the training time
#     training_time = end - start
#     
#     # calculate the accuracy score
#     accuracy = accuracy_score(y_test, y_pred)
#     
#     # calculate the precision score
#     precision = precision_score(y_test, y_pred, average='macro')
#     
#     # calculate the recall score
#     recall = recall_score(y_test, y_pred, average='macro')
#     
#     # calculate the f1 score
#     f1 = f1_score(y_test, y_pred, average='macro')
#     
#     # calculate the f2 score
#     f2 = fbeta_score(y_test, y_pred, average='macro', beta=2.0)
#     
#     # calculate the roc auc score
#     roc_auc = roc_auc_score(y_test, y_pred, average='macro', multi_class='ovo')
#     
#     # calculate the matthews correlation coefficient
#     matthews_corrcoef = matthews_corrcoef(y_test, y_pred)
#     
#     # calculate the cohen kappa score
#     cohen_kappa = cohen_kappa_score(y_test, y_pred)
#     
#     # calculate the confusion matrix
#     cm = confusion_matrix(y_test, y_pred)
#     
#     # calculate the classification report
#     cr = classification_report(y_test, y_pred)
#     
#     # return the results
#     return training_time, accuracy, precision, recall, f1, f2, roc_auc, matthews_corrcoef, cohen_kappa, cm, cr

In [None]:
# # function to print the results
# def print_results(training_time, accuracy, precision, recall, f1, f2, roc_auc, matthews_corrcoef, cohen_kappa, cm, cr):
#     print('Training Time: ', training_time)
#     print('Accuracy: ', accuracy)
#     print('Precision: ', precision)
#     print('Recall: ', recall)
#     print('F1: ', f1)
#     print('F2: ', f2)
#     print('ROC AUC: ', roc_auc)
#     print('Matthews Corrcoef: ', matthews_corrcoef)
#     print('Cohen Kappa: ', cohen_kappa)
#     print('Confusion Matrix: \n', cm)
#     print('Classification Report: \n', cr)

In [None]:
# # define the ML models
# models = [
#     ('MultinomialNB', MultinomialNB()),
#     ('GaussianNB', GaussianNB()),
#     ('RandomForestClassifier', RandomForestClassifier()),
#     ('DecisionTreeClassifier', DecisionTreeClassifier()),
#     ('LogisticRegression', LogisticRegression()),
#     ('XGBClassifier', XGBClassifier()),
#     ('LGBMClassifier', LGBMClassifier()),
#     ('SVC', SVC())
# ]

In [None]:
# # train the ML models considering 'MultinomialNB' object is not subscriptable
# for name, model in models:
#     print('Training ', name, '...')
#     training_time, accuracy, precision, recall, f1, f2, roc_auc, matthews_corrcoef, cohen_kappa, cm, cr = train_ml_model(X_train, X_validation, X_test, y_train, y_validation, y_test, model)
#     print_results(training_time, accuracy, precision, recall, f1, f2, roc_auc, matthews_corrcoef, cohen_kappa, cm, cr)
#     print('Training ', name, 'completed.')
#     print('--------------------------------------------------')
# 
# # for name, model in models:
# #     print('Training ', name, '...')
# #     training_time, accuracy, precision, recall, f1, f2, roc_auc, matthews_corrcoef, cohen_kappa, cm, cr = train_ml_model(X_train, X_validation, X_test, y_train, y_validation, y_test, model)
# #     print_results(training_time, accuracy, precision, recall, f1, f2, roc_auc, matthews_corrcoef, cohen_kappa, cm, cr)
# #     print('Training ', name, 'completed.')
# #     print('--------------------------------------------------')

In [None]:
# # function to plot the confusion matrix
# def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
#     # plot the confusion matrix
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(classes))
#     
#     # plot the ticks
#     plt.xticks(tick_marks, classes, rotation=45)
#     plt.yticks(tick_marks, classes)
#     
#     # plot the text
#     fmt = 'd'
#     thresh = cm.max() / 2.
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#             plt.text(j, i, format(cm[i, j], fmt),
#                      horizontalalignment="center",
#                      color="white" if cm[i, j] > thresh else "black")
#             
#     # plot the labels
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     plt.tight_layout()

## TRAINING CODE 2

In [None]:
# def get_metrics(y_test, y_test_pred, y_test_proba, threshold=0.5):
#     # y_pred_class = y_test_proba > threshold
#     y_pred_class = y_test_pred
# 
#     tn, fp, fn, tp = confusion_matrix(y_test, y_pred_class).ravel()
# 
#     true_positive_rate = tp / (tp + fn)
#     true_negative_rate = tn / (tn + fp)
#     positive_predictive_value = tp / (tp + fp)
#     negative_predictive_value = tn / (tn + fn)
#     false_positive_rate = fp / (fp + tn)
#     false_negative_rate = fn / (tp + fn)
#     false_discovery_rate = fp / (tp + fp)
# 
#     scores = {'Accuracy': accuracy_score(y_test, y_pred_class),
#               'Precision': precision_score(y_test, y_pred_class),
#               'Recall': recall_score(y_test, y_pred_class),
#               'F1_score': f1_score(y_test, y_pred_class),
#               'F05_score': fbeta_score(y_test, y_pred_class, beta=0.5),
#               'F2_score': fbeta_score(y_test, y_pred_class, beta=2),
#               'Roc auc score': roc_auc_score(y_test, y_test_proba),
#               'Matthews_corrcoef': matthews_corrcoef(y_test, y_pred_class),
#               'Cohen_kappa': cohen_kappa_score(y_test, y_pred_class),
#               'True_positive_rate': true_positive_rate,
#               'True_negative_rate': true_negative_rate,
#               'Positive_predictive_value': positive_predictive_value,
#               'Negative_predictive_value': negative_predictive_value,
#               'False_positive_rate': false_positive_rate,
#               'False_negative_rate': false_negative_rate,
#               'False_discovery_rate': false_discovery_rate,
#               }
# 
#     return scores

## Define the ML models

In [None]:
# # define the ML models
# def define_models():
#     models = [] #list to save the initiated models
#     
# #     # Gaussian Naive Bayes
# #     # gnb assumes them to be continuous
# #     gnb = GaussianNB()
# 
# #     # Mltinomial Naive Bayes
# #     # nb assumes the features are discrete
# #     mnb = MultinomialNB()
# 
# #     #logistic regression
# #     lr = LogisticRegression(
# #         # C=10,# random_state=1234
# #     )
# 
# #     #Decision Tree
#     dt = DecisionTreeClassifier(
#         # max_depth=10, 
#         # random_state=1234
#     )
# 
#     #Random Forest
#     rf = RandomForestClassifier(
#         # n_estimators=100,
#         # max_depth=20,
#         # max_features=0.06,
#         # n_jobs=6,
#         # random_state=1234
#     )
# 
#     #XGBoost
#     xgb = XGBClassifier(
#         early_stopping_rounds=10,
#         verbosity=2,
#         # n_estimators=2000,
#         # tree_method='hist',
#         # subsample=0.67,
#         # colsample_level=0.06,
#         # n_jobs=6,
#         # random_state=1234
#     )
# 
#     #LightGBM
#     lgbm = LGBMClassifier(
#         early_stopping_rounds=10,
#         verbosity=2,
#         # boost_from_average=False
#         # num_leaves=64,
#         # n_estimators=2000,
#         # feature_fraction=0.06,
#         # bagging_fraction=0.67,
#         # bagging_freq=1,
#         # n_jobs=6,
#         # random_state=1234
#     )
#     
#     #add the models in the list
#     models = [dt, rf, xgb, lgbm]
#     
#     # #to specify which of our models require early stopping within the .fit() method.
#     es_models = ['XGBClassifier', 'LGBMClassifier']
#     
#     return models, es_models

## Train the ML models

In [None]:
# # train the ML models
# def train_models(models, es_models, X_train, X_test, y_train, y_test, eval_set):
#     # dictionary to save the results
#     results = {}
#     
#     # loop through the models
#     for model in models:
#         # get the name of the model
#         name = model.__class__.__name__
#         
#         # check if the model requires early stopping
#         if name in es_models:
#             # fit the model
#             model.fit(X_train, y_train, eval_set=eval_set)
#         else:
#             # fit the model
#             model.fit(X_train, y_train)
#         
#         # make predictions for test data
#         y_pred = model.predict(X_test)
#         
#         # make predictions for test data
#         y_pred_proba = model.predict_proba(X_test)[:, 1]
#         
#         # get the metrics
#         scores = get_metrics(y_test, y_pred, y_pred_proba)
#         
#         # add the results to the dictionary
#         results[name] = scores
#         
#         # print the results
#         print(name, 'completed.')
#         print('--------------------------------------------------')
#         metric_scores = get_metrics(y_test, y_pred, y_pred_proba)
#         for metric_name, score in metric_scores.items():
#             print('{} :{}'.format(metric_name, score))
#         print('#'*80)
#         
#     return results

In [None]:
# # Train the ML models
# models, es_models = define_models()
# results = train_models(models, es_models, X_train, X_test, y_train, y_test, eval_set)

In [None]:
# # function to print the results
# def print_results(results):
#     for name, scores in results.items():
#         print(name, 'results')
#         print('--------------------------------------------------')
#         print('Accuracy: ', scores['Accuracy'])
#         print('Precision: ', scores['Precision'])
#         print('Recall: ', scores['Recall'])
#         print('F1: ', scores['F1_score'])
#         print('F0.5: ', scores['F05_score'])
#         print('F2: ', scores['F2_score'])
#         print('ROC AUC: ', scores['Roc auc score'])
#         print('Matthews Corrcoef: ', scores['Matthews_corrcoef'])
#         print('Cohen Kappa: ', scores['Cohen_kappa'])
#         print('True Positive Rate: ', scores['True_positive_rate'])
#         print('True Negative Rate: ', scores['True_negative_rate'])
#         print('Positive Predictive Value: ', scores['Positive_predictive_value'])
#         print('Negative Predictive Value: ', scores['Negative_predictive_value'])
#         print('False Positive Rate: ', scores['False_positive_rate'])
#         print('False Negative Rate: ', scores['False_negative_rate'])
#         print('False Discovery Rate: ', scores['False_discovery_rate'])
#         print('--------------------------------------------------')
#         
# # print the results
# print_results(results)

In [None]:
# # function to plot the confusion matrix
# def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
#     # plot the confusion matrix
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(classes))
#     
#     # plot the ticks
#     plt.xticks(tick_marks, classes, rotation=45)
#     plt.yticks(tick_marks, classes)
#     
#     # plot the text
#     fmt = 'd'
#     thresh = cm.max() / 2.
#     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#             plt.text(j, i, format(cm[i, j], fmt),
#                      horizontalalignment="center",
#                      color="white" if cm[i, j] > thresh else "black")
#             
#     # plot the labels
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     plt.tight_layout()

In [None]:
# # plot the confusion matrix
# plot_confusion_matrix(results['DecisionTreeClassifier']['Confusion Matrix'], classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], title='Confusion matrix')

## TRAINING CODE 3

In [None]:
def get_metrics(y_test, y_test_pred, y_test_proba, threshold=0.5):
    """Compute the binary-classification metrics reported for each model.

    Args:
        y_test: Ground-truth labels.
        y_test_pred: Predicted class labels; every label-based metric uses these.
        y_test_proba: Scores handed to ``roc_auc_score`` (only used for ROC AUC).
        threshold: Currently unused; the scores are not re-thresholded here.
            Kept so existing callers keep working.

    Returns:
        dict: Metric name -> value, in a fixed insertion order.
    """
    labels_pred = y_test_pred

    # A 2x2 confusion matrix flattens to (tn, fp, fn, tp).
    tn, fp, fn, tp = confusion_matrix(y_test, labels_pred).ravel()

    # Shared denominators of the rate metrics.
    actual_pos = tp + fn
    actual_neg = tn + fp
    flagged_pos = tp + fp
    flagged_neg = tn + fn

    tpr = tp / actual_pos
    tnr = tn / actual_neg
    ppv = tp / flagged_pos
    npv = tn / flagged_neg
    fpr = fp / actual_neg
    fnr = fn / actual_pos
    fdr = fp / flagged_pos

    scores = {}
    scores['Accuracy'] = accuracy_score(y_test, labels_pred)
    scores['Precision'] = precision_score(y_test, labels_pred)
    scores['Recall'] = recall_score(y_test, labels_pred)
    scores['F1_score'] = f1_score(y_test, labels_pred)
    scores['F05_score'] = fbeta_score(y_test, labels_pred, beta=0.5)
    scores['F2_score'] = fbeta_score(y_test, labels_pred, beta=2)
    scores['Roc auc score'] = roc_auc_score(y_test, y_test_proba)
    scores['Matthews_corrcoef'] = matthews_corrcoef(y_test, labels_pred)
    scores['Cohen_kappa'] = cohen_kappa_score(y_test, labels_pred)
    scores['True_positive_rate'] = tpr
    scores['True_negative_rate'] = tnr
    scores['Positive_predictive_value'] = ppv
    scores['Negative_predictive_value'] = npv
    scores['False_positive_rate'] = fpr
    scores['False_negative_rate'] = fnr
    scores['False_discovery_rate'] = fdr

    return scores

## Define the ML models

In [None]:
# define the ML models
def define_models():
    """Instantiate the candidate classifiers.

    GaussianNB, MultinomialNB and LogisticRegression are imported by the
    notebook but are currently left out of the comparison.

    Returns:
        tuple: ``(models, es_models)``. ``models`` is the list of unfitted
        estimators (decision tree, random forest, XGBoost, LightGBM, in that
        order); ``es_models`` holds the class names of the estimators that
        must receive an ``eval_set`` in ``fit`` because they use early stopping.
    """
    # scikit-learn tree models, left at the library defaults.
    decision_tree = DecisionTreeClassifier()
    random_forest = RandomForestClassifier()

    # Gradient-boosting models: stop after 10 rounds without improvement on the
    # evaluation set, and log verbosely while training.
    xgboost_clf = XGBClassifier(early_stopping_rounds=10, verbosity=2)
    lightgbm_clf = LGBMClassifier(early_stopping_rounds=10, verbosity=2)

    candidates = [decision_tree, random_forest, xgboost_clf, lightgbm_clf]

    # Names are compared against ``model.__class__.__name__`` by the training loop.
    early_stopping_names = ['XGBClassifier', 'LGBMClassifier']

    return candidates, early_stopping_names

## Train the ML models

In [None]:
# Setting up our results dataframe
df_results = pd.DataFrame(columns=['Model', 'F1_score', 'Precision', 'Recall', 'ROC AUC', 'Accuracy', 'Training time', 'Test time'])

In [None]:
#  Function to plot the confusion matrix
# https://github.com/prateeksawhney97/MNIST-Classification-Multinomial-vs-Gaussian-Naive-Bayes/blob/master/MNIST%20Classification.ipynb

import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: 2-D confusion matrix (rows are true labels, columns predictions).
        classes: Tick labels, one per class.
        normalize: When True each row is divided by its sum before plotting.
        title: Title of the figure.
        cmap: Matplotlib colour map used for the heat map.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        # Row-wise normalisation: every row then sums to 1.
        cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    # Annotate every cell; use white text on dark cells so it stays readable.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Function to train the ML models
def train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling="", bt_name=""):
    """Fit and evaluate each model, optionally after resampling the training data.

    For every estimator in ``models`` the function fits it, prints timings,
    train/test scores, the classification report, the confusion matrix and the
    metrics returned by ``get_metrics``, then appends one row to the
    module-level ``df_results`` DataFrame.

    Args:
        models: Estimators to train (they are fitted in place).
        es_models: Class names of the estimators whose ``fit`` receives ``eval_set``.
        X_train, y_train: Training features and labels.
        eval_set: Validation data forwarded to ``fit`` for the models in ``es_models``.
        X_test, y_test: Held-out data used for every reported metric.
        sampling: Optional resampler exposing ``fit_resample``; any falsy value
            (the default ``""``) disables resampling.
        bt_name: Optional suffix naming the back-translation variant in the results.

    Returns:
        pandas.DataFrame: The module-level ``df_results`` with the new rows appended.
    """
    # retrieve the models' names
    model_names = [model.__class__.__name__ for model in models]
    # name of the resampling method (stays empty when no resampling is applied)
    over_name = ''

    # Dataset shape
    print("-- Dataset shape --")
    print('Original dataset shape')
    print(Counter(y_train))  # class counts of the training set

    # Resample the training data only; validation and test data stay untouched.
    if sampling:
        over_name = sampling.__class__.__name__
        X_train, y_train = sampling.fit_resample(X_train, y_train)
        print('Resampled dataset shape %s' % Counter(y_train))

    print('-'*50)
    start = timer()
    # Train each of our candidate models
    for m, n in zip(models, model_names):
        run_name = n+"_"+over_name+bt_name
        print("\n"+run_name)  # print the model name

        # Early-stopping models need the validation data passed to fit().
        fit_kwargs = {'eval_set': eval_set} if n in es_models else {}
        t0 = time()
        m.fit(X_train, y_train, **fit_kwargs)
        train_time = time() - t0
        print(f"\nTraining time: {round(train_time, 3)}sec")

        # Score on the training set
        t0 = time()
        train_score = m.score(X_train, y_train)
        train_predict_time = time() - t0
        print(f"Prediction time (train): {round(train_predict_time, 3)}sec")

        # Score on the test set
        t0 = time()
        test_score = m.score(X_test, y_test)
        test_predict_time = time() - t0
        print(f"Prediction time (test): {round(test_predict_time, 3)}sec")

        # Check for Overfitting: a large gap between the two scores is a warning sign.
        # The scores are accuracies, not durations, so no time unit is printed.
        print('\n-- Check for Overfitting --')
        print('Train set score: {:.4f}'.format(train_score))
        print('Test set score: {:.4f}'.format(test_score))

        # Predicted class labels for the test set
        t0 = time()
        predicted = m.predict(X_test)
        label_time = time() - t0
        print(f"Prediction time (test): {round(label_time, 3)}sec")

        # classification report
        print("\n-- classification Report --")
        print(classification_report(y_test, predicted))

        # Confusion matrix. scikit-learn lays it out as rows = true label and
        # columns = predicted label, so with class 1 as the positive class:
        # cm[0,0] = TN, cm[0,1] = FP, cm[1,0] = FN, cm[1,1] = TP.
        cm = confusion_matrix(y_test, predicted)
        print('\n-- Confusion matrix --\n', cm)
        print('True Positives(TP) = ', cm[1,1])
        print('True Negatives(TN) = ', cm[0,0])
        print('False Positives(FP) = ', cm[0,1])
        print('False Negatives(FN) = ', cm[1,0])
        # To visualise it: plot_confusion_matrix(cm, classes=np.arange(2))

        # Probability of class 1 for each test sample (needed for ROC AUC).
        t0 = time()
        y_test_proba = m.predict_proba(X_test)[:, 1]
        # NOTE: the 'Test time' column stores this predict_proba timing; the
        # predict() timing above is only printed.
        test_time = time() - t0

        print('\n-- Metrics scores --\n')
        metric_scores = get_metrics(y_test, predicted, y_test_proba)
        for metric_name, score in metric_scores.items():
            print('{} :{}'.format(metric_name, score))
        print('#'*80)

        # save the results in the module-level dataframe df_results
        df_results.loc[len(df_results.index)] = [
            run_name,
            metric_scores.get("F1_score"),
            metric_scores.get("Precision"),
            metric_scores.get("Recall"),
            metric_scores.get("Roc auc score"),
            test_score,
            train_time,
            test_time,
        ]

    train_test_time = timer() - start
    print('Training and testing time of all models {:.4f} seconds'.format(train_test_time))

    return df_results

In [None]:
#define the models
models, es_models = define_models()

#train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test)

In [None]:
# Display the results
df_results

## Training model with resampling methods

In [None]:
# import the train_set, test_set, val_set
train_set = pd.read_csv('train_set_clean.csv')
test_set = pd.read_csv('test_set_clean.csv')
val_set = pd.read_csv('val_set_clean.csv')

In [None]:
# Plot the class balance of each split; each title names the split being shown.
plot_class_distribution(train_set, 'Train Dataset Class Distribution')
plot_class_distribution(test_set, 'Test Dataset Class Distribution')
plot_class_distribution(val_set, 'Validation Dataset Class Distribution')

In [None]:
X_train, X_validation, X_test, y_train, y_validation, y_test = vectorize_dataset(train_set, test_set, val_set)

#set the evaluation set for early stop models
eval_set = [(X_validation, y_validation)]

### Random Over-sampling
RandomOverSampler randomly duplicates samples of the minority class (vishing) until it contains as many samples as the majority class (non-vishing).

In [None]:
from imblearn.over_sampling import RandomOverSampler

# define RandomOverSampler strategy
sampling = RandomOverSampler(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

In [None]:
# Display the results
df_results

### SMOTE Over-sampling methods

In [None]:
from imblearn.over_sampling import SMOTE, KMeansSMOTE , ADASYN,SVMSMOTE,KMeansSMOTE,BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

In [None]:
# define SMOTE strategy
sampling = SMOTE(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

# Display the results
df_results

### Adaptive Synthetic (ADASYN) method

In [None]:
# define ADASYN strategy
sampling = ADASYN(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

# Display the results
df_results

### KMeansSMOTE method
KMeansSMOTE applies KMeans clustering before over-sampling with SMOTE.

In [None]:
# define KMeansSMOTE strategy
sampling = KMeansSMOTE(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

# Display the results
df_results

### Borderline SMOTE method

In [None]:
# define BorderlineSMOTE strategy
sampling = BorderlineSMOTE(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

# Display the results
df_results

### SMOTE and cleaning using ENN method

In [None]:
# define SMOTEENN strategy
sampling = SMOTEENN(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

# Display the results
df_results

### SMOTETomek method 

In [None]:
# define SMOTETomek strategy
sampling = SMOTETomek(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

# Display the results
df_results

### SVMSMOTE method 

In [None]:
# define SVMSMOTE strategy
sampling = SVMSMOTE(random_state=42) #sampling_strategy='minority', sampling_strategy parameter is to balance the class to have 1:1 data samples

# Train the models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, sampling)

# Display the results
df_results

## Training models with Back-translation as text augmentation methods

In [None]:
# import the train_set, test_set, val_set
train_set = pd.read_csv('train_set_clean.csv')
test_set = pd.read_csv('test_set_clean.csv')
val_set = pd.read_csv('val_set_clean.csv')

In [None]:
train_set.head()

In [None]:
# plot the class distribution of the train_set dataset
plot_class_distribution(train_set, 'Train Dataset Class Distribution')

In [None]:
# import the augmented dataset from back-translation method
train_set_en = pd.read_csv('train_set_en_clean.csv')
train_set_ch = pd.read_csv('train_set_ch_clean.csv')
train_set_ja = pd.read_csv('train_set_ja_clean.csv')

In [None]:
train_set_en

### Training with the English-Korean augmented dataset

In [None]:
# # concatenate train_set and train_set_en dataset and make a new dataset with only train_set['corpus'] and train_set['label'] columns. Add train_set_en['corpus_en'] and train_set_en['label'] columns to the new dataset.
# train_set_en0 = train_set_en[['corpus_en', 'label']]
# train_set_en0.columns = ['corpus', 'label']
# train_set_en0

# # concatenate the augmented train_set_en dataset with the original dataset to create the new train_set dataset for training the models with back-translation method as text augmentation method.  
# train_set00 = pd.concat([train_set, train_set_en], ignore_index=True)
# train_set00

# Make new train_set dataset with only train_set['corpus'] and train_set['label'] columns.
train_set_new = train_set[['corpus', 'label']]
train_set_new.columns = ['corpus', 'label']
train_set_new

In [None]:
# Build a new train_set_en dataset that keeps only the 'corpus_en' and 'label' columns, then rename 'corpus_en' to 'corpus'.
train_set_en_new = train_set_en[['corpus_en', 'label']]
train_set_en_new.columns = ['corpus', 'label']
train_set_en_new

In [None]:
# concatenate the augmented train_set_en dataset with the original dataset to create the new train_set dataset for training the models with back-translation method as text augmentation method.
train_set_EnKo = pd.concat([train_set_new, train_set_en_new], ignore_index=True)
train_set_EnKo

In [None]:
# plot the class distribution of the new train_set dataset 
plot_class_distribution(train_set_EnKo, 'En-Ko Augmented Train Set Class Distribution')

In [None]:
# vectorize the dataset
X_train, X_validation, X_test, y_train, y_validation, y_test = vectorize_dataset(train_set_EnKo, test_set, val_set)

#set the evaluation set for early stop models
eval_set = [(X_validation, y_validation)]

### Training ML models with En-Ko augmented dataset

In [None]:
# define the ML models
models, es_models = define_models()

# Define the Back-translation name En-Ko
bt_name = 'BT-EnKo'

# train the ML models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, bt_name=bt_name)

# Display the results
df_results

### Training with the Chinese-Korean augmented dataset

In [None]:
# # concatenate train_set and train_set_ch dataset and make a new dataset with only train_set['corpus'] and train_set['label'] columns. Add train_set_ch['corpus_ch'] and train_set_ch['label'] columns to the new dataset.
# train_set_ch0 = train_set_ch[['corpus_ch', 'label']]
# train_set_ch0.columns = ['corpus', 'label']
# train_set_ch0
# 
# # concatenate the augmented train_set_ch dataset with the original dataset to create the new train_set dataset for training the models with back-translation method as text augmentation method.
# train_set0 = pd.concat([train_set, train_set_ch0], ignore_index=True)
# train_set0
# 
# # Make new train_set dataset with only train_set['corpus'] and train_set['label'] columns.
# train_set_new = train_set[['corpus', 'label']]
# train_set_new.columns = ['corpus', 'label']
# train_set_new

# Build a new train_set_ch dataset that keeps only the 'corpus_ch' and 'label' columns, then rename 'corpus_ch' to 'corpus'.
train_set_ch_new = train_set_ch[['corpus_ch', 'label']]
train_set_ch_new.columns = ['corpus', 'label']
train_set_ch_new

In [None]:
# concatenate the augmented train_set_ch dataset with the original dataset to create the new train_set dataset for training the models with back-translation method as text augmentation method.
train_set_ChKo = pd.concat([train_set_new, train_set_ch_new], ignore_index=True)
train_set_ChKo

In [None]:

# plot the class distribution of the new train_set dataset
plot_class_distribution(train_set_ChKo, 'Ch-Ko Augmented Train Set Class Distribution')

In [None]:
# vectorize the dataset
X_train, X_validation, X_test, y_train, y_validation, y_test = vectorize_dataset(train_set_ChKo, test_set, val_set)

#set the evaluation set for early stop models
eval_set = [(X_validation, y_validation)]

### Training ML models with Ch-Ko augmented dataset

In [None]:
# define the ML models
models, es_models = define_models()

# Define the Back-translation name Ch-Ko
bt_name = 'BT-ChKo'

# train the ML models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, bt_name=bt_name)

# Display the results
df_results

### Training with the Japanese-Korean augmented dataset

In [None]:
# Build a new train_set_ja dataset that keeps only the 'corpus_ja' and 'label' columns, then rename 'corpus_ja' to 'corpus'.
train_set_ja_new = train_set_ja[['corpus_ja', 'label']]
train_set_ja_new.columns = ['corpus', 'label']
train_set_ja_new

In [None]:
# concatenate the augmented train_set_ja dataset with the original dataset to create the new train_set dataset for training the models with back-translation method as text augmentation method.
train_set_JaKo = pd.concat([train_set_new, train_set_ja_new], ignore_index=True)
train_set_JaKo

In [None]:
# plot the class distribution of the new train_set dataset
plot_class_distribution(train_set_JaKo, 'Ja-Ko Augmented Train Set Class Distribution')

In [None]:
# vectorize the dataset
X_train, X_validation, X_test, y_train, y_validation, y_test = vectorize_dataset(train_set_JaKo, test_set, val_set)

# set the evaluation set for early stop models
# (it must be rebuilt after every vectorization; otherwise the early-stopping
# models would still be evaluated on the validation matrices of the previous run)
eval_set = [(X_validation, y_validation)]

### Training ML models with Ja-Ko augmented dataset

In [None]:
# define the ML models
models, es_models = define_models()

# Define the Back-translation name Ja-Ko
bt_name = 'BT-JaKo'

# train the ML models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, bt_name=bt_name)

# Display the results
df_results

## Training models with combination of all back-translated datasets (EnKo, ChKo, JaKo)

In [None]:
# concatenate the augmented train_set_new, train_set_en_new, train_set_ch_new, train_set_ja_new dataset to a new dataset

train_set_all = pd.concat([train_set_new, train_set_en_new, train_set_ch_new, train_set_ja_new], ignore_index=True)
train_set_all

# plot the class distribution of the new train_set dataset
plot_class_distribution(train_set_all, 'All BT Augmented Train Set Class Distribution')

In [None]:
# vectorize the dataset
X_train, X_validation, X_test, y_train, y_validation, y_test = vectorize_dataset(train_set_all, test_set, val_set)

#set the evaluation set for early stop models
eval_set = [(X_validation, y_validation)]

### Training ML model with Original+EnKo+ChKo+JaKo dataset 

In [None]:
# define the ML models
models, es_models = define_models()

# Define the Back-translation name
bt_name = 'BT-All'

# train the ML models
df_results = train_models_sampling(models, es_models, X_train, y_train, eval_set, X_test, y_test, bt_name=bt_name)

# Display the results
df_results

In [None]:
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

# Nested cross-validation: the inner loop (GridSearchCV) tunes the hyperparameters,
# the outer loop (cross_val_score, next cell) estimates how well the tuned model generalises.
# define the model
model = XGBClassifier(
    # early_stopping_rounds is deliberately NOT set here: GridSearchCV and
    # cross_val_score call fit() without an eval_set, and XGBoost raises an error
    # when early stopping is enabled but no validation data is supplied.
    verbosity=2,
    # n_estimators=2000,
    # tree_method='hist',
    # subsample=0.67,
    # colsample_level=0.06,
    # n_jobs=6,
    # random_state=1234
)

# define the inner evaluation procedure (used for the hyperparameter search)
cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)

# define the search space
grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [10, 20, 30],
    'learning_rate': [0.001, 0.01, 0.1],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
}

# define the search; refit=True retrains the best configuration on all the data passed to fit()
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv_inner, refit=True)

# configure the outer cross-validation procedure
cv_outer = KFold(n_splits=3, shuffle=True, random_state=1)

In [None]:
# execute the nested cross-validation
scores = cross_val_score(search, X_train, y_train, scoring='accuracy', cv=cv_outer, n_jobs=-1)

In [None]:
from numpy import mean, std

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

In [None]:
# summarize the best hyperparameter combination
# NOTE: cross_val_score fits clones of `search`, so `search` itself has no
# best_params_ until the `search.fit(X_train, y_train)` cell has been executed;
# run that cell before this one.
print('Best Hyperparameters: %s' % search.best_params_)


In [None]:
# report the best configuration
print('Config: %s' % search.best_params_)

In [None]:
# report all configurations
means = search.cv_results_['mean_test_score']
params = search.cv_results_['params']
# The loop variable is not called `mean` on purpose: that name would overwrite
# the numpy `mean` function imported for the performance report.
for mean_score, param in zip(means, params):
    print(">%.3f with: %r" % (mean_score, param))

In [None]:
#fit the model
search.fit(X_train, y_train)

In [None]:
# evaluate the model
yhat = search.predict(X_test)

In [None]:
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# use nested cross validation to evaluate the model with the best hyperparameters on the test set
# define the model
model = LGBMClassifier(
    early_stopping_rounds=10,
    verbosity=2,
    # boost_from_average=False
    # num_leaves=64,
    # n_estimators=2000,
    # feature_fraction=0.06,
    # bagging_fraction=0.67,
    # bagging_freq=1,
    # n_jobs=6,
    # random_state=1234
)
    


## Training model with feature selection

In [None]:
# import the train_set, test_set, val_set
train_set = pd.read_csv('train_set_clean.csv')
test_set = pd.read_csv('test_set_clean.csv')
val_set = pd.read_csv('val_set_clean.csv')

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif


# define the feature selection method
def select_features(X_train, y_train, X_test, k='all'):
    """Score the features with the ANOVA F-test and keep the ``k`` best ones.

    The selector is fitted on the training data only and then applied to both
    the training and the test features.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix, transformed with the fitted selector.
        k: Number of top-scoring features to keep, forwarded to ``SelectKBest``.
            The default ``'all'`` keeps every feature, which is useful for
            inspecting ``fs.scores_`` without discarding anything.

    Returns:
        tuple: ``(X_train_fs, X_test_fs, fs)`` - the transformed training
        features, the transformed test features and the fitted selector.
    """
    fs = SelectKBest(score_func=f_classif, k=k)
    # learn the feature scores from the training data only
    fs.fit(X_train, y_train)
    # apply the same selection to both splits
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs

In [None]:
# feature selection
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

In [None]:
# print the score of every feature
for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))

In [None]:
# plot the scores (one bar per feature index)
plt.bar(np.arange(len(fs.scores_)), fs.scores_)
plt.show()

In [None]:
# define the model
model = XGBClassifier(
    # early_stopping_rounds=10,
    verbosity=2,
    # n_estimators=2000,
    # tree_method='hist',
    # subsample=0.67,
    # colsample_level=0.06,
    # n_jobs=6,
    # random_state=1234
)

In [None]:
# fit the model
model.fit(X_train_fs, y_train)

In [None]:
# evaluate the model
yhat = model.predict(X_test_fs)

In [None]:
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# define the model
model = LGBMClassifier(
    # early_stopping_rounds=10,
    verbosity=2,
    # boost_from_average=False
    # num_leaves=64,
    # n_estimators=2000,
    # feature_fraction=0.06,
    # bagging_fraction=0.67,
    # bagging_freq=1,
    # n_jobs=6,
    # random_state=1234
)

In [None]:
# fit the model
model.fit(X_train_fs, y_train)

In [None]:
# evaluate the model
yhat = model.predict(X_test_fs)

In [None]:
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))    

#