# Package preparation

In [1]:
# standard
import numpy as np
import pandas as pd

# visualize
from tqdm.notebook import tqdm
from itertools import product

import multiprocessing

# system
import pickle     ## saving library
import os         ## file manager
import sys
from multiprocessing import Pool

import re         ## preprocessing text library
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')     # download toolkit for textblob.TextBlob.words

from textblob import TextBlob
from nltk.stem import PorterStemmer     # stemmer: reduces inflected forms such as attacker, attacked, attacking -> attack
# Shared stemmer instance.
st = PorterStemmer()

# English stop-word list shipped with NLTK (needs the 'stopwords' corpus downloaded above).
stop_words = stopwords.words('english')
# Additional hand-written stop-word list that is checked together with `stop_words`
# in preprocess_text.preprocess_Abstract. It ends with the single letters "b".."z"
# ("a" is the first entry). A few entries are repeated ("above", "the"); the
# repeats are harmless because the list is only used for membership tests.
stop = ["a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount",  "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as",  "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "isnt", "it", "its", "itself", "keep", "keeps", "kept", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "names", "named", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "puts", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "sees", "serious", "several", "she", "should", "show", 
"shows", "showed", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Working dir
work_path = "./data"
# The checkpoint folder lives *inside* the working dir. The separator is written
# explicitly, matching how the raw data paths are built (work_path + "/raw_data/...");
# without it the path would collapse to "./datacheckpoint/".
# The trailing "/" is kept because later cells append sub-folder names directly.
# NOTE(review): any other notebook that still reads from "./datacheckpoint/" must
# be pointed at this location.
checkpoint_path = work_path + "/checkpoint/"

# create checkpoint path if not exists (exist_ok avoids the check-then-create race)
os.makedirs(checkpoint_path, exist_ok=True)

# Preprocessing tool 

In [3]:
class preprocess_text():
    """Text cleaning routines, one public method per kind of field.

    Every method currently applies the same cleaning (see
    ``preprocess_Abstract``); the separate names exist so that a field can get
    its own rules later without changing callers.

    The methods read the module-level stop-word lists ``stop_words`` and
    ``stop``.
    """

    def preprocess_Abstract(self, text):
        """Lower-case ``text``, keep only letters and drop stop words.

        Parameters:
            text: the raw field value. Anything that is not a ``str`` (for
                example a missing value read from a CSV) is returned unchanged.

        Returns:
            For ``str`` input, the remaining words joined by single spaces
            (possibly the empty string); otherwise ``text`` itself.
        """
        if not isinstance(text, str):
            return text

        text = text.lower()
        # Digits and literal '+' characters are deleted outright (no space is
        # inserted), so "a1b" becomes "ab". Raw string: "\d" is not a valid
        # escape sequence in a normal string literal.
        text = re.sub(r"[\d+]", "", text)
        # Everything that is still not a lower-case ASCII letter becomes a
        # word separator.
        text = re.sub(r"[^a-z]", " ", text)

        # One set makes each membership test O(1) instead of scanning two lists.
        excluded = set(stop_words).union(stop)
        # split() without arguments discards empty tokens, which yields the
        # same result as joining, stripping and collapsing repeated spaces.
        kept_words = [word for word in text.split() if word not in excluded]
        return " ".join(kept_words)

    def preprocess_Title(self, text):
        """Clean a title; same rules as ``preprocess_Abstract``."""
        return self.preprocess_Abstract(text)

    def preprocess_Aims(self, text):
        """Clean an aims text; same rules as ``preprocess_Abstract``."""
        return self.preprocess_Abstract(text)

    def preprocess_Keywords(self, text):
        """Clean a keywords field; same rules as ``preprocess_Abstract``."""
        return self.preprocess_Abstract(text)

class preprocess_tool():
    """Applies the cleaning methods of a ``preprocess_text``-like object to
    selected columns of a DataFrame."""

    def __init__(self, tool_preprocessText = preprocess_text()):
        # The default instance is created once, when the class is defined, and
        # shared by every preprocess_tool built without an argument.
        self.tool_preprocessText = tool_preprocessText

    def get_preprocessed_data(self, dataframe, 
                              preprocess_columns = ['title', 'abstract', 'keywords'],
                              preprocessing_type = ['Title', 'Abstract', 'Keywords'],
                              keep_columns = ['itr'],
                              n_jobs=4):
        '''Copy selected columns of ``dataframe`` and clean the text columns.

           -Parameters:
              dataframe (pandas DataFrame): input data; it is not modified.
              preprocess_columns: names of the input columns to clean.
              preprocessing_type: one entry per entry of preprocess_columns. The
                  entry is the name of the output column, and the column is
                  cleaned with the method ``preprocess_<entry>`` of the text
                  tool (e.g. 'Title' -> preprocess_Title). A column whose type
                  has no such method is copied without cleaning.
              keep_columns: columns copied to the output unchanged.
              n_jobs: number of worker processes handed to multiprocessing.Pool.
                  With n_jobs == 1 the work runs in the current process and no
                  pool is created.
           -Return:
              (pandas DataFrame): columns preprocessing_type + keep_columns,
              with the index of ``dataframe``.
           -Raises:
              ValueError: preprocess_columns and preprocessing_type differ in length.

           NOTE(review): where multiprocessing uses the "spawn" start method
           (the default on Windows), worker processes must be able to import the
           text tool's class. A class defined only inside a notebook may not be
           importable there - confirm, or pass n_jobs=1.
        '''
        if len(preprocess_columns) != len(preprocessing_type):
            raise ValueError(
                "preprocess_columns and preprocessing_type must have the same length"
            )

        source_columns = list(preprocess_columns) + list(keep_columns)
        target_columns = list(preprocessing_type) + list(keep_columns)
        # Select the inputs (raises KeyError for a missing column) and rename
        # them positionally to the output names.
        output_data = dataframe[source_columns].copy()
        output_data.columns = target_columns

        # Pair every requested type with its cleaning method, if there is one.
        jobs = []
        for column in preprocessing_type:
            method = getattr(self.tool_preprocessText, 'preprocess_' + column, None)
            if callable(method):
                jobs.append((column, method))
        if not jobs:
            return output_data

        if n_jobs == 1:
            for column, method in jobs:
                output_data[column] = [method(value) for value in output_data[column]]
        else:
            # A single pool serves all columns instead of one pool per column.
            with Pool(n_jobs) as p:
                for column, method in jobs:
                    output_data[column] = p.map(method, output_data[column])
        return output_data

def labelling_data(series, category, unknown_label=0):
    '''Turn category values into integer labels.

       Each element of ``series`` receives the position of its value within
       ``category``.

       -Parameter:
          series(pandas Series or other sequence): the values to label.
          category(iterable): the known category values; the label of a value
              is its position in this iterable. If a value occurs more than
              once, its last position is used.
          unknown_label(int): label for elements that match no category.
              The default 0 keeps the historical behaviour, in which such
              elements cannot be told apart from the first category; pass e.g.
              -1 to make them visible.
        -Return: 
          (np array): integer labels, one per element of ``series``.
    '''
    values = np.asarray(series)
    labels = np.full(len(values), unknown_label, dtype=int)
    for position, value in enumerate(category):
        # Boolean mask of the elements equal to this category value.
        labels[values == value] = position
    return labels

# Preprocess data

In [4]:
# Preprocessing driver used by the cells below; it is given its own
# preprocess_text instance rather than relying on the constructor default.
tool_preprocess = preprocess_tool(preprocess_text())

In [7]:
# Columns of the article files and the preprocessing applied to each of them.
ARTICLE_COLUMNS = ["title", "abstract", "keywords"]
ARTICLE_TYPES = ["Title", "Abstract", "Keywords"]


def load_and_preprocess(split_name, file_name, preprocess_columns, preprocessing_type):
    '''Read one raw CSV from <work_path>/raw_data and preprocess it.

       -Parameter:
          split_name(str): name shown in the progress message.
          file_name(str): file name inside the raw_data folder.
          preprocess_columns: input columns to clean.
          preprocessing_type: cleaning type / output name for each of them.
        -Return:
          (pandas DataFrame): the preprocessed columns plus the "itr" column.
    '''
    print("Preprocessing " + split_name + " ....")
    raw = pd.read_csv(work_path + "/raw_data/" + file_name, encoding = "ISO-8859-1")
    return tool_preprocess.get_preprocessed_data(
        raw,
        preprocess_columns = preprocess_columns,
        preprocessing_type = preprocessing_type,
        keep_columns = ["itr"],
        n_jobs=multiprocessing.cpu_count()
        )


data_train = load_and_preprocess("train", "data_splited_train.csv", ARTICLE_COLUMNS, ARTICLE_TYPES)
data_validate = load_and_preprocess("validate", "data_splited_validate.csv", ARTICLE_COLUMNS, ARTICLE_TYPES)
data_test = load_and_preprocess("test", "data_origin_test.csv", ARTICLE_COLUMNS, ARTICLE_TYPES)
data_aims = load_and_preprocess("aims", "aims_scopes.csv", ["aims"], ["Aims"])

# The label of a row is the position of its "itr" value in the aims table, so
# data_aims must keep its original row order.
for split in (data_train, data_validate, data_test):
    split['Label'] = labelling_data(split["itr"], data_aims["itr"])


preprocessed_data_path = checkpoint_path + 'preprocessed_data/'
os.makedirs(preprocessed_data_path, exist_ok=True)

data_train.to_csv(preprocessed_data_path + "01_train.csv", index=False)
data_validate.to_csv(preprocessed_data_path + "01_validate.csv", index=False)
data_test.to_csv(preprocessed_data_path + "01_test.csv", index=False)
data_aims.to_csv(preprocessed_data_path + "01_aims.csv", index=False)

Preprocessing train ....
