## Preprocess

This notebook shows how we preprocessed our datasets.

You don't actually need to run it, as we have already provided all the preprocessed datasets. :)

In [7]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Separate the corpus

First of all, we separate the corpus into 7 parts, one for each language.

Because the corpus file is very large, loading it completely into memory almost causes an out-of-memory crash every time.

So we split it into:
- fr_corpus.json
- ko_corpus.json
- es_corpus.json
- en_corpus.json
- it_corpus.json
- de_corpus.json
- ar_corpus.json

In [None]:
# Path to the full corpus JSON file that gets split by language below.
input_file = "Data/corpus.json/corpus.json"

def split_json_by_language(input_file):
    """Split a JSON corpus into one ``<lang>_corpus.json`` file per language.

    The file at ``input_file`` is parsed in full and iterated item by item.
    Items are grouped by the value of their "lang" key; items whose "lang"
    is missing or falsy are left out. Every group is then written to
    ``<lang>_corpus.json`` in the current working directory, and one
    confirmation line is printed per file written.

    Args:
        input_file: path of the JSON file to split.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        records = json.load(source)

    # Bucket the records by language code, keeping their original order.
    by_language = {}
    for record in records:
        code = record.get("lang")
        if not code:
            continue
        by_language.setdefault(code, []).append(record)

    # One output file per language code that was seen.
    for code, bucket in by_language.items():
        target = f"{code}_corpus.json"
        with open(target, 'w', encoding='utf-8') as sink:
            json.dump(bucket, sink, ensure_ascii=False, indent=4)

        print(f"Saved {code} language data to {target}")

# Run the split: writes one "<lang>_corpus.json" file per language into the current working directory.
split_json_by_language(input_file)


### Filter the json

Then we truncate the longer documents in the corpus, because the beginning of a document is usually the part most related to its topic. This loses some accuracy, but it is more conducive to our extraction of max_features.

In [None]:
# language list
language_list = ['ar', 'es', 'fr', 'de', 'it', 'ko', 'en']

# Texts longer than this many characters are cut down to this prefix.
MAX_TEXT_CHARS = 30000

# Truncate every per-language corpus and save it under a new name.
for lang in language_list:
    input_file = f"{lang}_corpus.json"
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Keep only the first MAX_TEXT_CHARS characters of each text. Slicing
    # leaves shorter texts unchanged, so no separate length check is needed.
    for doc in data:
        doc['text'] = doc['text'][:MAX_TEXT_CHARS]

    # Save this language's truncated documents to a new JSON file.
    with open(f'{lang}_corpus_filtered.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

#### visualize the length of texts after filtering

In [None]:
# Load the filtered Spanish corpus. The filtering step names its outputs
# "<lang>_corpus_filtered.json" using the two-letter language codes, so the
# Spanish file is "es_corpus_filtered.json"; no "spanish_corpus_filtered.json"
# is ever written.
with open('es_corpus_filtered.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Character length of every document text after truncation.
lengths = [len(doc['text']) for doc in data]

# Histogram of the text lengths.
plt.hist(lengths, bins=30)
plt.xlabel('Text Length')
plt.ylabel('Number of Documents')
plt.title('Text Length Distribution')
plt.show()


### Stemming

- Here we use NLTK for stemming. As NLTK doesn't support the Korean language, we use kiwipiepy for Korean instead.

In [14]:
# Language code to process in the tokenization step below
# (get_stemmer_and_stopwords accepts: en, fr, de, it, es, ar, ko).
lang = "ko"

In [None]:
import numpy as np
import pandas as pd
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import FrenchStemmer, GermanStemmer, ItalianStemmer, SpanishStemmer, EnglishStemmer, ArabicStemmer

from kiwipiepy import Kiwi  # for Korean

# Fetch the NLTK data this notebook needs. Besides the stop-word lists,
# word_tokenize() relies on the Punkt tokenizer models: older NLTK releases
# look for 'punkt', newer ones for 'punkt_tab', so request both.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

from spacy.lang.ko.stop_words import STOP_WORDS as ko_stop


def get_stemmer_and_stopwords(language):
    """Look up the stemmer and the stop-word list for a language code.

    Supported codes are 'en', 'fr', 'de', 'it', 'es' and 'ar' (the matching
    NLTK Snowball stemmer with the NLTK stop-word list of that language) and
    'ko' (a ``Kiwi`` instance with the imported ``ko_stop`` words as a list).

    Args:
        language: language code.

    Returns:
        A ``(stemmer, stop_words)`` tuple.

    Raises:
        ValueError: if ``language`` is none of the supported codes.
    """
    # Each entry pairs a code with a zero-argument factory, so only the
    # requested language's stemmer is built and only its stop words are loaded.
    factories = (
        ('en', lambda: (EnglishStemmer(), stopwords.words("english"))),
        ('fr', lambda: (FrenchStemmer(), stopwords.words("french"))),
        ('de', lambda: (GermanStemmer(), stopwords.words("german"))),
        ('it', lambda: (ItalianStemmer(), stopwords.words("italian"))),
        ('es', lambda: (SpanishStemmer(), stopwords.words("spanish"))),
        ('ar', lambda: (ArabicStemmer(), stopwords.words("arabic"))),
        ('ko', lambda: (Kiwi(), list(ko_stop))),
    )

    for code, build in factories:
        if language == code:
            return build()

    raise ValueError(f"Unsupported language: {language}")

# Maps every ASCII punctuation character to a space; built once instead of
# on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def process_text(text, stemmer, stop_words, language):
    """Turn one raw text into a list of normalized tokens.

    ASCII punctuation is replaced by spaces first. For every language except
    'ko' the text is split with ``word_tokenize``, lowercased, filtered
    against ``stop_words`` and stemmed with ``stemmer.stem``. For 'ko',
    ``stemmer`` is expected to offer ``analyze`` (as ``Kiwi`` does); only
    tokens whose tag starts with 'N' and whose form is not a stop word are
    kept, and their surface forms are returned without stemming.

    Args:
        text: the raw text. A missing value (``None`` or a float NaN, which
            is what pandas uses for empty cells) yields an empty list.
        stemmer: stemmer object for ``language``.
        stop_words: collection of stop words to drop.
        language: language code; 'ko' selects the Kiwi branch.

    Returns:
        The list of tokens.
    """
    # Missing cells carry no tokens; without this guard they crash on
    # ``.translate``. ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = text.translate(_PUNCT_TO_SPACE)

    # Membership is tested once per token, so use a set when given a list.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    if language != 'ko':
        # Lowercase each token once, then filter and stem.
        lowered = (word.lower() for word in word_tokenize(text))
        return [stemmer.stem(word) for word in lowered if word not in stop_words]

    # analyze() yields a list of (tokens, score) results; [0][0] is the token
    # list of the first one (see the kiwipiepy documentation).
    tokens = stemmer.analyze(text)[0][0]
    return [
        token.form
        for token in tokens
        if token.tag.startswith('N') and token.form not in stop_words
    ]

def tokenizer(df, col_name, language, mytype):
    """Tokenize one text column of a dataframe and save the result as CSV.

    Args:
        df: dataframe holding the texts; it is copied, not modified.
        col_name: name of the column that contains the raw text.
        language: language code passed to ``get_stemmer_and_stopwords``.
        mytype: label used in the output file name (the notebook passes
            "corpus" or "query").

    Returns:
        A copy of ``df`` with an extra ``<col_name>_token`` column holding
        the token list of every row.

    Side effects:
        Writes the new dataframe to ``Data/test/bm25_<language>_<mytype>.csv``
        (creating the folder when it does not exist) and prints two
        progress messages.
    """
    # get stemmer and stopwords based on language
    stemmer, stop_words = get_stemmer_and_stopwords(language)
    # Stop words are looked up once per token, so convert them to a set once
    # here instead of scanning a list every time.
    stop_words = frozenset(stop_words)

    # create a new column with tokenized text
    new_df = df.copy()
    new_col_name = col_name + "_token"
    new_df[new_col_name] = new_df[col_name].apply(
        lambda text: process_text(text, stemmer, stop_words, language)
    )

    print(f"Data has been successfully tokenized for language: {language}")

    # save the new data as CSV
    output_path = f"Data/test/bm25_{language}_{mytype}.csv"
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    new_df.to_csv(output_path, index=False)
    print(f"New data saved as CSV at {output_path}!")

    return new_df

### Tokenization


In [None]:
# Inputs: the truncated corpus of the selected language, and the query file.
CORPUS_PATH = f"{lang}_corpus_filtered.json"
QUERY_PATH = "Data/dev.csv"

corpus = pd.read_json(CORPUS_PATH)
query = pd.read_csv(QUERY_PATH)

# Keep only the rows whose "lang" matches the language chosen above.
corpus_data = corpus.loc[corpus["lang"] == lang]
query_data = query.loc[query["lang"] == lang]

# Tokenize the corpus texts and the queries; each call also writes a CSV file.
corpus_data = tokenizer(corpus_data, "text", lang, "corpus")
query_data = tokenizer(query_data, "query", lang, "query")

#### Check the results here

We will use the 'text_token' column for the following work.

In [11]:
# Load preprocessed Spanish corpus and query files (this overrides the `lang` chosen earlier).
# NOTE(review): these paths differ from the one tokenizer() writes to
# ("Data/test/bm25_{language}_{mytype}.csv") -- presumably the files were
# moved/renamed afterwards; confirm.
lang = 'es'
corpus_data = pd.read_csv(f"Data/preprocess_corpus/bm25corpus_{lang}.csv")
query_data = pd.read_csv(f"Data/preprocess_query/bm25query_{lang}.csv")

# Show the first rows of the corpus.
# NOTE(review): in the displayed rows 'text_token' is a quoted string (the list
# repr stored in the CSV), so it presumably needs parsing before use -- confirm.

corpus_data.head()

Unnamed: 0.1,Unnamed: 0,docid,text,lang,text_token
0,0,doc-es-214,Beautiful Boy: Siempre serás mi hijo\n\nArgume...,es,"['beautiful', 'boy', 'siempr', 'hij', 'argumen..."
1,1,doc-es-9159,Vautour (buque de 1797)\n\nCorsario francés\n\...,es,"['vautour', 'buqu', '1797', 'corsari', 'france..."
2,2,doc-es-5214,Certificados de Capital de Desarrollo\n\nLos C...,es,"['certific', 'capital', 'desarroll', 'certific..."
3,3,doc-es-1780,Chispazos de tradición\n\nCaracterísticas del ...,es,"['chispaz', 'tradicion', 'caracterist', 'progr..."
4,4,doc-es-9639,"Jericó\n\nToponimia\n\nEn Canaán, en el moment...",es,"['jeric', 'toponimi', 'canaan', 'moment', 'con..."
