# Scientia-Model


## Import Libraries and Configuration


In [None]:
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from mpstemmer import MPStemmer
from polars import col, read_csv, String
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Raw input dataset read by the first preprocessing step.
file_path_dataset = "temp/repository_pnj_20212023.csv"
# Intermediate CSV files: each preprocessing step reads the previous step's
# file and writes its own, so every stage can be inspected on disk.
file_path_select_feature = "temp/01_select_feature.csv"
file_path_lowercase = "temp/02_lowercase.csv"
file_path_remove_empty_abstract = "temp/03_remove_empty_abstract.csv"
file_path_remove_dash_abstract = "temp/04_remove_dash_abstract.csv"
file_path_remove_same_title_abstract = "temp/05_remove_same_title_abstract.csv"
file_path_merge = "temp/06_merge.csv"
file_path_remove_non_alphabet = "temp/07_remove_non_alphabet.csv"
file_path_unique = "temp/08_unique.csv"
file_path_add_lang = "temp/09_add_lang.csv"
file_path_filter_lang = "temp/10_filter_lang.csv"
file_path_stem = "temp/11_stem.csv"
file_path_remove_stopwords = "temp/12_remove_stopwords.csv"
# Final output written by the last (sort) step.
file_path_dataset_clean = "temp/repository_pnj_20212023clean.csv"


def print_file_table(output):
    """Print the relative path of a CSV file followed by its parsed contents.

    Args:
        output: Path of the CSV file to load and display.
    """
    label = f"./{output}"
    table = read_csv(output)
    print(label, table)

## Preprocessing


### 1. Select Features


In [None]:
# List the column names of the raw dataset; as a bare trailing expression it
# is shown as the cell's output when run in a notebook.
read_csv(file_path_dataset).columns

In [None]:
input = file_path_dataset
output = file_path_select_feature

# Copy the title and abstract into the working columns f1 and f2. The
# original columns are kept alongside them (with_columns only adds columns).
feature_columns = [col("title").alias("f1"), col("abstract").alias("f2")]

frame = read_csv(input).with_columns(feature_columns)
frame.write_csv(output)

print_file_table(output)

### 2. Lowercase


In [None]:
input = file_path_select_feature
output = file_path_lowercase

# Lowercase both working columns in place (the expressions keep their names).
lowercased_columns = [
    col("f1").str.to_lowercase(),
    col("f2").str.to_lowercase(),
]

frame = read_csv(input).with_columns(lowercased_columns)
frame.write_csv(output)

print_file_table(output)

### 3. Remove Missing Values


#### 3.1. Remove Empty Abstract


In [None]:
input = file_path_lowercase
output = file_path_remove_empty_abstract

# Turn empty-string abstracts into nulls so they can be dropped as missing.
empty_abstract_to_null = col("f2").replace("", None)

# drop_nulls() is called without a subset, so a null in any column (not only
# f2) removes the row.
frame = read_csv(input).with_columns(empty_abstract_to_null).drop_nulls()
frame.write_csv(output)

print_file_table(output)

#### 3.2. Remove Dash-Only Abstracts


In [None]:
input = file_path_remove_empty_abstract
output = file_path_remove_dash_abstract

# An abstract consisting of a single "-" is a placeholder; turn it into a
# null so the row is dropped like any other missing abstract.
dash_abstract_to_null = col("f2").replace("-", None)

# drop_nulls() is called without a subset, so a null in any column (not only
# f2) removes the row.
frame = read_csv(input).with_columns(dash_abstract_to_null).drop_nulls()
frame.write_csv(output)

print_file_table(output)

#### 3.3. Remove Abstracts Identical to the Title


In [None]:
input = file_path_remove_dash_abstract
output = file_path_remove_same_title_abstract

# Keep only rows whose abstract is not just a copy of the title.
abstract_differs_from_title = col("f1") != col("f2")

frame = read_csv(input).filter(abstract_differs_from_title)
frame.write_csv(output)

print_file_table(output)

### 4. Merge Title and Abstract


In [None]:
input = file_path_remove_same_title_abstract
output = file_path_merge

# Join title and abstract with a single space into one text column "f".
merged_text = (col("f1") + " " + col("f2")).alias("f")

frame = read_csv(input).with_columns(merged_text)
# The separate working columns are no longer needed once "f" exists.
frame = frame.drop(["f1", "f2"])
frame.write_csv(output)

print_file_table(output)

### 5. Remove Non-Alphabetic Characters


In [None]:
input = file_path_merge
output = file_path_remove_non_alphabet

# Collapse every run of characters outside a-z into a single space. The text
# was lowercased in an earlier step, so only lowercase letters are kept.
letters_only = col("f").str.replace_all(r"[^a-z]+", " ")

frame = read_csv(input).with_columns(letters_only)
frame.write_csv(output)

print_file_table(output)

### 6. Remove Duplicates


In [None]:
input = file_path_remove_non_alphabet
output = file_path_unique

# Keep one row per distinct text in "f". No `keep` or `maintain_order`
# argument is passed, so polars' defaults decide which duplicate survives.
frame = read_csv(input)
deduplicated = frame.unique("f")
deduplicated.write_csv(output)

print_file_table(output)

### 7. Filter Based on Language


#### 7.1. Add Column Language


In [None]:
from langdetect import DetectorFactory

input = file_path_unique
output = file_path_add_lang

# langdetect's detection is randomized by default, so the same text can get a
# different label on different runs. Its documentation recommends fixing the
# seed before the first detection to get consistent results; without this the
# language filter in the next step is not reproducible.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code langdetect assigns to `text`.

    Args:
        text: The string to classify.

    Returns:
        The code returned by `langdetect.detect`, or "unknown" when detection
        raises `LangDetectException`.
    """
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


# Add the detected language of each merged text as a new "lang" column.
read_csv(input).with_columns(
    col("f").map_elements(detect_language, return_dtype=String).alias("lang")
).write_csv(output)

print_file_table(output)

#### 7.2. Remove Non-"Bahasa Indonesia"


In [None]:
input = file_path_add_lang
output = file_path_filter_lang

# Keep only rows whose detected language code is "id".
is_indonesian = col("lang") == "id"

frame = read_csv(input).filter(is_indonesian)
# The helper column has served its purpose once the filter is applied.
frame = frame.drop("lang")
frame.write_csv(output)

print_file_table(output)

### 8. Stemming


In [None]:
input = file_path_filter_lang
output = file_path_stem

# Build the stemmer once for the whole column; constructing a new MPStemmer
# for every row repeats its setup work for no benefit.
_mp_stemmer = MPStemmer()


def stem_text(text):
    """Stem `text` using the shared MPStemmer instance.

    Args:
        text: The string to stem.

    Returns:
        Whatever `MPStemmer.stem_kalimat` returns for `text`
        (presumably the stemmed sentence; "kalimat" is Indonesian for
        "sentence").
    """
    return _mp_stemmer.stem_kalimat(text)


# Replace each merged text in "f" with its stemmed form.
read_csv(input).with_columns(
    col("f").map_elements(stem_text, return_dtype=String)
).write_csv(output)

print_file_table(output)

### 9. Remove Stopwords


In [None]:
input = file_path_stem
output = file_path_remove_stopwords

# Create the stop word remover once for the whole column; building a new
# factory and remover for every row repeats the same setup each time.
_stop_word_remover = StopWordRemoverFactory().create_stop_word_remover()


def remove_stopwords(text):
    """Remove stop words from `text` using the shared Sastrawi remover.

    Args:
        text: The string to filter.

    Returns:
        The result of the remover's `remove` call on `text`.
    """
    return _stop_word_remover.remove(text)


# Replace each text in "f" with its stop-word-free form.
read_csv(input).with_columns(
    col("f").map_elements(remove_stopwords, return_dtype=String)
).write_csv(output)

print_file_table(output)

### 10. Sort


In [None]:
input = file_path_remove_stopwords
output = file_path_dataset_clean

# Order the cleaned rows by their "url" column and write the final dataset.
frame = read_csv(input)
ordered = frame.sort("url")
ordered.write_csv(output)

print_file_table(output)