In [1]:
import csv
import ast

from tqdm import tqdm

### CSV Functions

In [15]:
def load_csv_data(file_path, bool_params):
    # Initialize an empty list to store the data
    data_list = []

    # Open the CSV file for reading
    with open(file_path, newline='', encoding="utf-8") as csvfile:
        # Create a CSV reader object
        csv_reader = csv.DictReader(csvfile)
        
        # Iterate through each row in the CSV file
        for row in csv_reader:
            # Append the row (as a dictionary) to the data_list
            row["choices"] = ast.literal_eval(row["choices"])

            for param in bool_params:
                if row[param].lower() == "true":
                    row[param] = True
                elif row[param].lower() == "false":
                    row[param] = False
                else:
                    raise TypeError(f"{param} data cannot be recognized")

            data_list.append(row)
    
    return data_list

def load_all_rephrase_data(split, dir_path, file_name):
    data = {}
    
    for s in split:
        file_path = f"{dir_path}/raw_{s}{file_name}"
        # data[s] = load_csv_data(file_path, [])
        # data[s] = load_csv_data(file_path, ["concept", "name", "option"])
        data[s] = load_csv_data(file_path, ["su_id_decision", "id_concept_appearance", "su_concept_appearance"])
    return data

def save_data(samples, file_path):
    # Get the keys from the first dictionary
    header = samples[0].keys()

    # Write the data to the CSV file
    with open(file_path, 'w', newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=header)
        
        # Write the header
        writer.writeheader()
        
        # Write the data
        for row in samples:
            writer.writerow(row)

    print(f'CSV file "{file_path}" has been created with the data.')

### Filter Functions

In [16]:
from sentence_transformers import SentenceTransformer, util
minilm_model = SentenceTransformer('all-MiniLM-L12-v2')
multilingual_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

def compute_similarity(text1, text2, multilingual=False):
    if multilingual:
        model = multilingual_model
    else:
        model = minilm_model

    embeddings1 = model.encode([text1], convert_to_tensor=True)
    embeddings2 = model.encode([text2], convert_to_tensor=True)

    cosine_scores = util.cos_sim(embeddings1, embeddings2)

    return float(cosine_scores[0][0])

In [17]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from better_profanity import profanity
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

en_stemmer = PorterStemmer()
id_stemmer =  StemmerFactory().create_stemmer()

def get_input_text(item):
    return " ".join([item['question'], item['question_concept']] + item['choices']['text'])

def filter_concept(text, lang="english"):
    # Step 1: Lowercase both question and question_concept
    question = text["question"].lower()
    question_concept = text["question_concept"].lower()
    
    # Step 2: Check if question_concept appears in question
    if question_concept in question:
        return True

    # Step 3: If not, split and remove stopwords
    if lang != "sundanese":
        stop_words = stopwords.words(lang)
    else:
        stop_words = []

    concept_words = question_concept.split()
    concept_words = [word for word in concept_words if word not in stop_words]

    # Check if any of the remaining words in question_concept appear in question
    if any(word in question for word in concept_words):
        return True

    # Step 4: Stem words and check if any stem word appears in question
    
    if lang != "sundanese":
        if lang == "english":
            stemmer = en_stemmer
        elif lang == "indonesian":
            stemmer = id_stemmer
        
        question_stemmed = " ".join(stemmer.stem(word) for word in question.split())
        if any(word in question_stemmed for word in [stemmer.stem(w) for w in concept_words]):
            return True

    # Step 5: If none of the above conditions met, return False
    return False

def filter_profanity(text):
    all_texts = get_input_text(text)
    
    return not profanity.contains_profanity(all_texts)

In [18]:
split = ["validation", "test", "train"]
# en_data = load_all_rephrase_data(split, "92123", "_rephrased_name_92123.csv")
id_data = load_all_rephrase_data(split, "v3-gpt4-1106/id", ".csv")
su_data = load_all_rephrase_data(split, "v3-gpt4-1106/su", ".csv")

# id_en_data = load_all_rephrase_data(split, "backtranslation/id_en", ".csv")
# su_en_data = load_all_rephrase_data(split, "backtranslation/su_en", ".csv")
# su_id_data = load_all_rephrase_data(split, "v3-gpt4-1106/su_id", ".csv")

KeyError: 'id_concept_appearance'

In [8]:
# id_en_threshold = 0.9
# su_en_threshold = 0.85
su_id_threshold = 0.9

for s in split:
    print(f"Filtering {s} split")

    id_count = 0
    su_count = 0
    # en_count = 0
    # prof_count = 0

    # id_en_count = 0
    # su_en_count = 0
    su_id_count = 0

    for idx, item in tqdm(enumerate(id_data[s])):
        # id_en = compute_similarity(get_input_text(item), get_input_text(id_en_data[s][idx]))
        # su_en = compute_similarity(get_input_text(item), get_input_text(su_en_data[s][idx]))
        su_id = compute_similarity(get_input_text(item), get_input_text(su_id_data[s][idx]), multilingual=True)
        
        id_ca = filter_concept(item, lang="indonesian")
        su_ca = filter_concept(su_data[s][idx], lang="sundanese")

        # id_data[s][idx]["id_en_similarity"] = id_en
        # id_data[s][idx]["id_en_decision"] = bool(id_en >= id_en_threshold)
        # id_data[s][idx]["concept_appearance"] = id_ca

        # su_data[s][idx]["su_en_similarity"] = su_en
        # su_data[s][idx]["su_en_decision"] = bool(su_en >= su_en_threshold)
        su_data[s][idx]["su_id_similarity"] = su_id
        su_data[s][idx]["su_id_decision"] = bool(su_id >= su_id_threshold)
        su_data[s][idx]["concept_appearance"] = su_ca

        # item["id_en_similarity"] = id_en
        # item["id_en_decision"] = bool(id_en >= id_en_threshold)
        item["id_concept_appearance"] = id_ca
        # item["su_en_similarity"] = su_en
        # item["su_en_decision"] = bool(su_en >= su_en_threshold)
        item["su_id_similarity"] = su_id
        item["su_id_decision"] = bool(su_id >= su_id_threshold)
        item["su_concept_appearance"] = su_ca
        # item["concept_appearance"] = filter_concept(item)
        # item["not_contain_profanity"] = filter_profanity(item)
        
        # if not item["concept_appearance"]:
        #     en_count += 1
        if not item["id_concept_appearance"]:
            id_count += 1
        if not item["su_concept_appearance"]:
            su_count += 1
        # if not item["not_contain_profanity"]:
        #     prof_count += 1
        
        # if id_en < id_en_threshold:
        #     id_en_count += 1
        # if su_en < su_en_threshold:
        #     su_en_count += 1
        if su_id < su_id_threshold:
            su_id_count += 1
        
    # print(f"EN Filtered Concept: {en_count}")
    print(f"ID Erased Concept: {id_count}")
    print(f"SU Erased Concept: {su_count}")
    # print(f"ID-EN Filtered Threshold {id_en_threshold}: {id_en_count}")
    # print(f"SU-EN Filtered Threshold {su_en_threshold}: {su_en_count}")
    print(f"SU-ID Erased Threshold {su_id_threshold}: {su_id_count}")
    # print(f"Filtered Profanity: {prof_count}")

    # save_data(en_data[s], f"./filtered_data/en/{s}.csv")
    save_data(id_data[s], f"./v3-gpt4-1106/id/raw_{s}.csv")
    save_data(su_data[s], f"./v3-gpt4-1106/su/raw_{s}.csv")
    print()

Filtering validation split


274it [00:17, 15.56it/s]


ID Erased Concept: 5
SU Erased Concept: 68
SU-ID Erased Threshold 0.9: 60
CSV file "./v3-gpt4-1106/id/raw_validation.csv" has been created with the data.
CSV file "./v3-gpt4-1106/su/raw_validation.csv" has been created with the data.

Filtering test split


236it [00:14, 16.03it/s]


ID Erased Concept: 3
SU Erased Concept: 65
SU-ID Erased Threshold 0.9: 67
CSV file "./v3-gpt4-1106/id/raw_test.csv" has been created with the data.
CSV file "./v3-gpt4-1106/su/raw_test.csv" has been created with the data.

Filtering train split


2162it [02:18, 15.64it/s]

ID Erased Concept: 61
SU Erased Concept: 552
SU-ID Erased Threshold 0.9: 437
CSV file "./v3-gpt4-1106/id/raw_train.csv" has been created with the data.
CSV file "./v3-gpt4-1106/su/raw_train.csv" has been created with the data.






In [19]:
print(id_data["test"][0])

{'id': '90b30172e645ff91f7171a048582eb8b', 'question': 'Rumah susun tersebut sulit dijual oleh agen properti, karena tepat berada di samping gedung apa?', 'question_concept': 'rumah susun', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['perkembangan pinggiran kota', 'gedung apartemen', 'halte bus', 'jakarta', 'pinggiran kota']}, 'answerKey': 'B', 'concept': 'False', 'name': 'False', 'option': 'True', 'id_concept_appearance': True, 'su_id_similarity': '0.8857952356338501', 'su_id_decision': False, 'su_concept_appearance': False}


In [20]:
print(su_data["test"][0])

{'id': '90b30172e645ff91f7171a048582eb8b', 'question': 'Flat ieu hésé pikeun agén properti pikeun dijual, sabab caket sareng gedong naon?', 'question_concept': 'rumah susun', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['pangwangunan suburban', 'wangunan apartemen', 'eureun beus', 'Jakarta', 'suburbs']}, 'answerKey': 'B', 'su_id_similarity': '0.8857952356338501', 'su_id_decision': 'False', 'concept_appearance': 'False'}


In [21]:
for s in split:
    # en_filtered = []
    id_filtered = []
    su_filtered = []
    for idx, item in tqdm(enumerate(id_data[s])):
        if item["su_id_decision"] and item["id_concept_appearance"] and item["su_concept_appearance"]: # and item["not_contain_profanity"] and item["su_en_decision"] and item["id_en_decision"] and item["concept_appearance"]:
            id_filtered.append({
                'id': item['id'],
                'question': item['question'],
                'question_concept': item['question_concept'],
                'choices': {
                    'label': item['choices']['label'],
                    'text': item['choices']['text']
                },
                'answerKey': item['answerKey']
            })
            # id_filtered.append({
            #     'id': id_data[s][idx]['id'],
            #     'question': id_data[s][idx]['question'],
            #     'question_concept': id_data[s][idx]['question_concept'],
            #     'choices': {
            #         'label': id_data[s][idx]['choices']['label'],
            #         'text': id_data[s][idx]['choices']['text']
            #     },
            #     'answerKey': id_data[s][idx]['answerKey']
            # })
            su_filtered.append({
                'id': su_data[s][idx]['id'],
                'question': su_data[s][idx]['question'],
                'question_concept': su_data[s][idx]['question_concept'],
                'choices': {
                    'label': su_data[s][idx]['choices']['label'],
                    'text': su_data[s][idx]['choices']['text']
                },
                'answerKey': su_data[s][idx]['answerKey']
            })
    print(f"Count Remaining Data for {s}: {len(id_filtered)}")
    # save_data(en_filtered, f"./filtered_data/en/{s}.csv")
    save_data(id_filtered, f"./v3-gpt4-1106/id/filtered_{s}.csv")
    save_data(su_filtered, f"./v3-gpt4-1106/su/filtered_{s}.csv")

274it [00:00, 274477.98it/s]


Count Remaining Data for validation: 163
CSV file "./v3-gpt4-1106/id/filtered_validation.csv" has been created with the data.
CSV file "./v3-gpt4-1106/su/filtered_validation.csv" has been created with the data.


236it [00:00, 235175.99it/s]


Count Remaining Data for test: 130
CSV file "./v3-gpt4-1106/id/filtered_test.csv" has been created with the data.
CSV file "./v3-gpt4-1106/su/filtered_test.csv" has been created with the data.


2162it [00:00, 432513.84it/s]

Count Remaining Data for train: 1299
CSV file "./v3-gpt4-1106/id/filtered_train.csv" has been created with the data.
CSV file "./v3-gpt4-1106/su/filtered_train.csv" has been created with the data.



