##### Dependensi

In [2]:
import re

import nltk
from nltk.stem import WordNetLemmatizer

# import numpy as np

from datasets import load_dataset

#### Load Data

In [6]:
# Load the "jhu-clsp/jfleg" dataset through the `datasets` library and print the
# summary of its splits (names, feature columns, row counts).
data = load_dataset("jhu-clsp/jfleg")
print(data)

DatasetDict({
    validation: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 755
    })
    test: Dataset({
        features: ['sentence', 'corrections'],
        num_rows: 748
    })
})


In [7]:
# Grab both splits, then pull the two columns of each split into their own variables.
validation, test = data["validation"], data["test"]

validation_sentence, validation_corrections = (
    validation["sentence"],
    validation["corrections"],
)
test_sentence, test_corrections = test["sentence"], test["corrections"]

In [12]:
# Preview the first five source sentences of the validation split, numbered from 1.
print("Validation sentence : ")
for index, val_sen in enumerate(validation_sentence[:5], start=1): 
    print(f"{index}, {val_sen}")
print("\n")
# Preview the corrections of the first five items as "<item>. <correction>. <text>".
print("Validation corrections : ")
for i, val_cor in enumerate(validation_corrections[:5], start=1): 
    for j, cor in enumerate(val_cor, start=1):
        print (f"{i}. {j}. {cor}")


Validation sentence : 
1, So I think we can not live if old people could not find siences and tecnologies and they did not developped . 
2, For not use car . 
3, Here was no promise of morning except that we looked up through the trees we saw how low the forest had swung . 
4, Thus even today sex is considered as the least important topic in many parts of India . 
5, image you salf you are wark in factory just to do one thing like pot taire on car if they fire you you will destroy , becouse u dont know more than pot taire in car . 


Validation corrections : 
1. 1. So I think we would not be alive if our ancestors did not develop sciences and technologies . 
1. 2. So I think we could not live if older people did not develop science and technologies . 
1. 3. So I think we can not live if old people could not find science and technologies and they did not develop . 
1. 4. So I think we can not live if old people can not find the science and technology that has not been developed . 
2. 1.

#### 1. Pra-pemrosesan Data

##### 1.1. Pembersihan Data
Data dibersihkan dari karakter yang tidak perlu dibaca, spasi berlebih, dan karakter khusus.

In [3]:
def clean_sentence(sentence):
    """Return ``sentence`` without punctuation and with whitespace normalised.

    Every character that is neither a word character (``\\w``, which includes the
    underscore) nor whitespace is deleted outright, so "don't" becomes "dont".
    Runs of whitespace are then collapsed to a single space and the ends trimmed.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

In [32]:
def validation_clean(features):
    """Clean every sentence of a flat collection with ``clean_sentence``.

    Despite the name, this is not specific to the validation split: the notebook
    applies it to the ``sentence`` column of both splits.

    Returns a new list with one cleaned string per input sentence.
    """
    return [clean_sentence(sentence) for sentence in features]

def test_clean(features):
    """Clean every sentence of a nested collection with ``clean_sentence``.

    Despite the name, this is not specific to the test split: the notebook applies
    it to the ``corrections`` column of both splits, where each item is itself a
    group of sentences.

    Returns a new list of lists with the same nesting as the input.
    """
    return [[clean_sentence(sentence) for sentence in group] for group in features]

# Source sentences are flat lists, so they go through validation_clean;
# corrections hold several sentences per item, so they go through test_clean.
validation_sentence, test_sentence = (
    validation_clean(validation_sentence),
    validation_clean(test_sentence),
)
validation_corrections, test_corrections = (
    test_clean(validation_corrections),
    test_clean(test_corrections),
)


In [33]:
# Preview the first five cleaned source sentences, numbered from 1.
print("Validation sentence (clean) : ")
for index, val_sen in enumerate(validation_sentence[:5], start=1): 
    print(f"{index}, {val_sen}")
print("\n")
# Preview the cleaned corrections of the first five items as "<item>. <correction>. <text>".
print("Validation corrections (clean) : ")
for i, val_cor in enumerate(validation_corrections[:5], start=1): 
    for j, cor in enumerate(val_cor, start=1):
        print (f"{i}. {j}. {cor}")

Validation sentence (clean) : 
1, So I think we can not live if old people could not find siences and tecnologies and they did not developped
2, For not use car
3, Here was no promise of morning except that we looked up through the trees we saw how low the forest had swung
4, Thus even today sex is considered as the least important topic in many parts of India
5, image you salf you are wark in factory just to do one thing like pot taire on car if they fire you you will destroy becouse u dont know more than pot taire in car


Validation corrections (clean) : 
1. 1. So I think we would not be alive if our ancestors did not develop sciences and technologies
1. 2. So I think we could not live if older people did not develop science and technologies
1. 3. So I think we can not live if old people could not find science and technologies and they did not develop
1. 4. So I think we can not live if old people can not find the science and technology that has not been developed
2. 1. Not for use 

##### 1.2. Normalisasi Data
Data diubah menjadi huruf kecil, lalu setiap kata dikonversi ke bentuk dasarnya (lematisasi).

In [4]:
# Download the NLTK data packages 'wordnet' and 'omw-1.4'; the lemmatization cells
# that follow use WordNetLemmatizer (presumably backed by these corpora — confirm
# whether 'omw-1.4' is still required by the installed NLTK version).
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aditt\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aditt\AppData\Roaming\nltk_data...


True

In [34]:
def lemmatize_sentence(sentence):
    """Lower-case ``sentence`` and lemmatize each whitespace-separated token.

    Tokens are handed to ``WordNetLemmatizer.lemmatize`` one by one without a
    part-of-speech argument, then re-joined with single spaces.

    NOTE(review): the sample output in this notebook shows "was" -> "wa",
    "as" -> "a" and "has" -> "ha". Presumably this is because the lemmatizer
    treats every token as a noun when no POS is given — confirm, and consider
    POS-aware lemmatization if those forms matter downstream.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in sentence.lower().split())

In [44]:
def validation_lemmatize(features):
    """Lemmatize every sentence of a flat collection with ``lemmatize_sentence``.

    Despite the name, this is not specific to the validation split: the notebook
    applies it to the ``sentence`` column of both splits.

    Returns a new list with one lemmatized string per input sentence.
    """
    return [lemmatize_sentence(sentence) for sentence in features]

def test_lemmatize(features):
    """Lemmatize every sentence of a nested collection with ``lemmatize_sentence``.

    Despite the name, this is not specific to the test split: the notebook applies
    it to the ``corrections`` column of both splits, where each item is itself a
    group of sentences.

    Returns a new list of lists with the same nesting as the input.
    """
    return [[lemmatize_sentence(sentence) for sentence in group] for group in features]

# Source sentences are flat lists, so they go through validation_lemmatize;
# corrections hold several sentences per item, so they go through test_lemmatize.
validation_sentence, test_sentence = (
    validation_lemmatize(validation_sentence),
    validation_lemmatize(test_sentence),
)
validation_corrections, test_corrections = (
    test_lemmatize(validation_corrections),
    test_lemmatize(test_corrections),
)

In [45]:
# Preview the first five lemmatized source sentences, numbered from 1.
print("Validation sentence (lemmatize) : ")
for index, val_sen in enumerate(validation_sentence[:5], start=1): 
    print(f"{index}, {val_sen}")
print("\n")
# Preview the lemmatized corrections of the first five items as "<item>. <correction>. <text>".
print("Validation corrections (lemmatize) : ")
for i, val_cor in enumerate(validation_corrections[:5], start=1): 
    for j, cor in enumerate(val_cor, start=1):
        print (f"{i}. {j}. {cor}")

Validation sentence (lemmatize) : 
1, so i think we can not live if old people could not find siences and tecnologies and they did not developped
2, for not use car
3, here wa no promise of morning except that we looked up through the tree we saw how low the forest had swung
4, thus even today sex is considered a the least important topic in many part of india
5, image you salf you are wark in factory just to do one thing like pot taire on car if they fire you you will destroy becouse u dont know more than pot taire in car


Validation corrections (lemmatize) : 
1. 1. so i think we would not be alive if our ancestor did not develop science and technology
1. 2. so i think we could not live if older people did not develop science and technology
1. 3. so i think we can not live if old people could not find science and technology and they did not develop
1. 4. so i think we can not live if old people can not find the science and technology that ha not been developed
2. 1. not for use with 

##### 1.3. Tokenisasi
Data ditokenisasi dengan memecah setiap kalimat menjadi kata-kata, lalu setiap kata unik diberi id khusus.

In [14]:
# Download NLTK's 'punkt' package.
# NOTE(review): the tokenisation cells visible below split sentences with
# str.split() and do not call a punkt-based tokenizer — confirm whether a later
# cell still needs this download.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aditt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [49]:
def indeks_word_generator(list_words):
    """Build a word -> integer id mapping, with ids starting at 1.

    Duplicates in ``list_words`` are dropped. The distinct words are sorted before
    ids are assigned: iterating a bare ``set`` of strings follows hash order, and
    string hashing is randomized per interpreter session, so without the sort the
    same vocabulary would receive different ids after every kernel restart.

    Args:
        list_words: iterable of word strings; may be empty or contain duplicates.

    Returns:
        dict mapping each distinct word to its 1-based position in sorted order.
    """
    return {kata: idx for idx, kata in enumerate(sorted(set(list_words)), start=1)}

In [50]:
def unique_word_generator(val_sentence, val_corrections, test_sentence, test_corrections):
    """Collect the distinct whitespace-separated words of both splits.

    Args:
        val_sentence: iterable of sentence strings.
        val_corrections: iterable whose items are iterables of sentence strings.
        test_sentence: iterable of sentence strings.
        test_corrections: iterable whose items are iterables of sentence strings.

    Returns:
        list of the distinct words, sorted so the result is identical on every
        run (the iteration order of a bare ``set`` of strings depends on
        per-session hash randomization).
    """
    vocabulary = set()

    # Walk each argument on its own instead of concatenating with `+`, so any
    # iterable (list, tuple, dataset column, ...) is accepted, not only lists.
    for sentences in (val_sentence, test_sentence):
        for sentence in sentences:
            vocabulary.update(sentence.split())

    for corrections_column in (val_corrections, test_corrections):
        for corrections in corrections_column:
            for sentence in corrections:
                vocabulary.update(sentence.split())

    return sorted(vocabulary)

In [51]:
# Vocabulary over both splits, then a word -> id lookup built from it.
unique_word = unique_word_generator(
    val_sentence=validation_sentence,
    val_corrections=validation_corrections,
    test_sentence=test_sentence,
    test_corrections=test_corrections,
)
dictionary = indeks_word_generator(list_words=unique_word)

In [52]:
# The mapping holds one entry per distinct word of both splits, so show its size
# and a short sample instead of printing every entry into the notebook output.
print(f"Vocabulary size: {len(dictionary)}")
print(dict(list(dictionary.items())[:10]))

