In [1]:
import pandas as pd
from transformers import DistilBertTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Definisikan path file
file_path = "/content/drive/MyDrive/Google Colab TA/Dataset/stemmed.txt"

try:
    # Metode 1: Membaca file menggunakan pandas secara langsung
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8', on_bad_lines='skip')  # Coba separator lain jika perlu

except pd.errors.ParserError:
    try:
        # Metode 2: Membaca baris demi baris untuk menangani baris yang salah format
        data = []
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            for line in file:
                try:
                    # Misalnya, asumsi data dipisahkan dengan tab
                    parts = line.strip().split('\t')  # Sesuaikan delimiter jika perlu
                    if len(parts) == 2:  # Memastikan jumlah kolom sesuai
                        data.append(parts)
                except Exception as e:
                    print(f"Skipping malformed line: {line.strip()}, Error: {e}")
        df = pd.DataFrame(data, columns=['col1', 'col2'])  # Ganti col1, col2 dengan nama kolom Anda

    except Exception as e:
        print(f"Error reading file: {e}")
        # Buat DataFrame kosong jika gagal
        df = pd.DataFrame()

In [5]:
# Membaca dataset dengan delimiter yang berbeda (contoh: ' ||| ')
data = []
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Memisahkan data berdasarkan delimiter
            parts = line.strip().split(' ||| ')

            # Memastikan hanya baris dengan dua kolom yang diproses
            if len(parts) == 2:
                data.append(parts)
            elif len(parts) > 2:
                # Jika lebih dari dua kolom, gabungkan sisanya ke kolom 'text'
                data.append([parts[0], ' ||| '.join(parts[1:])])

    # Konversi ke DataFrame
    df = pd.DataFrame(data, columns=['title', 'text'])

    # Menampilkan 5 data pertama untuk validasi
    print(df.head())

except Exception as e:
    print(f"Error processing file: {e}")

     title                                               text
0   anarch  anarch polit philosophi advoc self-govern soci...
1   autism  autism neurodevelopment disord character impai...
2   albedo  albedo measur reflect optic bright latin albed...
3  alabama  alabama state southeastern region unit state b...
4    achil  greek mytholog achil uh-kill-eez greek ἀχιλλεύ...


In [6]:
# 2. Membersihkan Data
# Menghapus baris duplikat
df.drop_duplicates(inplace=True)

# Menghapus baris dengan nilai kosong
df.dropna(subset=['text'], inplace=True)e

In [7]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
# 3. Lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

In [9]:
# Menambahkan kolom untuk kategori elektronik
keywords = [
    "android", "automation", "ai speaker", "alexa", "assistant",
    "base station", "buzzer", "cartridge", "charging cable",
    "controller board", "docking station", "dsp", "dvd player",
    "emulator", "energy saver", "fiber optic", "game console",
    "gps", "graphics processor", "hdmi", "hd", "high fidelity",
    "iot", "iot device", "kinect", "lcd", "led tv", "li-polymer",
    "machine learning", "media player", "microprocessor",
    "nano board", "oscillator", "optical drive", "piezo",
    "power cord", "power supply", "projector screen",
    "radio", "receiver", "remote control", "rgb",
    "robotics", "satellite", "set-top box", "signal processor",
    "smart home", "smart device", "solar panel", "solid state",
    "sound bar", "surveillance", "switch", "synthesizer",
    "thermometer", "thunderbolt", "trackpad", "ups", "usb-c",
    "vga", "video card", "video processor", "voice assistant",
    "voltage regulator", "wearable device", "wifi extender",
    "wire", "wireless router", "workstation computer",
    "z-wave", "zigbee", "ac adapter", "amplifier",
    "bluetooth speaker", "cable adapter", "circuit board",
    "connector", "data storage", "digital clock",
    "electronic component", "ethernet cable", "gaming headset",
    "hard drive", "headphones", "infrared sensor", "input device",
    "lithium battery", "microcontroller", "modem", "monitor",
    "motion sensor", "network card", "network switch", "pcb board",
    "pc gaming", "power adapter", "ram", "sd card",
    "security camera", "server", "smart plug", "smart thermostat",
    "speaker system", "ssd", "tablet", "touch screen",
    "tv tuner", "usb adapter", "vr headset",
    "wireless keyboard", "wireless mouse", "workstation", "zip drive"
]

# Fungsi untuk memeriksa apakah teks mengandung kata kunci
def contains_keywords(text, keywords):
    return any(keyword in text.lower() for keyword in keywords)

# Menambahkan kolom 'is_electronics' untuk menandai entri yang relevan
df['is_electronics'] = df['text'].apply(lambda x: contains_keywords(x, keywords))

# Menghitung jumlah data yang relevan dengan elektronik
count_electronics = df['is_electronics'].sum()

# Menampilkan jumlah entri yang relevan
print(f"Jumlah entri yang terkait dengan elektronik atau sinonimnya: {count_electronics}")

# Opsional: Tampilkan entri yang relevan
print(df[df['is_electronics']].head())

Jumlah entri yang terkait dengan elektronik atau sinonimnya: 736350
           title                                               text  \
1         autism  autism neurodevelopment disord character impai...   
2         albedo  albedo measur reflect optic bright latin albed...   
6       aristotl  aristotl greek ἀριστοτέλης pronounc aristotélɛ...   
7  american pari  american pari jazz-influenc orchestr piec amer...   
9  academi award  academi award known offici oscar set twenty-fo...   

   is_electronics  
1            True  
2            True  
6            True  
7            True  
9            True  


In [10]:
# Menyeimbangkan Data (Opsional jika ketidakseimbangan terjadi)
data_electronics = df[df['is_electronics']]
data_non_electronics = df[~df['is_electronics']]

# Duplicate data elektronik hingga jumlahnya setara dengan data lainnya
balanced_data = pd.concat([data_electronics, data_non_electronics.sample(len(data_electronics), replace=True)])

In [11]:
# Membagi dataset untuk training dan testing
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)

In [13]:
# Menambahkan kolom 'fasttext_label' ke kedua dataset: train_data dan test_data
train_data['fasttext_label'] = train_data['is_electronics'].apply(lambda x: '__label__electronics' if x else '__label__none')
test_data['fasttext_label'] = test_data['is_electronics'].apply(lambda x: '__label__electronics' if x else '__label__none')

# Menyimpan data ke file teks untuk pelatihan FastText
train_data[['fasttext_label', 'text']].to_csv('/content/drive/MyDrive/Google Colab TA/fasttext_train.txt', index=False, sep=' ', header=False)
test_data[['fasttext_label', 'text']].to_csv('/content/drive/MyDrive/Google Colab TA/fasttext_test.txt', index=False, sep=' ', header=False)

print("Data untuk FastText telah berhasil disimpan.")

Data untuk FastText telah berhasil disimpan.


In [15]:
# Tokenisasi Teks dengan Tokenizer BERT
# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Fungsi tokenisasi
def tokenize_text(text):
    return tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors='pt')

# Tokenisasi data training dan testing
def preprocess_bert_data(df):
    df['tokenized'] = df['text'].apply(tokenize_text)
    return df

train_data = preprocess_bert_data(train_data)
test_data = preprocess_bert_data(test_data)

train_data.to_pickle('/content/drive/MyDrive/Google Colab TA/distilbert_train_compressed.pkl', compression='gzip')
test_data.to_pickle('/content/drive/MyDrive/Google Colab TA/distilbert_test_compressed.pkl', compression='gzip')

print("Proses data preparation selesai dengan kompresi.")

Proses data preparation selesai dengan kompresi.
