In [1]:
# Test Data Loading Integration
import sys  # impor modul sistem
import os   # impor modul operating system
sys.path.append('..')  # tambah parent directory ke path

from dataLoader import loadCsvData  # impor fungsi load data CSV
from matching import matchIntent    # impor fungsi matching intent

# Smoke test for the integrated CSV loader: load everything, then summarise it.
print("Testing Integrated Data Loading")

csvData = loadCsvData()
print(f"Successfully loaded {len(csvData)} entries from CSV files")

# Tally entries per 'source' value. A plain dict keeps first-seen order,
# which is the order the report below prints them in.
dataSources = {}
for record in csvData:
    originName = record['source']
    if originName in dataSources:
        dataSources[originName] += 1
    else:
        dataSources[originName] = 1

print("\nData distribution by source:")
for originName, entryTotal in dataSources.items():
    print(f"  {originName}: {entryTotal} entries")

# Preview the first five entries; content is cut to 80 characters and the
# ellipsis is appended unconditionally.
print("\nSample entries:")
for position, record in enumerate(csvData[:5], start=1):
    print(f"{position}. [{record['source']}] {record['content'][:80]}...")

print("\nTesting Matching Function")

# Representative user questions to run through the intent matcher.
testQueries = [
    "Apa itu ITB?",           # general question about ITB
    "Fakultas di ITB",        # faculties
    "Mahasiswa ITB",          # students
    "Sejarah ITB",            # history
    "Program studi"           # study programmes
]

for query in testQueries:
    print(f"\nQuery: '{query}'")
    matchResult = matchIntent(query)
    # A falsy result (None, empty string, ...) is reported as "no answer".
    if not matchResult:
        print("No answer found")
        continue
    # Only the first 100 characters of the answer are shown.
    print(f"Answer: {matchResult[:100]}...")

Testing Integrated Data Loading
📂 Loading enhanced dataset: itb_chatbot_high_quality_20250621_190153.csv
✅ Loaded 382 high-quality entries
📊 Categories: 9
⭐ Avg quality: 74.3/100
Successfully loaded 382 entries from CSV files

Data distribution by source:
  multikampus: 18 entries
  tentang: 19 entries
  wikipedia: 345 entries

Sample entries:
1. [multikampus] PersonalStafMahasiswaMitraAlumniPengunjungPers...
2. [multikampus] Tentang ITBSejarahVisi dan MisiTugas dan FungsiPimpinanLandasan HukumStruktur Or...
3. [multikampus] PenerimaanSarjanaPascasarjanaProfesiPertukaran MahasiswaKelas Internasional...
4. [multikampus] PendidikanFakultas dan SekolahProgram StudiKelompok Keahlian/KeilmuanBeasiswaSta...
5. [multikampus] KampusAktivitas MahasiswaJelajah...

Testing Matching Function

Query: 'Apa itu ITB?'
[MATCHING] matchIntent called with: 'Apa itu ITB?'
[MATCHING] Starting match for query: 'Apa itu ITB?'
📂 Loading enhanced dataset: itb_chatbot_high_quality_20250621_190153.csv
✅ Loaded 3

# Preprocessing Data CSV dari Scraping
Notebook ini akan memproses data mentah dari scraping menjadi format yang siap digunakan untuk matching intent

In [None]:
import sys  # impor sistem
import os   # impor operating system
sys.path.append(os.path.abspath('..'))  # tambah parent directory ke path

import pandas as pd  # impor pandas untuk manipulasi data
import glob          # impor glob untuk file pattern matching

# The preprocessing module is optional here: record whether it could be
# imported instead of failing the whole cell.
try:
    from preprocessing import preprocess
    preprocessingAvailable = True
    print("Preprocessing module loaded successfully")
except ImportError:
    preprocessingAvailable = False
    print("Preprocessing module not available")

# Discover every CSV file in the database directory. glob returns matches in
# an unspecified order, so sort them: the load order (and therefore the row
# order of the combined frame) is then the same on every machine and run.
csvFileList = sorted(glob.glob('../database/data/*.csv'))
print(f"Files found: {csvFileList}")

# Load each file into its own DataFrame, skipping anything unusable.
allDataframes = []
for csvFile in csvFileList:
    try:
        # A zero-byte file cannot be parsed; skip it before calling read_csv.
        if os.path.getsize(csvFile) == 0:
            print(f"Skipping empty file: {csvFile}")
            continue

        dataFrame = pd.read_csv(csvFile)

        # A header-only file parses fine but contributes no rows.
        if dataFrame.empty:
            print(f"Skipping file with no data: {csvFile}")
            continue

        # Tag every row with the file it came from.
        dataFrame['source'] = os.path.basename(csvFile)
        allDataframes.append(dataFrame)
        print(f"Successfully loaded {len(dataFrame)} rows from {csvFile}")

    except pd.errors.EmptyDataError:
        # Raised by read_csv when the file has no columns to parse.
        print(f"Skipping file with no columns to parse: {csvFile}")
    except Exception as e:
        # Best effort: report the problem and carry on with the next file.
        print(f"Error loading {csvFile}: {e}")

print(f"\nTotal datasets loaded: {len(allDataframes)}")
if allDataframes:
    totalRowCount = sum(len(df) for df in allDataframes)
    print(f"Total rows across all datasets: {totalRowCount}")

    # Stack all frames into one with a fresh 0..n-1 index.
    combinedDataframe = pd.concat(allDataframes, ignore_index=True)
    print(f"Combined dataframe shape: {combinedDataframe.shape}")
    print(f"Columns: {list(combinedDataframe.columns)}")
else:
    print("No valid data found in CSV files")
    # Downstream cells can still test `.empty` on this placeholder.
    combinedDataframe = pd.DataFrame()

Files found: ['../database/data\\eventITB.csv', '../database/data\\hasilseleksiITB.csv', '../database/data\\multikampusITB.csv', '../database/data\\tentangITB.csv', '../database/data\\wikipediaITB.csv']
Skipping empty file: ../database/data\eventITB.csv
Skipping empty file: ../database/data\hasilseleksiITB.csv
Successfully loaded 175 rows from ../database/data\multikampusITB.csv
Successfully loaded 188 rows from ../database/data\tentangITB.csv
Successfully loaded 1005 rows from ../database/data\wikipediaITB.csv

Total datasets loaded: 3
Total rows across all datasets: 1368
Combined dataframe shape: (1368, 4)
Columns: ['type', 'content', 'links', 'source']


In [None]:
def filterUsefulContent(frame):
    """Return a copy of the rows of `frame` whose 'content' is usable text.

    A row is kept when its 'content' value is not null, is longer than three
    characters, and is not just a bare tag name ('li', 'div' or 'span').

    Args:
        frame: DataFrame with a 'content' column of strings (nulls allowed).

    Returns:
        A new DataFrame holding the surviving rows; the original index labels
        are preserved.
    """
    content = frame['content']
    # Non-capturing group on purpose: with a capturing group pandas emits a
    # "has match groups" UserWarning on every call to str.contains.
    isBareTag = content.str.contains(r'^(?:li|div|span)$', na=False)
    keepMask = content.notna() & (content.str.len() > 3) & ~isBareTag
    return frame[keepMask].copy()


# Keep only the records that are useful for the chatbot.
print("Filtering useful data for chatbot...")

# Guard against running this cell before the loading cell (name missing) or
# after a load that produced nothing (empty frame).
if 'combinedDataframe' in locals() and not combinedDataframe.empty:
    filteredUsefulData = filterUsefulContent(combinedDataframe)

    print(f"Useful records: {len(filteredUsefulData)}")

    if len(filteredUsefulData) > 0:
        print("\nPreview of useful data:")
        previewData = filteredUsefulData[['content', 'source']].head(10)

        # Number the preview by position. The index labels survive filtering
        # with gaps, so label + 1 would start at an arbitrary number.
        for position, (_, row) in enumerate(previewData.iterrows(), start=1):
            text = row['content']
            # Truncate long content to 100 characters for display.
            contentPreview = text[:100] + '...' if len(text) > 100 else text
            print(f"  {position}. [{row['source']}] {contentPreview}")

        print("\nData distribution by source:")
        sourceDistribution = filteredUsefulData['source'].value_counts()
        for sourceFile, recordCount in sourceDistribution.items():
            print(f"  {sourceFile}: {recordCount} records")
    else:
        print("No useful data found after filtering")

else:
    print("No combined data available for filtering")
    # Placeholder so later cells can still test `.empty`.
    filteredUsefulData = pd.DataFrame()

Useful records: 1325


  (~combined_df['content'].str.contains(r'^(li|div|span)$', na=False))


Unnamed: 0,content,source
2,Staf,multikampusITB.csv
3,Mahasiswa,multikampusITB.csv
4,Alumni,multikampusITB.csv
5,Mitra,multikampusITB.csv
6,Pengunjung,multikampusITB.csv
7,Pers,multikampusITB.csv
8,My ITB,multikampusITB.csv
9,Admission,multikampusITB.csv
10,Kontak,multikampusITB.csv
11,Cari,multikampusITB.csv


In [None]:
import sys
import os
# Append the directory two levels above the current working directory to the
# module search path; abspath resolves it relative to the working directory at
# the time this cell runs.
# NOTE(review): presumably this is what lets the next cell import the project
# modules (machinelearning, matching, preprocessing) — confirm the layout.
sys.path.append(os.path.abspath('../..'))

In [None]:
import pandas as pd
from machinelearning import preprocessing
from matching import matchWithCsvData, matchIntent, jaccardSimilarity
from preprocessing import preprocess
import os  # pastikan os diimport

# Load data from the first candidate file that is actually usable.
csvFilePath = '../database/data/wikipedia_itb_full.csv'  # preferred input
# Fallbacks, tried in order when the preferred file cannot be used.
alternativeFiles = [
    '../database/data/tentang_itb_full.csv',
    '../database/data/multikampus_full.csv',
    '../database/data/event_itb.csv'
]


def loadFirstReadableCsv(candidatePaths):
    """Load the first path in `candidatePaths` that exists and parses as CSV.

    Paths that do not exist are skipped silently. A file that exists but has
    no columns to parse (for example a zero-byte file) is reported and
    skipped instead of aborting the cell with EmptyDataError.

    Args:
        candidatePaths: iterable of file paths, in order of preference.

    Returns:
        A `(path, frame)` tuple for the first usable file, or
        `(None, empty DataFrame)` when none of the candidates could be loaded.
    """
    for candidatePath in candidatePaths:
        if not os.path.exists(candidatePath):
            continue
        try:
            return candidatePath, pd.read_csv(candidatePath)
        except pd.errors.EmptyDataError:
            print(f"Skipping file with no columns to parse: {candidatePath}")
    return None, pd.DataFrame()


loadedPath, dataFrame = loadFirstReadableCsv([csvFilePath])
if loadedPath is not None:
    print(f"Loaded {len(dataFrame)} rows from {csvFilePath}")
    print(f"Columns: {list(dataFrame.columns)}")
    print("\nSample data:")
    print(dataFrame.head())
else:
    if not os.path.exists(csvFilePath):
        print(f"File not found: {csvFilePath}")

    # Fall back to the alternative files.
    loadedPath, dataFrame = loadFirstReadableCsv(alternativeFiles)
    if loadedPath is not None:
        print(f"Using alternative file: {loadedPath}")
        print(f"Loaded {len(dataFrame)} rows")
    else:
        # dataFrame is already the empty placeholder returned by the helper.
        print("No CSV files found")

# Exercise preprocessing and both matching entry points on sample queries.
testQueryList = [
    "Apa itu ITB?",
    "Sejarah Institut Teknologi Bandung",
    "Fakultas di ITB",
    "Cara mendaftar ke ITB",
    "Lokasi kampus ITB"
]

print("\nTesting Algoritma Pencarian")
for queryText in testQueryList:
    print(f"\nQuery: {queryText}")

    # Show what the query looks like after preprocessing.
    processedQueryText = preprocess(queryText)
    print(f"Processed: {processedQueryText}")

    # CSV-backed matcher. Any exception (from the call or from slicing the
    # result) is reported rather than stopping the loop.
    try:
        matchingResult = matchWithCsvData(queryText, threshold=0.3, topK=2)
        print(f"Match result: {matchingResult[:200]}..." if matchingResult else "No match found")
    except Exception as matchError:
        print(f"Matching error: {matchError}")

    # Intent matcher, reported the same way.
    try:
        intentMatchResult = matchIntent(queryText)
        print(f"Intent result: {intentMatchResult[:200]}..." if intentMatchResult else "No intent match found")
    except Exception as intentError:
        print(f"Intent error: {intentError}")

# One-off check of the Jaccard similarity helper on two short strings.
print("\nTesting Jaccard Similarity")
testTextOne = "Institut Teknologi Bandung"
testTextTwo = "ITB Bandung"
similarityScore = jaccardSimilarity(testTextOne, testTextTwo)
print(f"Jaccard similarity between '{testTextOne}' and '{testTextTwo}': {similarityScore}")


Loaded 1005 rows from ../database/data/wikipediaITB.csv
Columns: ['type', 'content', 'links']

Sample data:
  type            content                                  links
0   li      Halaman Utama                    /wiki/Halaman_Utama
1   li         Daftar isi                    /wiki/Wikipedia:Isi
2   li  Perubahan terbaru       /wiki/Istimewa:Perubahan_terbaru
3   li    Artikel pilihan  /wiki/Wikipedia:Artikel_pilihan/Topik
4   li  Peristiwa terkini         /wiki/Portal:Peristiwa_terkini

=== Testing Algoritma Pencarian ===

Query: Apa itu ITB?
Processed: apa itb
[MATCHING] Starting match for query: 'Apa itu ITB?'
[MATCHING] Processed query: 'apa itb'
[MATCHING] Found 28 candidates
[MATCHING] Best match: Tentang ITB... (score: 0.30, methods: ['jaccard(0.50)'])
Match result: Tentang ITB....
[MATCHING] matchIntent called with: 'Apa itu ITB?'
[MATCHING] Starting match for query: 'Apa itu ITB?'
[MATCHING] Processed query: 'apa itb'
[MATCHING] Found 28 candidates
[MATCHING] Best match: