In [8]:
# Test Data Loading Integration
import sys
import os
sys.path.append('..')

from dataLoader import load_csv_data
from matching import matchIntent

print("=== TESTING INTEGRATED DATA LOADING ===")

# Load data
data = load_csv_data()
print(f"Successfully loaded {len(data)} entries from CSV files")

# Show data distribution by source
sources = {}
for entry in data:
    source = entry['source']
    sources[source] = sources.get(source, 0) + 1

print("\nData distribution by source:")
for source, count in sources.items():
    print(f"  {source}: {count} entries")

# Show sample entries
print("\nSample entries:")
for i, entry in enumerate(data[:5]):
    print(f"{i+1}. [{entry['source']}] {entry['content'][:80]}...")

print("\n=== TESTING MATCHING FUNCTION ===")

# Test different queries
test_queries = [
    "Apa itu ITB?",
    "Fakultas di ITB",
    "Mahasiswa ITB", 
    "Sejarah ITB",
    "Program studi"
]

for query in test_queries:
    print(f"\nQuery: '{query}'")
    result = matchIntent(query)
    print(f"Answer: {result[:100]}...")

=== TESTING INTEGRATED DATA LOADING ===
Error loading hasilseleksiITB.csv: No columns to parse from file
Loaded 1299 data entries from CSV files
Successfully loaded 1299 entries from CSV files

Data distribution by source:
  tentangITB: 173 entries
  wikipediaITB: 977 entries
  multikampusITB: 149 entries

Sample entries:
1. [tentangITB] Mahasiswa...
2. [tentangITB] Alumni...
3. [tentangITB] Mitra...
4. [tentangITB] Pengunjung...
5. [tentangITB] My ITB...

=== TESTING MATCHING FUNCTION ===

Query: 'Apa itu ITB?'
[MATCHING] matchIntent called with: 'Apa itu ITB?'
[MATCHING] Starting match for query: 'Apa itu ITB?'
[MATCHING] Processed query: 'apa itb'
[MATCHING] Found 28 candidates
[MATCHING] Best match: Tentang ITB... (score: 0.30, methods: ['jaccard(0.50)'])
[MATCHING] Found match: Tentang ITB....
Answer: Tentang ITB....

Query: 'Fakultas di ITB'
[MATCHING] matchIntent called with: 'Fakultas di ITB'
[MATCHING] Starting match for query: 'Fakultas di ITB'
[MATCHING] Processed query: 'fa

# Preprocessing Data CSV dari Scraping
Notebook ini akan memproses data mentah dari scraping menjadi format yang siap digunakan untuk matching intent

In [9]:
import sys
import os
sys.path.append(os.path.abspath('..'))

import pandas as pd
import preprocessing
import glob

# Load semua data CSV
csv_files = glob.glob('../database/data/*.csv')
print("Files found:", csv_files)

# Combine semua data dari file CSV
all_data = []
for file in csv_files:
    try:
        # Check if file has content
        if os.path.getsize(file) == 0:
            print(f"Skipping empty file: {file}")
            continue
            
        df = pd.read_csv(file)
        
        # Check if dataframe has any data
        if df.empty:
            print(f"Skipping file with no data: {file}")
            continue
            
        df['source'] = os.path.basename(file)
        all_data.append(df)
        print(f"Successfully loaded {len(df)} rows from {file}")
        
    except pd.errors.EmptyDataError:
        print(f"Skipping file with no columns to parse: {file}")
        continue
    except Exception as e:
        print(f"Error loading {file}: {str(e)}")
        continue

print(f"\nTotal datasets loaded: {len(all_data)}")
if all_data:
    total_rows = sum(len(df) for df in all_data)
    print(f"Total rows across all datasets: {total_rows}")
    
    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"Combined dataframe shape: {combined_df.shape}")
    print(f"Columns: {list(combined_df.columns)}")
else:
    print("No valid data found in CSV files")
    combined_df = pd.DataFrame()  # Create empty dataframe

Files found: ['../database/data\\eventITB.csv', '../database/data\\hasilseleksiITB.csv', '../database/data\\multikampusITB.csv', '../database/data\\tentangITB.csv', '../database/data\\wikipediaITB.csv']
Skipping empty file: ../database/data\eventITB.csv
Skipping empty file: ../database/data\hasilseleksiITB.csv
Successfully loaded 175 rows from ../database/data\multikampusITB.csv
Successfully loaded 188 rows from ../database/data\tentangITB.csv
Successfully loaded 1005 rows from ../database/data\wikipediaITB.csv

Total datasets loaded: 3
Total rows across all datasets: 1368
Combined dataframe shape: (1368, 4)
Columns: ['type', 'content', 'links', 'source']


In [13]:
# Filter data yang berguna untuk chatbot
# Ambil content yang tidak kosong dan bermakna
useful_data = combined_df[
    (combined_df['content'].notna()) & 
    (combined_df['content'].str.len() > 3) &
    (~combined_df['content'].str.contains(r'^(li|div|span)$', na=False))
].copy()

print(f"Useful records: {len(useful_data)}")

# Preview data yang berguna
useful_data[['content', 'source']].head(20)

Useful records: 1325


  (~combined_df['content'].str.contains(r'^(li|div|span)$', na=False))


Unnamed: 0,content,source
2,Staf,multikampusITB.csv
3,Mahasiswa,multikampusITB.csv
4,Alumni,multikampusITB.csv
5,Mitra,multikampusITB.csv
6,Pengunjung,multikampusITB.csv
7,Pers,multikampusITB.csv
8,My ITB,multikampusITB.csv
9,Admission,multikampusITB.csv
10,Kontak,multikampusITB.csv
11,Cari,multikampusITB.csv


In [14]:
import sys
import os
sys.path.append(os.path.abspath('../..')) 

In [21]:
import pandas as pd
from machinelearning import preprocessing
from matching import match_with_csv_data, matchIntent, jaccardSimilarity
from preprocessing import preprocess

# buat wikipedia
# Contoh: Baca data intent/FAQ
# Ganti path dan nama kolom sesuai data Anda
csv_path = '../database/data/wikipediaITB.csv'
df = pd.read_csv(csv_path)
df
# Preprocessing kolom pertanyaan

# Test algoritma dengan beberapa query
# Load data dari file yang tersedia
csv_path = '../database/data/wikipediaITB.csv'
df = pd.read_csv(csv_path)
print(f"Loaded {len(df)} rows from {csv_path}")
print(f"Columns: {list(df.columns)}")
print("\nSample data:")
print(df.head())

# Test preprocessing dan algoritma
test_queries = [
    "Apa itu ITB?",
    "Sejarah Institut Teknologi Bandung",
    "Fakultas di ITB",
    "Cara mendaftar ke ITB",
    "Lokasi kampus ITB"
]

print("\n=== Testing Algoritma Pencarian ===")
for query in test_queries:
    print(f"\nQuery: {query}")
    
    # Test preprocessing
    processed_query = preprocess(query)
    print(f"Processed: {processed_query}")
    
    # Test matching algorithm
    try:
        result = match_with_csv_data(query, threshold=0.3, top_k=2)
        if result:
            print(f"Match result: {result[:200]}...")  # Truncate long results
        else:
            print("No match found")
    except Exception as e:
        print(f"Matching error: {e}")
    
    # Test intent matching
    try:
        intent_result = matchIntent(query)
        if intent_result:
            print(f"Intent result: {intent_result[:200]}...")  # Truncate long results
        else:
            print("No intent match found")
    except Exception as e:
        print(f"Intent error: {e}")
        
print("\n=== Testing Jaccard Similarity ===")
test_text1 = "Institut Teknologi Bandung"
test_text2 = "ITB Bandung"
similarity = jaccardSimilarity(test_text1, test_text2)
print(f"Jaccard similarity between '{test_text1}' and '{test_text2}': {similarity}")


Loaded 1005 rows from ../database/data/wikipediaITB.csv
Columns: ['type', 'content', 'links']

Sample data:
  type            content                                  links
0   li      Halaman Utama                    /wiki/Halaman_Utama
1   li         Daftar isi                    /wiki/Wikipedia:Isi
2   li  Perubahan terbaru       /wiki/Istimewa:Perubahan_terbaru
3   li    Artikel pilihan  /wiki/Wikipedia:Artikel_pilihan/Topik
4   li  Peristiwa terkini         /wiki/Portal:Peristiwa_terkini

=== Testing Algoritma Pencarian ===

Query: Apa itu ITB?
Processed: apa itb
[MATCHING] Starting match for query: 'Apa itu ITB?'
[MATCHING] Processed query: 'apa itb'
[MATCHING] Found 28 candidates
[MATCHING] Best match: Tentang ITB... (score: 0.30, methods: ['jaccard(0.50)'])
Match result: Tentang ITB....
[MATCHING] matchIntent called with: 'Apa itu ITB?'
[MATCHING] Starting match for query: 'Apa itu ITB?'
[MATCHING] Processed query: 'apa itb'
[MATCHING] Found 28 candidates
[MATCHING] Best match: