In [1]:
import json
import os
import pandas as pd
import seaborn as sns
import numpy as np

In [23]:
path_to_json = '/Users/ploy/Programming/thesis/data/json-scrape/resturant'
json_files = [f for f in os.listdir(path_to_json) if f.endswith('.json')]
print(f'Found {len(json_files)} JSON files: {json_files}')

Found 20 JSON files: ['ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json', "Lucky's_-_Vietnamese_Restaurant_Chiang_Mai.json", 'ร้าน_ไข่กูส์_ข้าวไข่เจียวบุฟเฟ่ต์_(Eggs_Goos_Omelette_Rice_Buffet_Maejho).json', 'Heaven_Whip.json', 'ร้าน_สหายหมื่นจอก_(Ten_Thousand_Cups_Friends).json', 'Little_Red_Oven_Hangdong_Samoeng_หนองควาย.json', 'TASTE_ATELIERWeave_Artisan_Society.json', "Imagine's_House_:_บ้านสวน.json", 'Haikin_Ryokan_Japanese_Restaurant_(ฮายคิน_เรียวกัง).json', 'ผัดไททักษิณ_(สีวลี)_(Pad_Thai_Thaksin_(Siwali)).json', 'โจ๊กหมู_ข้าวต้มปลานายเจียง_(Pork_Jok_Fish_Porridge_Nai_Jiang).json', 'ร้าน_เรือนรักษ์ไทย2.json', 'ร้าน_เฮียพจน์_ข้าวมันไก่ตอน.json', 'Cuisine_de_Garden_Chiangmai_(คูซีน_เดอ_การ์เดน)Chiangmai.json', 'Yellow_plate_(เยลโล่_เพลท).json', 'Pool_House_(พูลเฮาส์)โรงแรมเชียงใหม่_แมริออท_โฮเทล.json', 'Grow_Cafeทรายมูล.json', 'ครัวสะบายดี.json', 'ร้าน_นาซิ_จำปู๋_(Nasi_Jumpru).json', 'ขนมหวาย_(khanomwaii)1.json']


In [24]:
# Load and examine the structure of JSON files
sample_files = json_files[:3]  # Get first 3 files as samples
all_data = []

for file in sample_files:
    file_path = os.path.join(path_to_json, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        all_data.append({
            'filename': file,
            'data': data
        })
        print(f"\n--- {file} ---")
        print(f"Keys: {list(data.keys())}")
        # Show sample data for each key
        for key, value in data.items():
            if isinstance(value, str):
                print(f"{key}: {value[:100]}..." if len(value) > 100 else f"{key}: {value}")
            else:
                print(f"{key}: {type(value)} - {value}")
        print("-" * 50)


--- ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json ---
Keys: ['name', 'address', 'Review_rating', 'description_place', 'time_to_open_in_right_block', 'UserComment_review_not_emoji']
name: ร้าน ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง (Rungroj Rural Noodle - Boonsri Lampang) สาขาเชียงใหม่
address: 111/3, หนองควาย หางดง เชียงใหม่ เชียงใหม่ (เลยบิ้กซีมินิมานิดหน่อย ร้านจะอยู่ซ้ายมือ)
Review_rating: 3.7
description_place: ก๋วยเตี๋ยว
time_to_open_in_right_block: 09:00 - 20:00
UserComment_review_not_emoji: ร้านก๋วยเตี๋ยวเจ้าเด็ดจากลำปาง เสิร์ฟตรงสู่เชียงใหม่ รสชาติอร่อย ราคาไม่แพง เหมาะสำหรับเพื่อนฝูงและท...
--------------------------------------------------

--- Lucky's_-_Vietnamese_Restaurant_Chiang_Mai.json ---
Keys: ['name', 'address', 'Review_rating', 'description_place', 'time_to_open_in_right_block', 'UserComment_review_not_emoji']
name: Lucky's - Vietnamese Restaurant Chiang Mai
address: ถนนกระดังงา เชียงใหม่
Review_rating: 4.0


In [25]:
import re
import emoji

# Thai stop words list (common words that don't add much meaning)
thai_stopwords = [
    'ใน', 'ของ', 'และ', 'ที่', 'จะ', 'ได้', 'มี', 'แล้ว', 'ไป', 'มา', 'เป็น', 'ให้', 'ก็', 'แต่', 'ถ้า',
    'ถึง', 'กับ', 'จาก', 'โดย', 'ตาม', 'ซึ่ง', 'หรือ', 'ทำ', 'ใช้', 'ขณะ', 'คือ', 'ผล', 'ดัง', 'นั้น',
    'นี้', 'นั่น', 'อัน', 'เมื่อ', 'ถือ', 'ช่วง', 'ระหว่าง', 'ผ่าน', 'ต่อ', 'ตั้งแต่', 'ถึงแม้','...',
    'เพื่อ', 'เนื่องจาก', 'เพราะ', 'อย่าง', 'ลักษณะ', 'รูปแบบ', 'เช่น', 'เป็นต้น', 'ดังนั้น','ค่ะ','ครับ','นะ','จ้า','จ้ะ','จ๊ะ','อ่ะ','อะ','อ่ะนะ','อ่ะค่ะ','อ่ะครับ','อ่ะจ้า','อ่ะจ้ะ','อ่ะจ๊ะ',
]

def clean_text(text):
    """
    Clean text by removing emojis, special characters, stop words, and normalizing whitespace
    """
    if not isinstance(text, str) or not text.strip():
        return text

    original_text = text

    # Remove emojis
    text = emoji.demojize(text, delimiters=("", ""))
    text = re.sub(r':[a-zA-Z_]+:', '', text)  # Remove emoji codes like :smile:

    # Remove newlines and replace with spaces
    text = text.replace('\n', ' ').replace('\r', ' ')

    # Remove special characters but keep Thai characters and basic punctuation
    text = re.sub(r'[^\u0E00-\u0E7Fa-zA-Z0-9\s\.\,\!\?\:\;\-\(\)\[\]\{\}]', '', text)

    # Clean up multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove Thai stop words
    words = text.split()
    filtered_words = [word for word in words if word.strip() and word not in thai_stopwords]
    text = ' '.join(filtered_words)

    # Strip leading/trailing whitespace
    text = text.strip()

    return text

def show_before_after(original, cleaned, title="Text Cleaning"):
    """Display before and after comparison"""
    print(f"\n=== {title} ===")
    print(f"Before: {original[:200]}..." if len(original) > 200 else f"Before: {original}")
    print(f"After:  {cleaned[:200]}..." if len(cleaned) > 200 else f"After:  {cleaned}")
    print(f"Length: {len(original)} → {len(cleaned)}")
    print("-" * 80)

In [26]:
# Process all JSON files and clean the text data
cleaned_data_list = []
cleaning_stats = {
    'total_files': 0,
    'total_fields_cleaned': 0,
    'characters_removed': 0,
    'emojis_found': 0
}

# Text fields that need cleaning
# text_fields = ['name', 'address', 'description_place', 'UserComment_review_not_emoji','Main_facilities_description',]
text_fields = ['name', 'address', 'description_place', 'UserComment_review_not_emoji']

print("=== CLEANING ALL JSON FILES ===")
print(f"Total files to process: {len(json_files)}")
print()

for i, file in enumerate(json_files):
    file_path = os.path.join(path_to_json, file)

    # Show progress
    if (i + 1) % 10 == 0 or i == 0:
        print(f"Processing file {i + 1}/{len(json_files)}: {file}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        cleaned_data = data.copy()
        original_lengths = {}
        cleaned_lengths = {}

        # Clean text fields
        for field in text_fields:
            if field in data and isinstance(data[field], str):
                original_text = data[field]
                cleaned_text = clean_text(original_text)

                original_lengths[field] = len(original_text)
                cleaned_lengths[field] = len(cleaned_text)

                # Update cleaned data
                cleaned_data[field] = cleaned_text

                # Update stats
                cleaning_stats['total_fields_cleaned'] += 1
                cleaning_stats['characters_removed'] += len(original_text) - len(cleaned_text)

                # Count emojis (simple detection)
                emoji_pattern = r'[\U0001F300-\U0001F9FF]|[\u2600-\u27BF]|[\U0001F1E0-\U0001F1FF]'
                emoji_count = len(re.findall(emoji_pattern, original_text))
                cleaning_stats['emojis_found'] += emoji_count

                # Show examples for first few files
                if i < 3:
                    show_before_after(original_text, cleaned_text, f"{file} - {field}")

        # Handle time_to_open_in_right_block (list field)
        if 'time_to_open_in_right_block' in data and isinstance(data['time_to_open_in_right_block'], list):
            cleaned_times = []
            for time_text in data['time_to_open_in_right_block']:
                if isinstance(time_text, str):
                    cleaned_time = clean_text(time_text)
                    cleaned_times.append(cleaned_time)
                    cleaning_stats['total_fields_cleaned'] += 1
                    cleaning_stats['characters_removed'] += len(time_text) - len(cleaned_time)
                else:
                    cleaned_times.append(time_text)
            cleaned_data['time_to_open_in_right_block'] = cleaned_times

        cleaned_data_list.append({
            'filename': file,
            'original_data': data,
            'cleaned_data': cleaned_data,
            'original_lengths': original_lengths,
            'cleaned_lengths': cleaned_lengths
        })

        cleaning_stats['total_files'] += 1

    except Exception as e:
        print(f"Error processing {file}: {e}")

print(f"\n=== CLEANING COMPLETED ===")
print(f"Files processed: {cleaning_stats['total_files']}")
print(f"Text fields cleaned: {cleaning_stats['total_fields_cleaned']}")
print(f"Characters removed: {cleaning_stats['characters_removed']:,}")
print(f"Emojis found and removed: {cleaning_stats['emojis_found']}")

=== CLEANING ALL JSON FILES ===
Total files to process: 20

Processing file 1/20: ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json

=== ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json - name ===
Before: ร้าน ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง (Rungroj Rural Noodle - Boonsri Lampang) สาขาเชียงใหม่
After:  ร้าน ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง (Rungroj Rural Noodle - Boonsri Lampang) สาขาเชียงใหม่
Length: 98 → 98
--------------------------------------------------------------------------------

=== ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json - address ===
Before: 111/3, หนองควาย หางดง เชียงใหม่ เชียงใหม่ (เลยบิ้กซีมินิมานิดหน่อย ร้านจะอยู่ซ้ายมือ)
After:  1113, หนองควาย หางดง เชียงใหม่ เชียงใหม่ (เลยบิ้กซีมินิมานิดหน่อย ร้านจะอยู่ซ้ายมือ)
Length: 85 → 84
---------------------------------------------------------

In [27]:
# Show some detailed examples of cleaning results
print("=== DETAILED CLEANING EXAMPLES ===")

# Show examples from a few files
for i in range(min(5, len(cleaned_data_list))):
    file_info = cleaned_data_list[i]
    print(f"\n--- FILE: {file_info['filename']} ---")

    for field in text_fields:
        if field in file_info['original_data'] and isinstance(file_info['original_data'][field], str):
            original = file_info['original_data'][field]
            cleaned = file_info['cleaned_data'][field]

            print(f"\n{field.upper()}:")
            print(f"Before ({len(original)} chars): {original[:150]}...")
            print(f"After ({len(cleaned)} chars):  {cleaned[:150]}...")
            print(f"Reduction: {len(original) - len(cleaned)} characters")
            print("-" * 60)

print("\n=== OVERALL STATISTICS ===")
print(f"Total files processed: {cleaning_stats['total_files']}")
print(f"Total text fields cleaned: {cleaning_stats['total_fields_cleaned']}")
print(f"Total characters removed: {cleaning_stats['characters_removed']:,}")
print(f"Average reduction per field: {cleaning_stats['characters_removed'] / cleaning_stats['total_fields_cleaned']:.1f} chars")
print(f"Emojis found and processed: {cleaning_stats['emojis_found']}")

# Character reduction by field type
field_stats = {}
for file_info in cleaned_data_list:
    for field in text_fields:
        if field in file_info['original_lengths']:
            if field not in field_stats:
                field_stats[field] = {'original_total': 0, 'cleaned_total': 0, 'count': 0}
            field_stats[field]['original_total'] += file_info['original_lengths'][field]
            field_stats[field]['cleaned_total'] += file_info['cleaned_lengths'][field]
            field_stats[field]['count'] += 1

print(f"\n=== REDUCTION BY FIELD TYPE ===")
for field, stats in field_stats.items():
    reduction = stats['original_total'] - stats['cleaned_total']
    reduction_percent = (reduction / stats['original_total'] * 100) if stats['original_total'] > 0 else 0
    avg_original = stats['original_total'] / stats['count']
    avg_cleaned = stats['cleaned_total'] / stats['count']

    print(f"{field}:")
    print(f"  Fields processed: {stats['count']}")
    print(f"  Average length: {avg_original:.1f} → {avg_cleaned:.1f} chars")
    print(f"  Total reduction: {reduction:,} chars ({reduction_percent:.1f}%)")
    print()

=== DETAILED CLEANING EXAMPLES ===

--- FILE: ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json ---

NAME:
Before (98 chars): ร้าน ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง (Rungroj Rural Noodle - Boonsri Lampang) สาขาเชียงใหม่...
After (98 chars):  ร้าน ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง (Rungroj Rural Noodle - Boonsri Lampang) สาขาเชียงใหม่...
Reduction: 0 characters
------------------------------------------------------------

ADDRESS:
Before (85 chars): 111/3, หนองควาย หางดง เชียงใหม่ เชียงใหม่ (เลยบิ้กซีมินิมานิดหน่อย ร้านจะอยู่ซ้ายมือ)...
After (84 chars):  1113, หนองควาย หางดง เชียงใหม่ เชียงใหม่ (เลยบิ้กซีมินิมานิดหน่อย ร้านจะอยู่ซ้ายมือ)...
Reduction: 1 characters
------------------------------------------------------------

DESCRIPTION_PLACE:
Before (10 chars): ก๋วยเตี๋ยว...
After (10 chars):  ก๋วยเตี๋ยว...
Reduction: 0 characters
------------------------------------------------------------

USERCOMMENT_REVIEW_NOT_EMOJI:


In [28]:
# Save cleaned data to new directory
output_dir = '/Users/ploy/Programming/thesis/data/clean-data/cleaned-resturant'
os.makedirs(output_dir, exist_ok=True)

print(f"=== SAVING CLEANED DATA TO: {output_dir} ===")

saved_files = 0
for file_info in cleaned_data_list:
    try:
        output_file = os.path.join(output_dir, file_info['filename'])

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(file_info['cleaned_data'], f, ensure_ascii=False, indent=2)

        saved_files += 1

        if saved_files <= 5:  # Show progress for first few files
            print(f"✓ Saved: {file_info['filename']}")

    except Exception as e:
        print(f"✗ Error saving {file_info['filename']}: {e}")

print(f"\n=== SAVE COMPLETE ===")
print(f"Files saved: {saved_files}/{len(cleaned_data_list)}")
print(f"Output directory: {output_dir}")

# Create a summary report
summary_report = {
    'cleaning_timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'source_directory': path_to_json,
    'output_directory': output_dir,
    'statistics': cleaning_stats,
    'field_statistics': field_stats,
    'files_processed': [info['filename'] for info in cleaned_data_list]
}

# Save summary report
summary_file = os.path.join(output_dir, 'cleaning_summary_report.json')
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, ensure_ascii=False, indent=2, default=str)

print(f"\n✓ Summary report saved to: cleaning_summary_report.json")
print(f"\n=== TEXT CLEANING COMPLETED SUCCESSFULLY ===")
print(f"📁 Original files: {path_to_json}")
print(f"📁 Cleaned files: {output_dir}")
print(f"📊 Total reduction: {cleaning_stats['characters_removed']:,} characters")
print(f"🧹 Fields cleaned: {cleaning_stats['total_fields_cleaned']}")
print(f"😊 Emojis processed: {cleaning_stats['emojis_found']}")

=== SAVING CLEANED DATA TO: /Users/ploy/Programming/thesis/data/clean-data/cleaned-resturant ===
✓ Saved: ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json
✓ Saved: Lucky's_-_Vietnamese_Restaurant_Chiang_Mai.json
✓ Saved: ร้าน_ไข่กูส์_ข้าวไข่เจียวบุฟเฟ่ต์_(Eggs_Goos_Omelette_Rice_Buffet_Maejho).json
✓ Saved: Heaven_Whip.json
✓ Saved: ร้าน_สหายหมื่นจอก_(Ten_Thousand_Cups_Friends).json

=== SAVE COMPLETE ===
Files saved: 20/20
Output directory: /Users/ploy/Programming/thesis/data/clean-data/cleaned-resturant

✓ Summary report saved to: cleaning_summary_report.json

=== TEXT CLEANING COMPLETED SUCCESSFULLY ===
📁 Original files: /Users/ploy/Programming/thesis/data/json-scrape/resturant
📁 Cleaned files: /Users/ploy/Programming/thesis/data/clean-data/cleaned-resturant
📊 Total reduction: 36 characters
🧹 Fields cleaned: 80
😊 Emojis processed: 0


In [29]:
# Final comparison: Show before and after examples from saved files
print("=== FINAL BEFORE/AFTER COMPARISON ===")
print("Comparing original vs cleaned files...\n")

# Pick a sample file to demonstrate
sample_file = cleaned_data_list[0]
sample_filename = sample_file['filename']

print(f"📄 SAMPLE FILE: {sample_filename}")
print("=" * 80)

# Load original file
original_path = os.path.join(path_to_json, sample_filename)
with open(original_path, 'r', encoding='utf-8') as f:
    original_data = json.load(f)

# Load cleaned file
cleaned_path = os.path.join(output_dir, sample_filename)
with open(cleaned_path, 'r', encoding='utf-8') as f:
    cleaned_data = json.load(f)

# Compare each text field
for field in text_fields:
    if field in original_data and isinstance(original_data[field], str):
        print(f"\n🔍 FIELD: {field}")
        print("-" * 60)

        original_text = original_data[field]
        cleaned_text = cleaned_data[field]

        print(f"📊 BEFORE ({len(original_text)} characters):")
        print(f"   {original_text[:200]}..." if len(original_text) > 200 else f"   {original_text}")

        print(f"\n📊 AFTER ({len(cleaned_text)} characters):")
        print(f"   {cleaned_text[:200]}..." if len(cleaned_text) > 200 else f"   {cleaned_text}")

        reduction = len(original_text) - len(cleaned_text)
        print(f"\n📉 REDUCTION: {reduction} characters ({reduction/len(original_text)*100:.1f}%)")

print(f"\n" + "=" * 80)
print("🎉 DATA CLEANING PROCESS COMPLETED SUCCESSFULLY!")
print(f"📁 All {len(cleaned_data_list)} files have been cleaned and saved to:")
print(f"   {output_dir}")
print(f"📊 Total improvements:")
print(f"   • {cleaning_stats['characters_removed']:,} characters removed")
print(f"   • {cleaning_stats['emojis_found']} emojis processed")
print(f"   • {cleaning_stats['total_fields_cleaned']} text fields cleaned")
print(f"📋 Summary report available at: cleaning_summary_report.json")

=== FINAL BEFORE/AFTER COMPARISON ===
Comparing original vs cleaned files...

📄 SAMPLE FILE: ร้าน_ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง_(Rungroj_Rural_Noodle_-_Boonsri_Lampang)_สาขาเชียงใหม่.json

🔍 FIELD: name
------------------------------------------------------------
📊 BEFORE (98 characters):
   ร้าน ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง (Rungroj Rural Noodle - Boonsri Lampang) สาขาเชียงใหม่

📊 AFTER (98 characters):
   ร้าน ก๋วยเตี๋ยวลูกทุ่งรุ่งโรจน์-บุญศรีลำปาง (Rungroj Rural Noodle - Boonsri Lampang) สาขาเชียงใหม่

📉 REDUCTION: 0 characters (0.0%)

🔍 FIELD: address
------------------------------------------------------------
📊 BEFORE (85 characters):
   111/3, หนองควาย หางดง เชียงใหม่ เชียงใหม่ (เลยบิ้กซีมินิมานิดหน่อย ร้านจะอยู่ซ้ายมือ)

📊 AFTER (84 characters):
   1113, หนองควาย หางดง เชียงใหม่ เชียงใหม่ (เลยบิ้กซีมินิมานิดหน่อย ร้านจะอยู่ซ้ายมือ)

📉 REDUCTION: 1 characters (1.2%)

🔍 FIELD: description_place
------------------------------------------------------------
📊 BEFO