In [4]:
import pandas as pd
import json

In [5]:
# Read the crawled Naver Map results (UTF-8 encoded JSON) into `data`.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact location exists.
naver_map_results_path = '/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/bigcontest2024/crawling/naver-map-results.json'
with open(naver_map_results_path, encoding='utf-8') as f:
    data = json.load(f)

In [6]:
# Build one flat record per entry of `data`, keeping only the top-level
# fields needed for the missing-value overview. Absent fields become None.
eda_fields = (
    'MCT_NM',
    'ADDR',
    'OP_YMD',
    'unique_id',
    'image_url',
    'rating',
    'rating_count',
)
flattened_data = [
    {field: place.get(field) for field in eda_fields}
    for place in data.values()
]

# One row per entry, one column per selected field
df = pd.DataFrame(flattened_data)

# Per-column number of missing (None/NaN) values
missing_data = df.isna().sum()

# Express the same counts as a percentage of all rows
total_rows = len(df)
missing_data_percent = missing_data.div(total_rows).mul(100)

# Summary table: one row per column, showing count and percentage side by side
missing_stats = pd.DataFrame(
    {
        'Missing Count': missing_data,
        'Missing Percentage': missing_data_percent,
    }
)

In [7]:
# Display the per-column missing-value summary (count and percentage).
missing_stats

Unnamed: 0,Missing Count,Missing Percentage
MCT_NM,0,0.0
ADDR,0,0.0
OP_YMD,0,0.0
unique_id,0,0.0
image_url,364,4.326123
rating,3682,43.760399
rating_count,3682,43.760399


In [8]:
# Flatten coordinate and review statistics into one record per entry of `data`.
#
# `value.get(k) or {}` is used instead of `value.get(k, {})` so that an explicit
# JSON null (loaded as None) is handled like an absent key. With the
# default-argument form, a null 'coordinate' or 'review' raises
# AttributeError/TypeError before the missing-value checks below can run.

# Entries with fewer reviews than this are flagged in 'review_less_than_100'.
REVIEW_COUNT_THRESHOLD = 100

flattened_data_extended = []

for value in data.values():
    # Coordinates: lat/lng are None when the block is absent, null or incomplete
    coordinate = value.get('coordinate') or {}
    lat = coordinate.get('lat')
    lng = coordinate.get('lng')

    # Reviews: total count, and how many have an absent/empty 'visit_keywords'
    reviews = value.get('review') or {}
    review_count = len(reviews)
    visit_keywords_empty = sum(
        1 for r in reviews.values() if not r.get('visit_keywords')
    )

    flattened_data_extended.append({
        'MCT_NM': value.get('MCT_NM'),
        'lat': lat,
        'lng': lng,
        'review_count': review_count,
        'empty_visit_keywords_count': visit_keywords_empty,
    })

# Create DataFrame from extended flattened data
df_extended = pd.DataFrame(flattened_data_extended)

# Flag rows whose coordinates are missing
df_extended['lat_missing'] = df_extended['lat'].isnull()
df_extended['lng_missing'] = df_extended['lng'].isnull()

# Flag rows with fewer reviews than the threshold
df_extended['review_less_than_100'] = (
    df_extended['review_count'] < REVIEW_COUNT_THRESHOLD
)

# Share of reviews without visit keywords, in percent.
# Rows with review_count == 0 yield 0/0 = NaN and are skipped by .mean() below.
df_extended['empty_visit_keywords_percentage'] = (
    df_extended['empty_visit_keywords_count'] / df_extended['review_count'] * 100
)

# Single-row summary indexed 'Count'. The first three columns are row counts;
# the last one is a mean percentage, not a count.
missing_stats_extended = pd.DataFrame({
    'lat_missing': df_extended['lat_missing'].sum(),
    'lng_missing': df_extended['lng_missing'].sum(),
    'reviews_less_than_100': df_extended['review_less_than_100'].sum(),
    'average_empty_visit_keywords_percentage': df_extended['empty_visit_keywords_percentage'].mean()
}, index=['Count'])

In [9]:
# Display the one-row summary of coordinate and review statistics.
missing_stats_extended

Unnamed: 0,lat_missing,lng_missing,reviews_less_than_100,average_empty_visit_keywords_percentage
Count,0,0,4028,59.680928


In [10]:
# Display the per-entry frame; the rendered output is a truncated head/tail view.
df_extended

Unnamed: 0,MCT_NM,lat,lng,review_count,empty_visit_keywords_count,lat_missing,lng_missing,review_less_than_100,empty_visit_keywords_percentage
0,(주)산굼부리,33.434218,126.688763,100,83,False,False,False,83.000000
1,(주)삼다도횟집,33.519072,126.499271,100,80,False,False,False,80.000000
2,(주)서민당,33.498751,126.544903,100,76,False,False,False,76.000000
3,(주)엠브릿지,33.210116,126.257132,100,76,False,False,False,76.000000
4,(주)옹포바다횟집,33.403852,126.251705,100,10,False,False,False,10.000000
...,...,...,...,...,...,...,...,...,...
8409,히아담,33.476083,126.486466,100,89,False,False,False,89.000000
8410,히치하이커스라운지,33.499080,126.528776,100,37,False,False,False,37.000000
8411,히포파운드,33.485580,126.460482,100,75,False,False,False,75.000000
8412,힘찬장어,33.490493,126.492436,22,22,False,False,True,100.000000
