In [19]:
import pandas as pd
from googletrans import Translator

In [5]:
# 1. CSV 파일 읽기
df = pd.read_csv("data/전처리.csv", encoding="utf-8")

# 2. 데이터 탐색
print(df.info())
print(df.describe())

# 3. 숫자형 컬럼 변환
num_cols = ['1회 섭취량(g)', 'kcal', '탄수화물(g)', '단백질(g)', '지방(g)', '당류(g)']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

# 4. 결측치 확인 및 처리
print(df.isnull().sum())
df = df.dropna()

# 5. 중복 제거
df = df.drop_duplicates()

# 6. 숫자값 반올림 (예: 소수점 1자리)
df[num_cols] = df[num_cols].round(1)

# 7.고유 식별자 생성 – 예를 들어, 식품종류와 상세분류을 결합
df['고유ID'] = df['식품종류'] + "_" + df['상세분류']

# 8. 필요없는 columns 제거
df = df.drop(['Unnamed: 0', '식품종류', '상세분류'], axis=1)

# 9. column 정렬
df_col = ['고유ID', '식품명', '1회 섭취량(g)', 'kcal', '탄수화물(g)', '단백질(g)', '지방(g)', '당류(g)']
df = df[df_col]

# 10. 전처리 완료된 데이터를 새 CSV로 저장
df.to_csv("data/data1.csv", index=False, encoding="utf-8")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52940 entries, 0 to 52939
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  52940 non-null  int64  
 1   식품종류        52940 non-null  object 
 2   식품명         52940 non-null  object 
 3   상세분류        52940 non-null  object 
 4   1회 섭취량(g)   52940 non-null  float64
 5   kcal        52940 non-null  object 
 6   탄수화물(g)     52940 non-null  object 
 7   단백질(g)      52940 non-null  object 
 8   지방(g)       52940 non-null  object 
 9   당류(g)       52940 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 4.0+ MB
None
         Unnamed: 0       1회 섭취량(g)
count  52940.000000    52940.000000
mean   26469.500000      378.158917
std    15282.605962    15055.322609
min        0.000000        0.030000
25%    13234.750000       70.000000
50%    26469.500000      100.000000
75%    39704.250000      189.000000
max    52939.000000  1000000.000000
Unnamed:

In [4]:
df.head()

Unnamed: 0,고유ID,식품명,1회 섭취량(g),kcal,탄수화물(g),단백질(g),지방(g),당류(g)
0,음식_육류구이,꿩불고기,500.0,368.8,39.7,33.5,8.5,16.9
1,음식_육류구이,닭갈비,400.0,595.6,44.9,45.9,25.8,21.2
2,음식_육류구이,닭갈비,300.0,558.5,23.1,45.5,31.6,8.5
3,음식_육류구이,닭꼬치,70.0,176.7,13.3,11.6,8.6,3.2
4,음식_채소류구이,더덕구이,100.0,184.0,31.1,3.1,5.2,11.6


# json file data 정리

In [6]:
# json 전처리 data read
food_json = pd.read_csv('data/json_food_data.csv')
print(food_json.head())

             Code Name              식품명  1회 섭취량(g)  당류(g)
0  B020401XX_00865.jpg  bltsaendeuwichi        NaN    NaN
1  B020401XX_00866.jpg  bltsaendeuwichi        NaN    NaN
2  B020401XX_00866.jpg  bltsaendeuwichi        NaN    NaN
3  B020401XX_00867.jpg  bltsaendeuwichi        NaN    NaN
4  B020401XX_00867.jpg  bltsaendeuwichi        NaN    NaN


In [None]:
food_json.shape

In [17]:
# 두파일 coloumns 확인
# 기존 음식 data
print(df.columns)
print()
# json 파일 data
print(food_json.columns)

Index(['고유ID', '식품명', '1회 섭취량(g)', 'kcal', '탄수화물(g)', '단백질(g)', '지방(g)',
       '당류(g)'],
      dtype='object')

Index(['Code Name', '식품명', '1회 섭취량(g)', '당류(g)'], dtype='object')


In [23]:
import pandas as pd
import time
from googletrans import Translator

# 1️⃣ 데이터 로드
json_food_df = pd.read_csv("data/json_food_data.csv")

# 2️⃣ 'Code Name' 컬럼 제거
json_food_df = json_food_df.drop(columns=["Code Name"], errors="ignore")

# 3️⃣ 중복 제거 (고유한 식품명만 추출)
unique_food_names = json_food_df["식품명"].dropna().unique()

# 4️⃣ 번역기 객체 생성
translator = Translator()

# 5️⃣ 영어식품명 → 한글 변환 함수
def translate_text(text):
    try:
        if not isinstance(text, str) or text.strip() == "":  # 빈 값 체크
            return text
        translated = translator.translate(text, src='en', dest='ko')  # 번역 실행
        time.sleep(1)  # Google API 차단 방지
        return translated.text
    except Exception as e:
        print(f"⚠️ 번역 오류: {text} - {e}")
        return text  # 오류 발생 시 원본 반환

# 6️⃣ 중복 없는 데이터만 번역
translated_dict = {name: translate_text(name) for name in unique_food_names}

# 7️⃣ 원본 데이터에 번역된 값 적용
json_food_df["식품명"] = json_food_df["식품명"].map(translated_dict)

# 8️⃣ 저장
json_food_df.to_csv("json_food_data_translated.csv", index=False, encoding="utf-8-sig")
print("✅ 번역 완료: json_food_data_translated.csv 저장 완료")


✅ 번역 완료: json_food_data_translated.csv 저장 완료


In [24]:
jfdt = pd.read_csv('json_food_data_translated.csv')
jfdt

Unnamed: 0,식품명,1회 섭취량(g),당류(g)
0,Bltsaendeuwichi,,
1,Bltsaendeuwichi,,
2,Bltsaendeuwichi,,
3,Bltsaendeuwichi,,
4,Bltsaendeuwichi,,
...,...,...,...
3133526,Huinjug,,
3133527,Huinjug,,
3133528,Huinjug,,
3133529,Huinjug,,


In [32]:
unique_food_names

array(['bltsaendeuwichi', 'galaetteog', 'galibi', 'gaji', '가지',
       'gacheudong', 'ganjang', 'galbigu-i', 'galbijjim', 'galbitang',
       'gam', 'tangerine_juice', 'gamjagu-i', 'gamjageulatang',
       'gamjabokk-eum', 'gamjasaelleodeu', 'gamjaseupeu', 'potato_soup',
       'potato_chips', 'gamjachib', 'french_fries', 'gamjatwigim',
       'geobongpodo', 'geonjadu', 'geonkeulaenbeli', 'geonpodo',
       'gemas-sal', 'gejang', 'gyeojasoseu', 'mustard_sauce', 'gyeongdan',
       'gogalbi', 'goguma', 'gungoguma', 'mackerel', 'godeung-eo',
       'godeung-eogu-i', 'goloke', 'goleugonjolla', 'goleugonjollapija',
       'gochu', 'chili', 'gochujang', 'gochujang-ajji', 'gochusgalu',
       'goldkiwi', 'gobchangjeongol', 'gojgam', 'gwailjuseu',
       'gwailchaesosaelleodeu', 'fruit_vegetable_salad', 'guaba', 'guava',
       'gugsu', 'gughwacha', 'gunmandu', 'gunbam', 'gyul', 'geulatang',
       'geulin-ollibeu', 'gim', 'gimgu-i', 'gimmakki', 'gimbab',
       'gimchigug', 'gimchibokk-eumba

In [31]:
jfdt['식품명'].dropna().unique()

array(['Bltsaendeuwichi', 'Galaetteog', '갈리비', '가지', 'Gacheudong', '간장',
       'Galbigu-i', '갈비함', '갈비당', '게임', 'Tangerine_juice', 'Gamjagu-i',
       'Gamjageulatang', 'gamjabokk-eum', 'gamjasaelleodeu',
       'Gamjaseupeu', 'patato_soup', 'patato_chips', 'Gamjachib', '프랑스어',
       'gamjatwigim', 'Geobongpodo', 'genjadu', 'geonkeulaenbeli',
       'geonpodo', 'Gemas-sal', '제정', 'Gyeojasoseu', '머스타드 _sauce',
       'Gyeongdan', 'Gogalbi', 'Goguma', 'Gungoguma', '고등어', 'Godeung-eo',
       'Godeung-Eogu-i', '골로크', 'Goleugonjolla', 'goleugonjollapija',
       '고쿠', '칠리', '고추 장', 'Gochujang-Ajji', 'GOCHUSGALU', 'Goldkiwi',
       'GOBCHANGJEONGOL', 'gojgam', 'Gwailjuseu', 'Gwailchaesosaelleodeu',
       'fruit_vegetable_salad', '구아바', '구그수', '구르와 차', '건만두', '건 햄',
       'Gyul', 'Geulatang', 'Geulin-Ollibeu', '김', 'Gimgu-i', 'Gimmakki',
       '짐바브', 'Gimchigug', 'Gimchibokk-Eumbab', 'Gimchijjigae',
       'KKALEUBOLANA', 'Kkamangbeleuchijeu', 'Kkanpung-Gi', 'KKEOM',
       '조가비', 'Kku