In [46]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the assignment folder on the mounted drive,
# so that relative paths resolve against it.
%cd /content/drive/My Drive/Assignment2

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Assignment2


In [56]:
%%capture
import json
import urllib.request

import pandas as pd

# Download the dataset (a JSON document) from the given URL.
url = "http://seppe.net/aa/assignment2/dataset.json"
# Open the response as a context manager so the underlying HTTP connection is
# closed even when reading or JSON parsing raises.
with urllib.request.urlopen(url) as response:
    data = json.loads(response.read().decode())

# Start from an empty frame that declares the three output columns.
df = pd.DataFrame(columns=['image_id', 'cuisine_type', 'country'])

def extract_values(obj, keys):
    """Collect every value stored under one of ``keys`` in a nested JSON object.

    Dicts and lists are walked depth-first in their natural order. When a dict
    key matches, its value is collected as-is and not searched any further;
    non-matching dict/list values are descended into.

    Args:
        obj: Parsed JSON data (dict, list or scalar).
        keys: Container of key names to look for.

    Returns:
        list: The matching values, in traversal order.
    """

    def walk(node):
        """Yield the matching values found underneath ``node``."""
        if isinstance(node, dict):
            for name, value in node.items():
                if name in keys:
                    yield value
                elif isinstance(value, (dict, list)):
                    yield from walk(value)
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(obj))

# Collect one dict per output row and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it once per row copies the whole frame every time (quadratic cost).
records = []
for record in data:
    # Every value stored under a "name" key anywhere in the record.
    # NOTE(review): this matches any nested "name", not only the country's;
    # confirm against the dataset schema.
    country = extract_values(record, ["name"])

    # Image ids from the 'more_details' section, cuisine labels from 'cuisines'.
    image_id = extract_values(record['more_details'], ['image_id'])
    cuisine_type = extract_values(record['cuisines'], ['label'])

    # zip pairs the three lists positionally and stops at the shortest one.
    for ctry, img_id, cuisine in zip(country, image_id, cuisine_type):
        records.append({
            'image_id': img_id,
            'cuisine_type': cuisine,
            'country': ctry
        })

# dtype=object keeps every column as object dtype, matching the frame that the
# row-by-row append used to produce.
df = pd.DataFrame(records, columns=['image_id', 'cuisine_type', 'country'], dtype=object)


In [57]:
# Show the first ten rows, then the frame's instance attribute dict.
# NOTE(review): vars(df) exposes pandas internals and looks like leftover debugging.
print(df.head(10), vars(df), sep='\n')
# Duplicate the raw cuisine column under the name used by the later cells.
df['cuisine_label'] = df['cuisine_type']

  image_id    cuisine_type   country
0  2088757           Greek  Brussels
1  1956014   Modern French  Brussels
2  2200565         Seafood  Brussels
3  2640075   Modern French  Brussels
4  1955141         Seafood  Brussels
5  2076232         Seafood  Brussels
6  5637014         Organic  Brussels
7  5637083  Market Cuisine   Belgium
8  1955135        Japanese  Brussels
9  4186019         Belgian  Brussels
{'_is_copy': None, '_mgr': BlockManager
Items: Index(['image_id', 'cuisine_type', 'country'], dtype='object')
Axis 1: RangeIndex(start=0, stop=20431, step=1)
ObjectBlock: slice(0, 3, 1), 3 x 20431, dtype: object, '_item_cache': {}, '_attrs': {}, '_flags': <Flags(allows_duplicate_labels=True)>}


In [58]:
# Frequency table of the raw cuisine labels, most frequent first.
cuisine_name = df['cuisine_label'].unique().tolist()
counts = df['cuisine_label'].value_counts()
percentages = df['cuisine_label'].value_counts(normalize=True)  # fractions in [0, 1]
# Place absolute counts and relative shares side by side, one row per label.
uniqueness_result = pd.concat({'Counts': counts, 'Percentages': percentages}, axis=1)
uniqueness_order = uniqueness_result.sort_values(by='Counts', ascending=False)
print(uniqueness_order)

                        Counts  Percentages
Modern Cuisine            3656     0.178944
Traditional Cuisine       1296     0.063433
Creative                  1216     0.059517
Contemporary              1047     0.051246
Italian                    702     0.034360
...                        ...          ...
Guizhou                      1     0.000049
Hawaiian                     1     0.000049
Hunanese and Sichuan         1     0.000049
Shellfish Specialities       1     0.000049
Xinjiang                     1     0.000049

[273 rows x 2 columns]


In [59]:
# Group the fine-grained labels into four broad cuisines:
# Chinese, French, Italian and Japanese.
# NOTE(review): the label 'Hunanese and Sichuan' appears in the data but is not
# listed under Chinese; confirm whether that omission is intended.
cuisine_label_Chinese = [
    'Chinese', 'Sichuan', 'Cantonese', 'Shanghainese', 'Taiwanese',
    'Huaiyang', 'Shandong', 'Beijing Cuisine', 'Taizhou',
    'Hunanese', 'Yunnanese', 'Chinese Contemporary', 'Dumplings',
    'Hotpot', 'Chao Zhou', 'Jiangzhe', 'Hubei', 'Dongbei', 'Hui Cuisine',
    'Ningbo', 'Dim Sum', 'Fujian', 'Noodles and Congee', 'Hang Zhou',
    'Guizhou', 'Teochew', 'Shaanxi', 'Zhejiang', 'Hainanese', 'Xibei',
    'Cantonese Roast Meats', 'Chiu Chow', 'Xinjiang',
]
cuisine_label_French = [
    'Modern French', 'Classic French', 'Cuisine from South West France',
    'Creative French', 'Lyonnaise', 'Cuisine from Franche-Comté',
]
cuisine_label_Italian = [
    'Italian', 'Sicilian', 'Roman', 'Italian Contemporary', 'Italian and Japanese',
]
cuisine_label_Japanese = [
    'Japanese', 'Sushi', 'Yakiniku', 'Ramen', 'Japanese Contemporary', 'Sukiyaki',
    'Japanese Steakhouse', 'Tonkatsu', 'Soba', 'Shojin', 'Oden',
    'Yoshoku', 'Fugu / Pufferfish', 'Kushiage', 'Shabu-shabu',
]

In [60]:
# Map every fine-grained label to its broad cuisine. The groups are visited in
# the order Chinese, French, Italian, Japanese; a label listed in more than one
# group would end up in the last group that contains it.
cuisine_mapping = {
    label: group
    for group, labels in [
        ('Chinese', cuisine_label_Chinese),
        ('French', cuisine_label_French),
        ('Italian', cuisine_label_Italian),
        ('Japanese', cuisine_label_Japanese),
    ]
    for label in labels
}

# Labels that are not in the mapping fall into the catch-all 'Other' class.
df['cuisine_new'] = df['cuisine_label'].apply(
    lambda label: cuisine_mapping.get(label, 'Other')
)

# Frequency table of the grouped cuisine classes, most frequent first.
cuisine_name = df['cuisine_new'].unique().tolist()
counts = df['cuisine_new'].value_counts()
percentages = df['cuisine_new'].value_counts(normalize=True)  # fractions in [0, 1]
# Place absolute counts and relative shares side by side, one row per class.
uniqueness_result = pd.concat({'Counts': counts, 'Percentages': percentages}, axis=1)
uniqueness_order = uniqueness_result.sort_values(by='Counts', ascending=False)
print(uniqueness_order)

          Counts  Percentages
Other      17045     0.834271
Japanese    1061     0.051931
Italian      945     0.046253
Chinese      813     0.039792
French       567     0.027752


In [61]:
# df_new is a second name for the same DataFrame object (no copy is made).
df_new = df
# Preview the first ten rows, including the two derived cuisine columns.
print(df_new.iloc[:10])

  image_id    cuisine_type   country   cuisine_label cuisine_new
0  2088757           Greek  Brussels           Greek       Other
1  1956014   Modern French  Brussels   Modern French      French
2  2200565         Seafood  Brussels         Seafood       Other
3  2640075   Modern French  Brussels   Modern French      French
4  1955141         Seafood  Brussels         Seafood       Other
5  2076232         Seafood  Brussels         Seafood       Other
6  5637014         Organic  Brussels         Organic       Other
7  5637083  Market Cuisine   Belgium  Market Cuisine       Other
8  1955135        Japanese  Brussels        Japanese    Japanese
9  4186019         Belgian  Brussels         Belgian       Other


In [62]:
import random

# Down-sample the dominant 'Other' class to 800 rows (fixed seed) so its size is
# comparable to the four named cuisines; keep only the id and class columns.
random_other = (
    df_new.loc[df_new['cuisine_new'].eq('Other')]
    .sample(n=800, random_state=42)
    .loc[:, ['image_id', 'cuisine_new']]
)
# All rows that belong to one of the four named cuisines, same two columns.
Cuisines = df_new.loc[df_new['cuisine_new'].ne('Other'), ['image_id', 'cuisine_new']]
# Stack named cuisines on top of the sampled 'Other' rows and shorten the column names.
df_output = pd.concat([Cuisines, random_other]).rename(
    columns={'image_id': 'id', 'cuisine_new': 'cuisine'}
)
print("Number of rows:", len(df_output))
print(df_output.head(10))

Number of rows: 4186
         id   cuisine
1   1956014    French
3   2640075    French
8   1955135  Japanese
13  2638290    French
15  1869678    French
20  1956968   Italian
27  1984981   Italian
28  1984983   Italian
38  2229171    French
39  4609033   Italian


In [63]:
# Write the id/cuisine table to a CSV file in the current working directory,
# leaving out the DataFrame index.
df_output.to_csv(path_or_buf='cuisine_df.csv', index=False)