In [None]:
%%capture
import urllib.request
import json

# Download the restaurant dataset (a JSON document) and parse it into Python
# objects. The `with` block closes the HTTP response as soon as the body has
# been read instead of leaving the connection open for the rest of the session.
url = "http://seppe.net/aa/assignment2/dataset.json"
with urllib.request.urlopen(url) as response:
    data = json.loads(response.read().decode())

# Flatten the parsed JSON into a data frame with one row per
# (restaurant, cuisine label) pair.
import pandas as pd

# Rows are collected in a plain list and turned into a DataFrame once.
# The previous row-by-row DataFrame.append copied the whole frame on every call
# (quadratic time) and no longer exists in pandas >= 2.0.
records = []
for i, result in enumerate(data):
    # Fields shared by every cuisine row of this restaurant; they are read
    # up front, so a missing key raises even for a restaurant with no cuisines.
    restaurant_fields = {
        'i': i,
        'identifier': result["identifier"],
        'objectID': result["objectID"],
        'lat': result['_geoloc']['lat'],
        'lng': result['_geoloc']['lng'],
        'country': result['country']['name'],
        'city': result['city']['name'],
    }
    # A restaurant with an empty "cuisines" list contributes no rows.
    for cuisine in result["cuisines"]:
        records.append({**restaurant_fields, 'cuisine_label': cuisine['label']})

# `columns` fixes the column order and keeps the columns present even when
# `records` is empty; column dtypes are inferred from the collected values.
df = pd.DataFrame(
    records,
    columns=['i', 'identifier', 'objectID', 'cuisine_label', 'lat', 'lng', 'country', 'city'],
)

In [None]:
# Preview the first rows, then summarise the columns (dtype and non-null count).
# df.info() replaces a leftover debug dump of vars(df), which printed the
# frame's private attribute dict ('_is_copy', ...) rather than anything about
# the data itself.
print(df.head(10))
df.info()

   i identifier objectID    cuisine_label        lat       lng  country  \
0  0     209384    78848            Greek  50.853009  4.345910  Belgium   
1  1     344373    79182    Modern French  50.852798  4.345864  Belgium   
2  2     114750    79041          Seafood  50.852796  4.345792  Belgium   
3  3     233186    79268    Modern French  50.852955  4.345066  Belgium   
4  4     113946    79301          Seafood  50.850924  4.347886  Belgium   
5  4     113946    79301  Classic Cuisine  50.850924  4.347886  Belgium   
6  5     264563    79040          Seafood  50.851104  4.349947  Belgium   
7  6    1188703    79360          Organic  50.856913  4.346896  Belgium   
8  6    1188703    79360   Market Cuisine  50.856913  4.346896  Belgium   
9  7     113876    78709         Japanese  50.850575  4.354412  Belgium   

       city  
0  Brussels  
1  Brussels  
2  Brussels  
3  Brussels  
4  Brussels  
5  Brussels  
6  Brussels  
7  Brussels  
8  Brussels  
9  Brussels  
{'_is_copy': None, '

In [None]:
# Frequency table of the raw cuisine labels: absolute count and share of all
# rows per label, most frequent label first.
cuisine_name = df['cuisine_label'].unique().tolist()
counts, percentages = (
    df['cuisine_label'].value_counts(normalize=as_share)
    for as_share in (False, True)
)
uniqueness_result = pd.concat(
    {'Counts': counts, 'Percentages': percentages},
    axis=1,
)
uniqueness_order = uniqueness_result.sort_values('Counts', ascending=False)
print(uniqueness_order)

                       Counts  Percentages
Modern Cuisine           3764     0.176988
Traditional Cuisine      1381     0.064936
Creative                 1225     0.057601
Contemporary             1076     0.050595
Market Cuisine            746     0.035078
...                       ...          ...
Burgundian                  1     0.000047
Egyptian                    1     0.000047
Anago / Saltwater Eel       1     0.000047
Hawaiian                    1     0.000047
Xinjiang                    1     0.000047

[275 rows x 2 columns]


In [None]:
# Raw labels that are merged into each of the four cuisines kept for the
# analysis (Chinese, French, Italian, Japanese).
cuisine_label_Chinese = [
    "Chinese", "Sichuan", "Cantonese", "Shanghainese", "Taiwanese", "Huaiyang",
    "Shandong", "Beijing Cuisine", "Taizhou", "Hunanese", "Yunnanese",
    "Chinese Contemporary", "Dumplings", "Hotpot", "Chao Zhou", "Jiangzhe",
    "Hubei", "Dongbei", "Hui Cuisine", "Ningbo", "Dim Sum", "Fujian",
    "Noodles and Congee", "Hang Zhou", "Guizhou", "Teochew", "Shaanxi",
    "Zhejiang", "Hainanese", "Xibei", "Cantonese Roast Meats", "Chiu Chow",
    "Xinjiang",
]
cuisine_label_French = [
    "Modern French", "Classic French", "Cuisine from South West France",
    "Creative French", "Lyonnaise", "Cuisine from Franche-Comté",
]
cuisine_label_Italian = [
    "Italian", "Sicilian", "Roman", "Italian Contemporary",
    "Italian and Japanese",
]
cuisine_label_Japanese = [
    "Japanese", "Sushi", "Yakiniku", "Ramen", "Japanese Contemporary",
    "Sukiyaki", "Japanese Steakhouse", "Tonkatsu", "Soba", "Shojin", "Oden",
    "Yoshoku", "Fugu / Pufferfish", "Kushiage", "Shabu-shabu",
]

In [None]:
# Map every fine-grained label to its parent cuisine. The groups are visited in
# the order Chinese, French, Italian, Japanese, so a label listed in more than
# one group would end up with the last group that contains it.
cuisine_mapping = {
    label: cuisine
    for cuisine, group_labels in (
        ('Chinese', cuisine_label_Chinese),
        ('French', cuisine_label_French),
        ('Italian', cuisine_label_Italian),
        ('Japanese', cuisine_label_Japanese),
    )
    for label in group_labels
}

# Labels that belong to none of the four groups are tagged 'Other'.
df['cuisine_new'] = df['cuisine_label'].apply(
    lambda raw_label: cuisine_mapping.get(raw_label, 'Other')
)

# Frequency table of the regrouped cuisines: absolute count and share of all
# rows per cuisine, largest group first.
cuisine_name = df['cuisine_new'].unique().tolist()
counts, percentages = (
    df['cuisine_new'].value_counts(normalize=as_share)
    for as_share in (False, True)
)
uniqueness_result = pd.concat(
    {'Counts': counts, 'Percentages': percentages},
    axis=1,
)
uniqueness_order = uniqueness_result.sort_values('Counts', ascending=False)
print(uniqueness_order)

          Counts  Percentages
Other      17763     0.835238
Japanese    1092     0.051347
Italian      977     0.045940
Chinese      843     0.039639
French       592     0.027837


In [None]:
# Keep only the rows assigned to one of the four cuisines and reduce the frame
# to two columns: the restaurant identifier (as 'id') and its cuisine.
# NOTE(review): a restaurant whose raw labels collapse into the same cuisine
# appears as repeated (id, cuisine) rows (e.g. id 477155 in the preview);
# confirm whether such duplicates are intended in the exported table.
df = (
    df.loc[df['cuisine_new'] != 'Other', ['identifier', 'cuisine_new']]
    .rename(columns={'identifier': 'id', 'cuisine_new': 'cuisine'})
)
print(df.head(10))

         id   cuisine
1    344373    French
3    233186    French
9    113876  Japanese
14   445310    French
18   306344    French
23   504797   Italian
30   477155   Italian
31   477155   Italian
41   114641    French
42  1201928   Italian


In [None]:
# Write the (id, cuisine) table to 'cuisine_df.csv' in the working directory;
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='cuisine_df.csv', index=False)