In [1]:
import pandas as pd
import numpy as np
import cv2
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import os
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from mpstemmer import MPStemmer
%matplotlib inline

In [2]:
# Read the three raw inputs in one pass: item metadata, labelled train set, test set.
meta_df, train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/meta.csv',
        'data/food_tagging_train.csv',
        'data/food_tagging_test.csv',
    )
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Left-join the item metadata onto each split by item_id; columns present in both
# inputs (e.g. photo) receive pandas' default _x/_y suffixes.
merge_on_item = dict(how='left', on=["item_id"])
test_meta_df = test_df.merge(meta_df, **merge_on_item)
train_meta_df = train_df.merge(meta_df, **merge_on_item)
train_meta_df.columns

Index(['item_id', 'photo_x', 'name', 'price', 'Asian', 'Babi', 'Bakso',
       'Bebek', 'Beverages', 'Boiled', 'Bubur', 'Burger', 'Cake_and_Bread',
       'Chicken', 'Chinese', 'Coffee', 'Dessert_Sweet', 'Egg', 'Fish', 'Fries',
       'Gado2', 'Gorengan', 'Grilled', 'Indonesian', 'Italian', 'Japanese',
       'Kukus', 'Main_Course', 'Martabak', 'Middle_Eastern', 'Milk', 'Noodle',
       'Organic', 'Pempek', 'Personal', 'Pizza', 'Red_Meat', 'Rice', 'Salty',
       'Sate', 'Sauce', 'Seafood', 'Set_Menu', 'Sharing', 'Siomay',
       'Snack_Appetizer', 'Soup', 'Sour', 'Spicy', 'Tea', 'Vegetable',
       'Western', 'photo_y', 'item_name', 'menu_name', 'item_price_amt',
       'item_description', 'menu_active_flag', 'outlet_name'],
      dtype='object')

In [4]:
def load_stop_words(path="stopwords-id.txt"):
    """Load a one-word-per-line stop-word file and ensure NLTK's stopwords corpus exists.

    Parameters
    ----------
    path : str, optional
        Text file containing one stop word per line. Defaults to the
        ``stopwords-id.txt`` file used by this notebook (presumably the
        Indonesian list -- confirm against the file's provenance).

    Returns
    -------
    set of str
        The stripped, non-empty lines of the file.

    Notes
    -----
    As a side effect this downloads the NLTK ``stopwords`` corpus (a no-op when
    it is already up to date), which ``nltk.corpus.stopwords`` needs.
    """
    nltk.download('stopwords')
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding="utf-8") as fh:
        stripped = (line.strip() for line in fh)
        # Skip blank lines so the set never contains the empty string.
        return {word for word in stripped if word}

# Module-level resources shared by the text-cleaning code: the stop-word set
# (built first, which also triggers the NLTK corpus download) and the stemmer.
stopwords_id, stemmer = load_stop_words(), MPStemmer()

_NLTK_STOPWORDS = None  # filled on first use by _nltk_stopwords()


def _nltk_stopwords():
    """Return NLTK's stop words (all languages, as `stopwords.words()` gives) as a frozenset, built once."""
    global _NLTK_STOPWORDS
    if _NLTK_STOPWORDS is None:
        # stopwords.words() re-reads the corpus on every call; cache the result
        # so membership tests are O(1) instead of a scan per token.
        _NLTK_STOPWORDS = frozenset(stopwords.words())
    return _NLTK_STOPWORDS


def preprocess(txt):
    """Normalise a piece of menu text into a space-joined string of stemmed tokens.

    Steps: collapse runs of non-word characters into single spaces, lowercase,
    tokenise on ``\\w+``, drop tokens of length <= 2 and tokens found in either
    ``stopwords_id`` or NLTK's stop-word lists, then stem the survivors with
    the module-level ``stemmer``.

    Parameters
    ----------
    txt : str
        Raw text. Must be a string; missing values have to be filled by the caller.

    Returns
    -------
    str
        The stemmed tokens joined by spaces. If filtering removes every token,
        the unfiltered tokens joined by spaces; if there are no tokens at all,
        the lowercased, whitespace-normalised input.
    """
    txt = re.sub(r'\W+', ' ', txt).lower()
    # The original also called txt.replace("[^a-zA-Z]", " "). str.replace is
    # literal (not a regex) and '[' cannot survive the \W+ substitution above,
    # so it never matched; it is dropped here and digits are kept as before.
    tokens = RegexpTokenizer(r'\w+').tokenize(txt)
    nltk_stop = _nltk_stopwords()
    filtered_words = [
        w for w in tokens
        if len(w) > 2 and w not in stopwords_id and w not in nltk_stop
    ]
    # One stem per filtered word, so this list is empty exactly when
    # filtered_words is empty (the original's `elif filtered_words` branch
    # could therefore never run).
    stemmed_words = [stemmer.stem(w) for w in filtered_words]
    if stemmed_words:
        return " ".join(stemmed_words)
    if tokens:
        return " ".join(tokens)
    return txt

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/santhosh.mohan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def _prepare_text_and_price(frame):
    """Clean the text columns and log-scale the price of one merged frame, in place.

    The same steps were previously written out twice (once for train, once for
    test); keeping them in one function guarantees both splits are treated
    identically.

    Steps, in order:
      1. Fill missing ``item_description``, ``menu_name`` and ``outlet_name`` with ''.
      2. Append ``item_description`` to ``name``.
      3. Replace ``price`` with ``log2(price)``.
      4. Run ``preprocess`` over ``name``, ``menu_name`` and ``outlet_name``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the columns named above. It is modified in place, so
        calling this twice on the same frame applies log2 twice.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenient reassignment.
    """
    for col in ('item_description', 'menu_name', 'outlet_name'):
        frame[col] = frame[col].fillna('')
    frame['name'] = frame['name'] + ' ' + frame['item_description']
    frame['price'] = np.log2(frame['price'])
    for col in ('name', 'menu_name', 'outlet_name'):
        frame[col] = frame[col].apply(preprocess)
    return frame


train_meta_df = _prepare_text_and_price(train_meta_df)
test_meta_df = _prepare_text_and_price(test_meta_df)
test_meta_df.head()

Unnamed: 0,item_id,photo_x,name,price,photo_y,item_name,menu_name,item_price_amt,item_description,menu_active_flag,outlet_name
0,27747868,b058aec5-244a-4d4d-9c3c-de4e7a705c2e.jpg,ayam goreng,13.425216,b058aec5-244a-4d4d-9c3c-de4e7a705c2e.jpg,Ayam Goreng,makan,11000.0,,True,penyet kobis kaliurang
1,34405470,2d1fc8be-6ede-4beb-b38a-14663bff53b8.jpg,sop iga sapi mengggunakan iga sapi pilih padu ...,14.872675,2d1fc8be-6ede-4beb-b38a-14663bff53b8.jpg,Sop Iga Sapi,sop,30000.0,Mengggunakan Iga Sapi Pilihan Dipadukan Deng...,True,sate domba rahmah khas betaw jatiwarna
2,28552600,5060313c-1e8c-48e3-8c30-596d85b087ee.jpg,bebek cabe ijo,14.580494,5060313c-1e8c-48e3-8c30-596d85b087ee.jpg,Bebek Cabe Ijo,menu,24500.0,,True,bebek kantor wtc rpong
3,34350193,1088feab-9cd7-44e3-ad1e-c8d3e6203d72.jpg,paket menaskot ikan nila,14.773139,1088feab-9cd7-44e3-ad1e-c8d3e6203d72.jpg,Paket Naskot Ikan Nila\t,nasi kotak,28000.0,,True,ayam bakar bogor
4,28009727,5c19ddfe-9569-4ea9-a404-846ba2c4ee21.jpg,ayam goreng,14.251187,5c19ddfe-9569-4ea9-a404-846ba2c4ee21.jpg,Ayam Goreng,lauk pauk,21000.0,,True,restoran garuda adam malik


In [6]:
# Metadata columns that are not carried into the saved datasets
# (item_description has already been folded into `name`).
unused_meta_cols = ['photo_y', 'item_name', 'item_price_amt', 'item_description', 'menu_active_flag']
train_meta_df = train_meta_df.drop(columns=unused_meta_cols)
test_meta_df = test_meta_df.drop(columns=unused_meta_cols)

In [7]:
# Min-max scale `price` to [0, 1] using the minimum and maximum taken over the
# train and test rows together, so both splits share one scale.
new_df = pd.concat([train_meta_df, test_meta_df])
price_min, price_max = new_df['price'].min(), new_df['price'].max()
new_price = (new_df['price'] - price_min) / (price_max - price_min)

# The concatenated Series carries duplicate index labels (each frame's own
# labels are kept), so write the values back by position rather than relying
# on label alignment with each frame's index.
n_train = len(train_meta_df)
train_meta_df["price"] = new_price.iloc[:n_train].to_numpy()
test_meta_df["price"] = new_price.iloc[n_train:].to_numpy()

In [8]:
# Display the first rows of the test frame as this cell's output.
test_meta_df.head()

Unnamed: 0,item_id,photo_x,name,price,menu_name,outlet_name
0,27747868,b058aec5-244a-4d4d-9c3c-de4e7a705c2e.jpg,ayam goreng,0.409342,makan,penyet kobis kaliurang
1,34405470,2d1fc8be-6ede-4beb-b38a-14663bff53b8.jpg,sop iga sapi mengggunakan iga sapi pilih padu ...,0.580614,sop,sate domba rahmah khas betaw jatiwarna
2,28552600,5060313c-1e8c-48e3-8c30-596d85b087ee.jpg,bebek cabe ijo,0.546041,menu,bebek kantor wtc rpong
3,34350193,1088feab-9cd7-44e3-ad1e-c8d3e6203d72.jpg,paket menaskot ikan nila,0.568836,nasi kotak,ayam bakar bogor
4,28009727,5c19ddfe-9569-4ea9-a404-846ba2c4ee21.jpg,ayam goreng,0.507076,lauk pauk,restoran garuda adam malik


In [9]:
x = train_meta_df
# Column positions 4..51 are used as the multilabel target for stratification
# (the tag indicator columns in this notebook's column layout).
y = x.iloc[:, 4:52]

msss = MultilabelStratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
# Two splits are generated and only the last one is kept.
*_, (train_index, test_index) = msss.split(x, y)
x_train = x.iloc[train_index, :]
x_test = x.iloc[test_index, :]

In [10]:
# New column order by position: 0-3 first, then 52-53, then 4-51
# (this moves the two trailing columns in front of the tag columns).
column_order = [*range(0, 4), *range(52, 54), *range(4, 52)]
x_train = x_train.iloc[:, column_order]
x_test = x_test.iloc[:, column_order]

# Persist the three datasets without the DataFrame index.
for frame, csv_path in (
    (x_train, 'data/training_data.csv'),
    (x_test, 'data/validation_data.csv'),
    (test_meta_df, 'data/test_data.csv'),
):
    frame.to_csv(csv_path, index=False)

In [11]:
# Display the first rows of the training split as this cell's output.
x_train.head()

Unnamed: 0,item_id,photo_x,name,price,menu_name,outlet_name,Asian,Babi,Bakso,Bebek,...,Set_Menu,Sharing,Siomay,Snack_Appetizer,Soup,Sour,Spicy,Tea,Vegetable,Western
0,33409629,f72b2054-cc91-412d-92f9-cb2ab5c45ff4.jpg,tongseng sapi,0.64983,recommended,sate luwes menu lengkap jam sukabumi,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,30136906,75fb2afb-43f3-4686-a618-bbd1828a444d.jpg,rendang beef pepper rice,0.638053,makan,acakadut,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33854213,19df908f-5b03-4767-a379-eeec0a08c5d0.jpg,pisang crispy keju,0.409342,makan,erni burger pimpin,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,28333063,da0e8ead-e0d7-427b-bb56-ae82f377baf0.jpg,bubur ayam,0.54949,menu,bubur ayam akiong muara karang,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,28171697,4e9585b4-f0e6-4440-8d06-347c145b79c4.jpg,bebek dower saos telor asin,0.638053,alacart,bebek dower pejaten village,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [20]:
def _drop_rows_missing_names(csv_path):
    """Rewrite a saved dataset without rows whose menu_name or outlet_name is missing.

    The file at ``csv_path`` is read, rows with a null ``menu_name`` or
    ``outlet_name`` are removed, and the result is written back to the same
    path (without the DataFrame index).

    On the first pass the pre-drop row labels are kept as an ``index`` column,
    exactly as the original ``reset_index()`` call did. If the file already
    has that column (the cell was run before), the index is simply discarded,
    so re-running no longer piles up ``level_0`` columns or raises.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to clean in place.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to ``csv_path``.
    """
    frame = pd.read_csv(csv_path)
    cleaned = frame.dropna(subset=['menu_name', 'outlet_name'])
    cleaned = cleaned.reset_index(drop='index' in cleaned.columns)
    cleaned.to_csv(csv_path, index=False)
    return cleaned


# NOTE(review): the original bound the validation data to `train_df` as well;
# each split now gets its own name.
train_df = _drop_rows_missing_names('data/training_data.csv')
valid_df = _drop_rows_missing_names('data/validation_data.csv')
