In [None]:
# load data libraries
import numpy as np # linear algebra library
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import csr_matrix
import zipfile # to read zip files
import matplotlib.pyplot as plot

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input/'):
    for name in files:
        print(os.path.join(root, name))

# Logistic Regression 

In [None]:
   
class LogisticRegression():
    """One-vs-rest logistic regression trained with batch gradient descent.

    One independent binary classifier (sigmoid output) is fitted per label and
    the resulting weight vectors are stacked row-wise in ``W_all``.
    """

    def __init__(self, X, Y, num_label, learning_rate, iter, lmda):
        """Store the training data and hyper-parameters.

        Args:
            X: feature matrix with one row per sample. It must provide
                ``.shape``, ``.dot`` and ``.T`` (e.g. a NumPy array or a
                SciPy sparse matrix).
            Y: one class label per sample; labels are compared against
                ``range(num_label)`` during ``fit``.
            num_label: number of distinct classes.
            learning_rate: gradient-descent step size.
            iter: number of gradient-descent iterations run per label.
            lmda: regularisation strength. NOTE: it is stored and forwarded to
                ``gradient_descent`` but is not applied to the update yet.
        """
        self.X = X  # (N, K)
        self.Y = Y  # (N,)
        self.learning_rate = learning_rate
        self.iter = iter
        self.lmda = lmda
        self.N = len(Y)
        self.num_K = X.shape[1]
        self.num_label = num_label
        # Single binary weight vector read by `cost_function`; without it that
        # method raised AttributeError.
        self.W = np.zeros(self.num_K)  # (K,)
        # One weight vector per label.
        self.W_all = np.zeros((self.num_label, self.num_K))  # (num_label, K)

    def make_random(self):
        """Return one float drawn uniformly from [-5, 5)."""
        return np.random.uniform(low=-5, high=5)

    def sigmoid(self, x):
        """Element-wise logistic function 1 / (1 + exp(-x)).

        Written as exp(-log(1 + exp(-x))) so that large negative inputs do not
        overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0, -x))

    @staticmethod
    def softmax(X):
        """Row-wise softmax of a 2-D array.

        The row maximum is subtracted before exponentiating; this leaves the
        result unchanged mathematically and prevents overflow.
        """
        X = np.asarray(X)
        X_exp = np.exp(X - X.max(axis=1, keepdims=True))
        partition = X_exp.sum(axis=1, keepdims=True)
        return X_exp / partition  # broadcasting divides each row by its sum

    @staticmethod
    def _cross_entropy(Y, h, eps=1e-15):
        """Summed binary cross-entropy; ``h`` is clipped away from 0 and 1."""
        h = np.clip(h, eps, 1 - eps)  # avoids log(0) -> -inf / nan
        return -np.sum(Y * np.log(h) + (1 - Y) * np.log(1 - h))

    def cost_function(self):
        """Return the un-normalised binary cross-entropy of ``self.W``.

        NOTE: ``self.W`` starts as zeros and is not modified by ``fit``; assign
        it explicitly to evaluate a particular weight vector.
        """
        h = self.sigmoid(self.X.dot(self.W))  # (N,)
        return self._cross_entropy(np.asarray(self.Y, dtype=float), h)

    def gradient_descent(self, X, Y, lmda):
        """Fit one binary classifier with ``self.iter`` full-batch steps.

        Args:
            X: feature matrix, one row per sample.
            Y: binary targets (0/1 or bool), one per sample.
            lmda: accepted for interface compatibility; currently unused.

        Returns:
            ``(W, costs)``: the learned weight vector of length ``num_K`` and
            the mean cross-entropy measured before each update.
        """
        Y = np.asarray(Y, dtype=float)
        m = len(Y)
        W = np.zeros(self.num_K)
        costs = np.zeros(self.iter)

        for i in range(self.iter):
            # y^hat = sigmoid(XW)
            y_hat = self.sigmoid(X.dot(W))

            # Mean cross-entropy at the current weights.
            costs[i] = self._cross_entropy(Y, y_hat) / m

            # Gradient of the mean cross-entropy with respect to W.
            grad = X.T.dot(y_hat - Y) / m

            # Move parameters against the gradient.
            W = W - self.learning_rate * grad

        return W, costs

    def fit(self):
        """Train one classifier per label on the data given to ``__init__``.

        Returns:
            ``(W_all, costs_label)`` with shapes ``(num_label, num_K)`` and
            ``(num_label, iter)``.
        """
        # Use the labels stored on the instance rather than a global `Y`.
        labels = np.asarray(self.Y)
        costs_label = np.zeros((self.num_label, self.iter))
        for l in range(self.num_label):
            target = (labels == l).astype(float)
            self.W_all[l], costs_label[l] = self.gradient_descent(self.X, target, self.lmda)
            print(f'{l}th label')

        return self.W_all, costs_label

    def pridict(self, x):
        """Return the per-label sigmoid scores for ``x``.

        ``x`` is either one feature vector of length ``num_K`` or a matrix
        with one such vector per row.
        """
        return self.sigmoid(np.dot(x, self.W_all.T))

    def predict(self, x):
        """Correctly spelled alias of ``pridict`` (kept for existing callers)."""
        return self.pridict(x)

# Data preprocessing

In [None]:
# Read the train/test JSON files straight out of their zip archives.
# The context managers close each archive (and the member file) once the
# corresponding frame has been loaded.
with zipfile.ZipFile('/kaggle/input/whats-cooking/train.json.zip') as archive_train:
    with archive_train.open('train.json') as train_file:
        train_data = pd.read_json(train_file)

with zipfile.ZipFile('/kaggle/input/whats-cooking/test.json.zip') as archive_test:
    with archive_test.open('test.json') as test_file:
        test_data = pd.read_json(test_file)

# output the first 5 rows
train_data.head()

In [None]:
# Show (rows, columns) for the training frame, then for the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# Count how many times each ingredient occurs across the training recipes.
ingredient_dict = {}
for recipe in train_data["ingredients"]:
    for item in recipe:
        ingredient_dict[item] = ingredient_dict.get(item, 0) + 1

In [None]:
# Vocabulary of distinct ingredients in first-seen order: all training
# recipes first, then the test recipes.
# dict.fromkeys de-duplicates while preserving insertion order and does a hash
# lookup per ingredient instead of scanning a growing list.
ingredient_list = list(dict.fromkeys(
    ingredient
    for frame in (train_data, test_data)
    for ingredients in frame["ingredients"]
    for ingredient in ingredients
))
            

In [None]:
# Number of distinct ingredients; used as the length of encoded input vectors.
ingredient_size = len(ingredient_list)
print(ingredient_size)

In [None]:
def encodeInput(input):
    """Encode a collection of ingredient names as a count vector.

    Position ``i`` of the result counts how many times ``ingredient_list[i]``
    appears in ``input``. Names are matched exactly, the same way the training
    matrix columns are addressed; a substring test would let 'salt' also
    activate 'sea salt'. Names not in ``ingredient_list`` are ignored.

    Args:
        input: iterable of ingredient name strings.

    Returns:
        1-D float array of length ``ingredient_size``.
    """
    input_array = np.zeros(ingredient_size)
    # Name -> column position, so each ingredient is a single hash lookup.
    column_of = {name: col for col, name in enumerate(ingredient_list)}
    for ingredient in input:
        col = column_of.get(ingredient)
        if col is not None:
            input_array[col] += 1
    return input_array
        

In [None]:
# Quick manual check of encodeInput on a hand-written ingredient collection.
myfood = {'salt', 'black olives', 'kimchi'}
encoded = encodeInput(myfood)
print(encoded)

In [None]:
# Ingredient-count matrix for the training recipes: one row per recipe, one
# column per entry of `ingredient_list`.
# The counts are accumulated in a NumPy array and wrapped in a DataFrame once;
# incrementing single DataFrame cells through `.loc` is far slower.
train_column_of = {name: col for col, name in enumerate(ingredient_list)}
train_counts = np.zeros((len(train_data), len(ingredient_list)), dtype=np.int64)

for row, ingredients in enumerate(train_data["ingredients"]):
    for ingredient in ingredients:
        train_counts[row, train_column_of[ingredient]] += 1

X_df = pd.DataFrame(train_counts, index=np.arange(len(train_data)), columns=ingredient_list)

In [None]:
# Ingredient-count matrix for the test recipes, with the same column layout
# as the training matrix (one column per entry of `ingredient_list`).
# The counts are accumulated in a NumPy array and wrapped in a DataFrame once;
# incrementing single DataFrame cells through `.loc` is far slower.
test_column_of = {name: col for col, name in enumerate(ingredient_list)}
test_counts = np.zeros((len(test_data), len(ingredient_list)), dtype=np.int64)

for row, ingredients in enumerate(test_data["ingredients"]):
    for ingredient in ingredients:
        test_counts[row, test_column_of[ingredient]] += 1

test_df = pd.DataFrame(test_counts, index=np.arange(len(test_data)), columns=ingredient_list)

In [None]:
# Distinct cuisines in the order they first appear in the training data.
cuisine_list = list(dict.fromkeys(train_data['cuisine']))

In [None]:
# Encode every training recipe's cuisine as its position in `cuisine_list`.
num_label = len(cuisine_list)
label_of = {name: idx for idx, name in enumerate(cuisine_list)}
Y = np.array([label_of[name] for name in train_data['cuisine']], dtype=float)


In [None]:
# test = X_df.loc[0]
# X = np.array(X_df)
# prob_vec = np.zeros(num_label)
# for label in range(num_label):
#     Y_converted = (Y==label)
#     lr = LogisticRegression(X, Y_converted, 0.1, 100)
#     lr.gradient_descent()
#     prob_vec[label] = lr.predict(test)

# print(prob_vec)

In [None]:
# test = X_df.loc[0]
# X = np.array(X_df)

# lr = LogisticRegression(X, Y_converted, 0.1, 100)
# lr.gradient_descent()
# print(lr.predict(test))

# Training

In [None]:
# Convert the count matrix to SciPy CSR form and fit one classifier per cuisine.
test = X_df.loc[0]
X = np.array(X_df)
X_csr = csr_matrix(X)

lr = LogisticRegression(
    X=X_csr,
    Y=Y,
    num_label=num_label,
    learning_rate=0.3,
    iter=2000,
    lmda=0.1,
)
W, costs = lr.fit()

#print(cuisine_list[np.argmax(lr.pridictOneVsAll(np.array(X_df.loc[7])))])

In [None]:
# Per-iteration training cost recorded for label index 10.
print(costs[10])

In [None]:
# Training-cost curve of a single classifier. Row 10 of `costs` belongs to
# label 10, i.e. `cuisine_list[10]`; the title names it so the figure stands
# on its own.
fig, ax = plot.subplots()
ax.plot(costs[10])
ax.set_title(f'Training cost for label 10 ({cuisine_list[10]})')
ax.set_xlabel('iteration')
ax.set_ylabel('cost');

In [None]:
# Score every test recipe against all cuisines and keep the best-scoring one.
test_array = np.array(test_df)
pred = lr.pridict(test_array)
pred_cuisine = [cuisine_list[best] for best in pred.argmax(axis=1)]

# Submission table: predicted cuisine indexed by recipe id.
submission = (
    pd.DataFrame({'cuisine': pred_cuisine})
    .assign(id=test_data['id'])
    .set_index("id")
)
submission.head()

In [None]:
# Learned weights as a labelled table: one row per cuisine, one column per ingredient.
W_Df = pd.DataFrame(W, index=cuisine_list, columns=ingredient_list)

# For each cuisine, the names of its ten highest-weighted ingredients.
W_ranking = pd.DataFrame(
    [
        W_Df.loc[cuisine].sort_values(ascending=False).iloc[:10].index
        for cuisine in W_Df.index
    ],
    index=cuisine_list,
)

In [None]:
# Row labels of the weight table (the cuisine names).
print(W_Df.index)

# Predict cuisine with ingredient input

In [None]:
def predict_cuisine(input):
    """Return the cuisine name whose classifier scores ``input`` highest.

    ``input`` is a collection of ingredient names; it is encoded with
    ``encodeInput`` and scored by the trained model ``lr``.
    """
    scores = lr.pridict(encodeInput(input))
    best = np.argmax(scores)
    return cuisine_list[best]

In [None]:
# Example: predict the cuisine for a hand-written ingredient list.
myfood = ['red pepper','garlic','pasta','tomatoes','salt']
pred = predict_cuisine(myfood)
print(pred)

In [None]:
# Second example with a different hand-written ingredient list.
myfood = ['udon','dashi stock powder','soy sauce','mirin','salt','sugar']
pred = predict_cuisine(myfood)
print(pred)

In [None]:
# Write the predictions to submission.csv in the working directory.
submission.to_csv('submission.csv')

In [None]:
# Persist the weight table and the per-cuisine ingredient ranking as CSV files.
W_Df.to_csv('W.csv')
W_ranking.to_csv('W_ranking.csv')