In [1]:
import pandas as pd

In [2]:
# Location of the training data, relative to the current working directory.
train_path = './train.json'
data = pd.read_json(train_path)

# Widen the column display so long ingredient lists are not cut off early.
# The fully qualified option name is used on purpose: pandas resolves
# abbreviated names by pattern matching, which can break if a future
# version adds another option with a similar name.
pd.set_option("display.max_colwidth", 200)

data.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles]"
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, ground black pepper, thyme, eggs, green tomatoes, yellow corn meal, milk, vegetable oil]"
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, green chilies, grilled chicken breasts, garlic powder, yellow onion, soy sauce, butter, chicken livers]"
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pepper, onions, garlic paste, milk, butter, salt, lemon juice, water, chili powder, passata, oil, ground cumin, boneless chicken skinless thigh, garam m..."


In [3]:
from sklearn.model_selection import train_test_split

# The 'id' column is not used as a feature. errors='ignore' keeps this cell
# idempotent: `data` is overwritten here, so without it a second run of the
# cell would raise KeyError because 'id' has already been removed.
data = data.drop(columns=['id'], errors='ignore')

X = data['ingredients']
y = data['cuisine']
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [4]:
from sklearn.base import BaseEstimator, ClassifierMixin

class BaselineClassifier(BaseEstimator, ClassifierMixin):
    """Constant-prediction baseline: every sample receives the same label.

    Parameters
    ----------
    label : str, default="italian"
        The class predicted for every sample. The default keeps the
        behaviour of the original hard-coded baseline.
    """

    def __init__(self, label="italian"):
        # Stored under the same name as the constructor argument so that
        # BaseEstimator.get_params / set_params / clone keep working.
        self.label = label

    def fit(self, X, y):
        """Do nothing (the prediction is constant) and return ``self``.

        Returning the estimator follows the scikit-learn convention and
        allows chaining such as ``BaselineClassifier().fit(X, y).predict(X)``.
        """
        return self

    @staticmethod
    def _n_samples(X):
        """Return the number of samples in ``X``.

        ``shape[0]`` is preferred because sparse matrices do not support
        ``len()``; plain sequences without ``shape`` fall back to ``len()``.
        """
        shape = getattr(X, "shape", None)
        return shape[0] if shape is not None else len(X)

    def predict(self, X):
        """Return a list holding ``self.label`` once per sample of ``X``."""
        return [self.label] * self._n_samples(X)

    def score(self, X, y):
        """Return the fraction of samples whose true label equals the prediction.

        ``y`` may be any iterable of labels (list, Series, array).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        ZeroDivisionError
            If ``X`` contains no samples.
        """
        n_samples = self._n_samples(X)
        true_labels = list(y)
        if len(true_labels) != n_samples:
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{n_samples} != {len(true_labels)}"
            )
        # Compare element by element; comparing two plain lists with `==`
        # would produce a single bool rather than per-sample matches.
        hits = sum(
            1
            for predicted, actual in zip(self.predict(X), true_labels)
            if predicted == actual
        )
        return hits / n_samples

In [5]:
baseline = BaselineClassifier()

# Report the error rate (1 - accuracy) of the constant baseline on each split.
train_error = 1 - baseline.score(X_train, y_train)
test_error = 1 - baseline.score(X_test, y_test)
print(train_error)
print(test_error)

0.8029164964329489
0.8030169704588309


In [6]:
from collections import Counter

# Tally how often each ingredient occurs across every recipe in X.
ingredients_counter = Counter()
for ingredients_list in X:
    ingredients_counter.update(ingredients_list)

print("Broj jedinstvenih sastojaka", len(ingredients_counter))

Broj jedinstvenih sastojaka 6714


In [7]:
# I will need a matrix of size len(train) x number_of_unique_ingredients -> 39774 x 6714 = 267042636 ~ 2.7 * 10^8

In [8]:
# Map each distinct ingredient to a column index, following the counter's iteration order.
ingredientToInd = {ingredient: column for column, ingredient in enumerate(ingredients_counter)}

from scipy.sparse import lil_matrix

def create_cnt_matrix(ingredients_data, vocabulary=None):
    """Encode recipes as a sparse boolean ingredient-indicator matrix.

    Parameters
    ----------
    ingredients_data : sized iterable of iterables of str
        One collection of ingredient names per recipe.
    vocabulary : dict mapping str to int, optional
        Maps an ingredient name to its column index. When omitted, the
        notebook-level ``ingredientToInd`` mapping is used.

    Returns
    -------
    scipy.sparse.lil_matrix
        Matrix of shape ``(len(ingredients_data), len(vocabulary))`` with
        dtype ``bool``; entry ``[i, j]`` is True when recipe ``i`` contains
        the ingredient mapped to column ``j``.

    Notes
    -----
    Ingredients missing from the vocabulary are skipped instead of raising
    ``KeyError``, so data containing unseen ingredients can still be encoded.
    """
    if vocabulary is None:
        vocabulary = ingredientToInd

    cnt_matrix = lil_matrix((len(ingredients_data), len(vocabulary)), dtype=bool)

    for i, row in enumerate(ingredients_data):
        for ingredient in row:
            column = vocabulary.get(ingredient)
            if column is not None:
                cnt_matrix[i, column] = True

    return cnt_matrix

# Encode the training split as a sparse boolean ingredient-indicator matrix.
cnt_matrix_train = create_cnt_matrix(X_train)

In [9]:
%%time

from sklearn.ensemble import RandomForestClassifier

# Forest of 1000 shallow trees (depth <= 5), each split considering 1000
# candidate features. n_jobs=-2 uses all CPU cores but one.
# random_state pins the bootstrap sampling and feature selection so the
# reported scores are reproducible after Restart & Run All.
classifier = RandomForestClassifier(max_depth=5, n_estimators=1000, max_features=1000,
                                    n_jobs=-2, random_state=42)

classifier.fit(cnt_matrix_train, y_train)

CPU times: user 1min 13s, sys: 161 ms, total: 1min 13s
Wall time: 11.2 s


RandomForestClassifier(max_depth=5, max_features=1000, n_estimators=1000,
                       n_jobs=-2)

In [10]:
%%time

# Accuracy of the fitted forest on the data it was trained on.
classifier.score(cnt_matrix_train, y_train)

CPU times: user 15 s, sys: 197 ms, total: 15.2 s
Wall time: 3.47 s


0.4061724127093875

In [11]:
%%time

# Encode the held-out split with the same ingredient columns, then score it.
cnt_matrix_test = create_cnt_matrix(X_test)
classifier.score(cnt_matrix_test, y_test)

CPU times: user 3.52 s, sys: 88.7 ms, total: 3.61 s
Wall time: 1.2 s


0.40150848522941546