In [78]:
# import libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [79]:
def clean_data(train, test):
    # clean data
    train['ingredients_clean_string'] = [' , '.join(z).strip() for z in train['ingredients']]
    test['ingredients_clean_string'] = [' , '.join(z).strip() for z in test['ingredients']]
    
    # lemmatize ingredients  
    train['ingredients_string'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in train['ingredients']]       
    test['ingredients_string'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in test['ingredients']]  

In [92]:
def preprocessing():
    # split data into training and testing dataset
    data = pd.read_json("./data/train.json")
    train, test = train_test_split(data, test_size=0.2, random_state=4381)
    
    # ground truth
    true_val = test['cuisine']
    
    # call function to clean data
    clean_data(train, test)
    
    return train, test, true_val

In [93]:
def vectorize(train, test):
    # create corpa based clean data
    train_corpus = train['ingredients_string']
    test_corpus = test['ingredients_string']
    
    # convert ingredients to matrix of TF-IDF features
    # ngram_range = # of words in a sequence
    # max_df = max document frequency, ignore words that exceed this frequency
    # token pattern = regexp used, mandatory if analyzer='word'
    train_vectorizer = TfidfVectorizer(stop_words='english', ngram_range = ( 1 , 1 ),analyzer="word", max_df = .5, token_pattern=r'\w+')

    # return document term matrices fit on respective corpa
    train_tfidf = train_vectorizer.fit_transform(train_corpus).todense()
    test_tfidf = train_vectorizer.transform(test_corpus)
    
    return train_tfidf, test_tfidf

In [107]:
def logistic_regression(train_predictor, train_target, test_predictor, true_val):
    model = LogisticRegression()

    # process exhaustive search over specified parameter values for the model
    # do for num_folds
    num_folds = 10
    parameters = {'C':[1, 10]}
    classifier = GridSearchCV(model, parameters, cv=num_folds)

    # fit classification model to data
    classifier = classifier.fit(train_predictor,train_target)

    # make prediction
    prediction = classifier.predict(test_predictor)
    
    # test model accuracy
    print(accuracy_score(true_val, prediction))
    print(classification_report(true_val, prediction))

In [108]:
# run all functions
train, test, true_val = preprocessing()
train_predictor, test_predictor = vectorize(train, test)
train_target = train['cuisine']
logistic_regression(train_predictor, train_target, test_predictor, true_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See



0.7914519170333124
              precision    recall  f1-score   support

   brazilian       0.81      0.48      0.61        95
     british       0.54      0.45      0.49       150
cajun_creole       0.74      0.71      0.73       313
     chinese       0.83      0.87      0.85       554
    filipino       0.78      0.66      0.71       151
      french       0.60      0.61      0.60       540
       greek       0.83      0.74      0.78       224
      indian       0.87      0.91      0.89       596
       irish       0.61      0.48      0.54       128
     italian       0.81      0.89      0.85      1564
    jamaican       0.84      0.67      0.75       104
    japanese       0.85      0.74      0.79       255
      korean       0.86      0.79      0.82       160
     mexican       0.90      0.92      0.91      1298
    moroccan       0.81      0.75      0.78       159
     russian       0.64      0.46      0.53       102
 southern_us       0.70      0.79      0.74       887
     spa