In [10]:
# import libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Remi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [None]:
def _lemmatize_ingredient_lists(ingredient_lists, lemmatizer):
    """Turn each recipe's ingredient list into one lemmatized, letters-only string.

    Parameters
    ----------
    ingredient_lists : iterable of list of str
        One list of raw ingredient names per recipe.
    lemmatizer : object with a ``lemmatize(word)`` method
        Shared lemmatizer instance, reused for every word.

    Returns
    -------
    list of str
        One space-separated string of lemmatized words per recipe.
    """
    recipe_strings = []
    for ingredients in ingredient_lists:
        words = []
        for ingredient in ingredients:
            # Replace every non-letter (digits, '%', '-', ...) with a space.
            letters_only = re.sub('[^A-Za-z]', ' ', ingredient)
            # lemmatize() works on a single word, so split multi-word
            # ingredients ("black olives") and lemmatize word by word;
            # handing it the whole phrase leaves the phrase unchanged.
            words.extend(lemmatizer.lemmatize(word) for word in letters_only.split())
        recipe_strings.append(' '.join(words))
    return recipe_strings


def clean_data(train, test):
    """Add cleaned ingredient text columns to ``train`` and ``test`` in place.

    Both frames must have an ``ingredients`` column holding a list of
    ingredient strings per row. Two columns are added to each frame:

    * ``ingredients_clean_string`` -- the raw ingredients joined with ' , '.
    * ``ingredients_string`` -- letters-only, lemmatized words joined with
      single spaces.

    The frames are mutated; nothing is returned.
    """
    # One lemmatizer for the whole run instead of one per ingredient.
    lemmatizer = WordNetLemmatizer()

    for frame in (train, test):
        frame['ingredients_clean_string'] = [' , '.join(items).strip()
                                             for items in frame['ingredients']]
        frame['ingredients_string'] = _lemmatize_ingredient_lists(frame['ingredients'],
                                                                  lemmatizer)

In [11]:
def preprocessing(train_path="./data/train.json", test_path="./data/test.json"):
    """Load the train/test JSON files and return their cleaned text corpora.

    Parameters
    ----------
    train_path : str, optional
        Location of the training JSON file (defaults to ``./data/train.json``).
    test_path : str, optional
        Location of the test JSON file (defaults to ``./data/test.json``).

    Returns
    -------
    tuple
        ``(train_corpus, test_corpus)`` -- the ``ingredients_string`` column
        of each frame after ``clean_data`` has run.

    Notes
    -----
    The loaded DataFrames themselves are local to this function and are NOT
    returned; only the two text columns are handed back to the caller.
    """
    # JSON to dataframe
    train = pd.read_json(train_path)
    test = pd.read_json(test_path)

    # clean_data adds the 'ingredients_string' column to both frames in place
    clean_data(train, test)

    return train['ingredients_string'], test['ingredients_string']

In [12]:
# Load both datasets and keep one cleaned ingredient string per recipe.
(train_corpus, test_corpus) = preprocessing()

            cuisine     id                                        ingredients  \
0             greek  10259  [romaine lettuce, black olives, grape tomatoes...   
1       southern_us  25693  [plain flour, ground pepper, salt, tomatoes, g...   
2          filipino  20130  [eggs, pepper, salt, mayonaise, cooking oil, g...   
3            indian  22213                [water, vegetable oil, wheat, salt]   
4            indian  13162  [black pepper, shallots, cornflour, cayenne pe...   
5          jamaican   6602  [plain flour, sugar, butter, eggs, fresh ginge...   
6           spanish  42779  [olive oil, salt, medium shrimp, pepper, garli...   
7           italian   3735  [sugar, pistachio nuts, white almond bark, flo...   
8           mexican  16903  [olive oil, purple onion, fresh pineapple, por...   
9           italian  12734  [chopped tomatoes, fresh basil, garlic, extra-...   
10          italian   5875  [pimentos, sweet pepper, dried oregano, olive ...   
11          chinese  45887  

In [13]:


     



# Convert the cleaned ingredient strings to a matrix of TF-IDF features.
train_vectorizer = TfidfVectorizer(stop_words='english',
                                   ngram_range=(1, 1), analyzer="word",
                                   max_df=.57, binary=False, token_pattern=r'\w+',
                                   sublinear_tf=False)
# NOTE(review): not used below. The test corpus has to be transformed with the
# vectorizer fitted on the training corpus so both share one vocabulary.
test_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary / IDF weights on the training corpus only, then reuse
# them for the test corpus. Both matrices stay sparse: densifying the training
# matrix with .todense() yields a huge np.matrix, which recent scikit-learn
# versions refuse as estimator input.
train_tfidf = train_vectorizer.fit_transform(train_corpus)
test_tfidf = train_vectorizer.transform(test_corpus)

# prepare data for prediction
train_predictor = train_tfidf
test_predictor = test_tfidf

# preprocessing() returns only the text corpora, so the labels and ids are
# re-read here; relying on `train` / `test` leaking from an earlier kernel
# session fails with NameError on Restart & Run All. Row order matches the
# corpora because they were read from the same files.
train = pd.read_json("./data/train.json")
test = pd.read_json("./data/test.json")
train_target = train['cuisine']

# Logistic regression, with the regularisation strength C chosen by an
# exhaustive cross-validated search over the listed values.
model = LogisticRegression()
parameters = {'C': [1, 10]}
classifier = GridSearchCV(model, parameters)

# fit classification model to data
classifier = classifier.fit(train_predictor, train_target)

# make prediction
prediction = classifier.predict(test_predictor)

# assign predicted values to cuisine in TEST set
test['cuisine'] = prediction

# write csv file (no index for submission)
test[['id', 'cuisine']].to_csv("LogisticRegression.csv", index=False)



