In [1]:
import pandas as pd
import json
import re
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [2]:
# Import train data into a Python object (later cells index it by position and
# read the 'id', 'cuisine' and 'ingredients' keys of each record).
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the previous backslash form only resolved on Windows.
jsonfile = './Data/train/train.json'
# Decode explicitly as UTF-8 (the JSON interchange encoding) so that non-ASCII
# ingredient names do not depend on the platform's default encoding.
with open(jsonfile, encoding='utf-8') as train_json:
    json_dict = json.load(train_json)

In [3]:
# Converting train data into data frame (step 1): pull each field of the loaded
# records into its own list. The three lists are parallel, one entry per recipe.
sno = [recipe['id'] for recipe in json_dict]
cuisine = [recipe['cuisine'] for recipe in json_dict]
ingredients = [recipe['ingredients'] for recipe in json_dict]

In [4]:
# Assemble the parallel lists into the training frame.
# Column order: id, cuisine, ingredients.
train_columns = {
    'id': sno,
    'cuisine': cuisine,
    'ingredients': ingredients,
}
df = pd.DataFrame(train_columns)

In [5]:
# Lemmatize the ingredients field.
# For every recipe: replace each non-letter character with a space, pass each
# ingredient string through the WordNet lemmatizer, join the results with single
# spaces and strip the ends. The output is one text string per recipe.
# The lemmatizer and the compiled regex are created once here instead of once
# per ingredient; the produced strings are unchanged.
# NOTE(review): lemmatize() receives each ingredient string as a whole, so an
# ingredient made of several words is looked up as one unit rather than word by
# word — confirm this is intended. If it is changed, the BBC preprocessing cell
# must be changed the same way so train and test text stay comparable.
_lemmatizer = WordNetLemmatizer()
_non_letters = re.compile('[^A-Za-z]')
df['ingredients_mod'] = [
    ' '.join(
        _lemmatizer.lemmatize(_non_letters.sub(' ', line)) for line in lists
    ).strip()
    for lists in df['ingredients']
]

In [6]:
# Adding Veg/Non-Veg information into train data set.
# A recipe is labelled 'non-vegetarian' when its normalised ingredient text
# mentions any term below, otherwise 'vegetarian'.
# Terms are lower-case and matching is case-insensitive (the old list mixed a
# capitalised 'Katsuobushi' with a case-sensitive search); 'mackrel' was a typo
# for 'mackerel' and could never match the correctly spelled ingredient.
mylist = ['fish', 'goat', 'chicken', 'beef', 'pork', 'prawn', 'egg', 'katsuobushi',
          'mackerel', 'fillet', 'lamb', 'steak', 'salmon', 'shrimp', 'bacon', 'ham',
          'turkey', 'duck', 'seafood', 'squid']

# Short terms that occur inside unrelated words ('egg' in 'eggplant', 'ham' in
# 'graham') must match as whole words (optionally plural). Every other term keeps
# plain substring matching so compounds such as 'swordfish' are still caught.
# NOTE(review): 'hamburger' is no longer matched through 'ham' — add it to the
# list if it should count as meat.
whole_word_terms = {'egg', 'ham'}
pattern = '|'.join(
    r'\b{}s?\b'.format(re.escape(term)) if term in whole_word_terms else re.escape(term)
    for term in mylist
)

# Build the label column in one step. Overwriting a boolean column with strings
# through .loc relies on silent dtype upcasting, which pandas 2.1+ deprecates.
is_non_veg = df['ingredients_mod'].str.contains(pattern, case=False, regex=True, na=False)
df['veg'] = is_non_veg.map({True: 'non-vegetarian', False: 'vegetarian'})

In [7]:
# Removing the stop words from ingredients and vectorizing.
# TF-IDF over unigrams; a token is any run of word characters, English stop
# words are dropped, and max_df=.57 ignores terms whose document frequency
# exceeds 57% of the recipes.
tfidf_options = dict(
    stop_words='english',
    ngram_range=(1, 1),
    analyzer="word",
    max_df=.57,
    binary=False,
    token_pattern=r'\w+',
    sublinear_tf=False,
)
vectorizer = TfidfVectorizer(**tfidf_options)
train_tfidf = vectorizer.fit_transform(df['ingredients_mod'])

# Feature matrix and the two label vectors used by the later cells.
X = train_tfidf
Y = df['cuisine']  # cuisine label
Z = df['veg']      # vegetarian / non-vegetarian label

In [8]:
# Test and Train split for testing the models: 20% of the recipes are held out
# for evaluating the cuisine classifiers, with a fixed random_state for the split.
split_options = {'test_size': 0.2, 'random_state': 100}
X_train, X_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [9]:
# Naive Bayes: fit on the training split, predict the held-out split, then print
# accuracy and macro-averaged F1 / precision / recall as percentages, followed
# by the per-class report.
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

macro_scorers = (
    ("F1 Score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
)
print("Accuracy:", accuracy_score(y_test, y_pred) * 100)
for caption, scorer in macro_scorers:
    print(caption, scorer(y_test, y_pred, average="macro") * 100)
print(classification_report(y_test, y_pred))

Accuracy: 66.91389063482087
F1 Score: 45.424858003051035
Precision: 81.01351153935535
Recall: 41.910136887204814
              precision    recall  f1-score   support

   brazilian       0.94      0.17      0.29        87
     british       0.83      0.09      0.16       168
cajun_creole       0.75      0.53      0.62       288
     chinese       0.58      0.90      0.71       514
    filipino       0.80      0.11      0.20       142
      french       0.53      0.42      0.47       532
       greek       0.95      0.34      0.50       226
      indian       0.76      0.90      0.82       609
       irish       0.83      0.04      0.07       135
     italian       0.65      0.89      0.75      1588
    jamaican       1.00      0.15      0.26       115
    japanese       0.92      0.58      0.71       271
      korean       0.98      0.29      0.44       182
     mexican       0.81      0.92      0.86      1294
    moroccan       0.94      0.29      0.45       173
     russian       1.0

In [10]:
# Logistic Regression with default settings: fit on the training split, predict
# the held-out split and print the same metrics as the other models
# (accuracy plus macro-averaged F1 / precision / recall, all as percentages).
logisticReg = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator
y_pred = logisticReg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred) * 100)
for heading, metric_fn in (
    ("F1 Score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    macro_value = metric_fn(y_test, y_pred, average="macro")
    print(heading, macro_value * 100)
print(classification_report(y_test, y_pred))

Accuracy: 77.57385292269014
F1 Score: 69.16154452119306
Precision: 77.7810333107795
Recall: 64.48408468066657
              precision    recall  f1-score   support

   brazilian       0.81      0.39      0.53        87
     british       0.73      0.35      0.47       168
cajun_creole       0.74      0.69      0.71       288
     chinese       0.77      0.86      0.81       514
    filipino       0.76      0.56      0.65       142
      french       0.59      0.61      0.60       532
       greek       0.81      0.64      0.71       226
      indian       0.88      0.89      0.89       609
       irish       0.76      0.41      0.54       135
     italian       0.77      0.90      0.83      1588
    jamaican       0.96      0.67      0.79       115
    japanese       0.84      0.69      0.76       271
      korean       0.86      0.71      0.78       182
     mexican       0.89      0.93      0.91      1294
    moroccan       0.83      0.76      0.80       173
     russian       0.69  

In [11]:
# SVM: linear support-vector classifier (C=1) fitted on the training split and
# scored on the held-out split with the same metrics as the other models.
lsvc_clf = svm.LinearSVC(C=1)
lsvc_clf.fit(X_train, y_train)
y_pred = lsvc_clf.predict(X_test)

accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy:", accuracy_pct)
f1_pct = f1_score(y_test, y_pred, average="macro") * 100
print("F1 Score:", f1_pct)
precision_pct = precision_score(y_test, y_pred, average="macro") * 100
print("Precision:", precision_pct)
recall_pct = recall_score(y_test, y_pred, average="macro") * 100
print("Recall:", recall_pct)
print(classification_report(y_test, y_pred))

Accuracy: 78.70521684475172
F1 Score: 71.12273714075951
Precision: 75.21561421683064
Recall: 68.30101874617486
              precision    recall  f1-score   support

   brazilian       0.81      0.55      0.66        87
     british       0.68      0.46      0.55       168
cajun_creole       0.71      0.70      0.71       288
     chinese       0.79      0.87      0.82       514
    filipino       0.64      0.58      0.61       142
      french       0.61      0.63      0.62       532
       greek       0.78      0.70      0.74       226
      indian       0.87      0.88      0.87       609
       irish       0.65      0.50      0.57       135
     italian       0.81      0.89      0.85      1588
    jamaican       0.92      0.72      0.81       115
    japanese       0.84      0.73      0.78       271
      korean       0.81      0.76      0.78       182
     mexican       0.91      0.93      0.92      1294
    moroccan       0.80      0.79      0.79       173
     russian       0.59 

In [12]:
# Importing the BBC data which is to be classified.
# Forward slashes keep the relative path valid on every OS (the backslash form
# only resolved on Windows).
file = './Data/test/test.csv'
df_test = pd.read_csv(file, encoding='ISO-8859-1')

# Lemmatizing the ingredients column and vectorizing.
# Each row's 'Ingredients' text has its non-letter characters replaced by spaces
# and is passed through the WordNet lemmatizer, then encoded with the vectorizer
# that was already fitted on the training text (transform only, no refit).
# - A missing 'Ingredients' value is treated as empty text; previously the NaN
#   reached re.sub and raised a TypeError.
# - The lemmatizer is created once instead of once per row.
# NOTE(review): lemmatize() receives the whole ingredients text of a recipe as a
# single string — confirm that is the intended granularity.
_test_lemmatizer = WordNetLemmatizer()
df_test['ingredients_mod'] = [
    _test_lemmatizer.lemmatize(re.sub('[^A-Za-z]', ' ', line))
    for line in df_test['Ingredients'].fillna('')
]
test_tfidf = vectorizer.transform(df_test['ingredients_mod'])

In [13]:
# Using SVM to classify the BBC dataset: one LinearSVC (C=1) per target, each
# trained on the full training matrix X, then applied to the BBC TF-IDF features.
lsvc_cuisine = svm.LinearSVC(C=1)
lsvc_veg = svm.LinearSVC(C=1)

# Fit in the same order as before: cuisine labels first, then veg/non-veg labels.
for model, target in ((lsvc_cuisine, Y), (lsvc_veg, Z)):
    model.fit(X, target)

Cuisine_pred = lsvc_cuisine.predict(test_tfidf)
Type_pred = lsvc_veg.predict(test_tfidf)

In [14]:
# Attach both predictions to the BBC frame, then assemble the presentation table
# with reader-friendly column names (predicted labels are capitalised).
df_test['cuisine'] = Cuisine_pred
df_test['Veg-NonVeg'] = Type_pred

result_columns = {
    'Recipe Image': df_test['Recipe-image-src'],
    'Recipe': df_test['Recipe'],
    'Cuisine': df_test['cuisine'].str.capitalize(),
    'Veg-NonVeg': df_test['Veg-NonVeg'].str.capitalize(),
    'Cook Time': df_test['cook time'],
    'Effort': df_test['Effort'],
    'Ingredients': df_test['Ingredients'],
    'Cooking Method': df_test['Recipe-method'],
    'Recipe URL': df_test['Recipe-url'],
}
Result = pd.DataFrame(result_columns)
print(Result)

                                          Recipe Image  \
0    //www.bbcgoodfood.com/sites/default/files/styl...   
1    //www.bbcgoodfood.com/sites/default/files/styl...   
2    //www.bbcgoodfood.com/sites/default/files/styl...   
3    //www.bbcgoodfood.com/sites/default/files/styl...   
4    //www.bbcgoodfood.com/sites/default/files/styl...   
5    //www.bbcgoodfood.com/sites/default/files/styl...   
6    //www.bbcgoodfood.com/sites/default/files/styl...   
7    //www.bbcgoodfood.com/sites/default/files/styl...   
8    //www.bbcgoodfood.com/sites/default/files/styl...   
9    //www.bbcgoodfood.com/sites/default/files/styl...   
10   //www.bbcgoodfood.com/sites/default/files/styl...   
11   //www.bbcgoodfood.com/sites/default/files/styl...   
12   //www.bbcgoodfood.com/sites/default/files/styl...   
13   //www.bbcgoodfood.com/sites/default/files/styl...   
14   //www.bbcgoodfood.com/sites/default/files/styl...   
15   //www.bbcgoodfood.com/sites/default/files/styl...   
16   //www.bbc

In [15]:
# Export the results to output.csv with no line terminator after the last row.
# The CSV text is rendered in memory and the final terminator is trimmed by hand,
# so the cell no longer depends on to_csv's `line_terminator` keyword (renamed to
# `lineterminator` in pandas 1.5 and rejected by pandas 2.x).
data1 = Result.iloc[:-1]  # every row except the last
data2 = Result.iloc[-1:]  # the last row; empty (no IndexError) when Result has no rows
csv_text = data1.to_csv(index=False) + data2.to_csv(index=False, header=False)

# Drop exactly one trailing terminator, whichever form pandas emitted.
for terminator in ('\r\n', '\n'):
    if csv_text.endswith(terminator):
        csv_text = csv_text[:-len(terminator)]
        break

# newline='' stops Python from translating the terminators pandas already wrote.
with open('output.csv', 'w', newline='', encoding='utf-8') as out_file:
    out_file.write(csv_text)