In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
# WordNetLemmatizer (used by getWords below) looks words up in the WordNet
# corpus; without this download the first lemmatize() call raises LookupError
# on a machine that has never fetched it.
nltk.download('wordnet')
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


import sys
# Raise NumPy's summarisation threshold to the largest possible value so that
# arrays are always printed in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/snehgajiwala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Text-cleaning helper: turns raw item names into normalised, lemmatised strings
def getWords(X, lemmatizer=None):
    """Normalise every entry of ``X`` into a cleaned, lower-cased, lemmatised string.

    For each entry: non-word characters become spaces, isolated single letters
    (in the middle of the text or at its very start) are dropped, runs of
    whitespace are collapsed, the text is lower-cased, and each remaining
    token is lemmatised before the tokens are re-joined with single spaces.

    Parameters
    ----------
    X : iterable
        Raw texts (list, tuple, pandas Series, generator, ...). Every element
        is converted with ``str()`` before cleaning.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word) -> str``. When omitted, a new
        ``WordNetLemmatizer()`` is created (requires the NLTK ``wordnet``
        corpus to be available).

    Returns
    -------
    list of str
        One cleaned string per input element, in input order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    documents = []

    # Iterate over the values themselves instead of X[0..len-1]: integer
    # lookups on a pandas Series are label-based and break on a non-default
    # index, and plain iteration also supports generators.
    for raw in X:
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(raw))

        # Remove single characters surrounded by whitespace
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove a single character at the start of the text. The caret must
        # be an unescaped anchor; an escaped '\^' would look for a literal '^',
        # which can never be present after the \W substitution above.
        document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

        # Substitute multiple spaces with a single space
        document = re.sub(r'\s+', ' ', document)

        # Remove a leading standalone 'b' (e.g. left over from str() of bytes)
        document = re.sub(r'^b\s+', '', document)

        # Lower-case, tokenise on whitespace, lemmatise and re-join
        tokens = document.lower().split()
        documents.append(' '.join(lemmatizer.lemmatize(word) for word in tokens))

    return documents

In [3]:
# Load the menu dataset, report its dimensions, then preview the first rows
# (food items) and the last rows (drink items)
data = pd.read_csv("Menu.csv")
print(data.shape)
display(data.head(), data.tail())

(998, 2)


Unnamed: 0,Item Name,Category
0,L-Cup Soup,Food
1,L-Bowl Soup,Food
2,L-Soup/Salad,Food
3,L-Grace's Sal,Food
4,inn salad,Food


Unnamed: 0,Item Name,Category
993,**HH Cab,Drink
994,CAB Ladera,Drink
995,SAUV BL Joel Got,Drink
996,CAB Joel Gott815,Drink
997,Bousquet Rose,Drink


In [4]:
# Separate the label column from the remaining (item name) column
Categories = data['Category']
Item_Names = data.drop(columns='Category')

In [5]:
# Clean and lemmatise the raw training item names
raw_item_names = Item_Names['Item Name']
documents_train = getWords(raw_item_names)

In [6]:
# Feature extractors: a TF-IDF re-weighting step and the term-count
# vectorizer that feeds it (a term only needs to occur in one document)
tfidfconverter = TfidfTransformer()
vectorizer = CountVectorizer(min_df=1)

In [7]:
# Fit the vectorizer and the TF-IDF transformer on the cleaned training names.
# The term counts stay sparse: TfidfTransformer accepts sparse input, so only
# the final TF-IDF matrix needs to be densified.
# NOTE: Item_Names is rebound here from the DataFrame to a dense feature array,
# so the earlier cell that reads Item_Names['Item Name'] cannot be re-run
# afterwards without first re-running the cell that builds Item_Names from `data`.
item_counts = vectorizer.fit_transform(documents_train)
Item_Names = tfidfconverter.fit_transform(item_counts).toarray()

In [8]:
# Model inputs are the TF-IDF item-name features; model targets are the categories
X_train, y_train = Item_Names, Categories

In [9]:
# Create the classifier and train the model.
# A fixed random_state makes the forest (and therefore the accuracy reported
# further down) reproducible across kernel restarts; random_state=None drew a
# fresh seed on every run.
classifier = RandomForestClassifier(n_estimators=10, random_state=42)
classifier.fit(X_train, y_train);  # trailing ';' hides the estimator repr

In [10]:
# Small hand-labelled test set used to check the accuracy of the model.
# Each item sits next to its expected category so the two lists cannot drift apart.
# NOTE: 'Bousquet Rose' also appears in the training data (row 997 of Menu.csv).
labelled_test_items = [
    ('Lite Beer', 'Drink'),
    ('Chicken Pasta', 'Food'),
    ('Sour Whiskey', 'Drink'),
    ('Bloody Mary', 'Drink'),
    ('Cobb Salad', 'Food'),
    ('Meatball Spaghetti', 'Food'),
    ('Screwdriver', 'Drink'),
    ('Beef Rice', 'Food'),
    ('Spl Taco', 'Food'),
    ('Long Island Ice Tea', 'Drink'),
    ('Miller Lite', 'Drink'),
    ('Cheesecake', 'Food'),
    ('Bousquet Rose', 'Drink'),
]
test_items = [name for name, _ in labelled_test_items]
test_items_categories = [category for _, category in labelled_test_items]

In [11]:
# Clean and lemmatise the raw test item names with the same helper used for training
documents_test = getWords(X=test_items)

In [12]:
# Project the cleaned test names into the feature space learned from the
# training data (transform only - nothing is re-fitted here).
# The term counts stay sparse; only the final TF-IDF matrix is densified.
# NOTE: test_items is rebound here from the list of raw strings to a feature
# array, so the cell that calls getWords(test_items) must not be re-run after
# this one without first re-creating the list.
test_counts = vectorizer.transform(documents_test)
test_items = tfidfconverter.transform(test_counts).toarray()

In [13]:
# Ask the trained forest for a category for every test item
predicted_categories = classifier.predict(X=test_items)

In [14]:
# Fraction of test items whose predicted category matches the hand-assigned one
test_accuracy = accuracy_score(test_items_categories, predicted_categories)
print(test_accuracy)

0.9230769230769231
