In [1]:
import pandas as pd
import numpy as np

# Deep learning
import torch
import torch.nn as nn

# NLP imports
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# viz
import matplotlib.pyplot as plt
import seaborn as sns

# misc imports
from ast import literal_eval
import glob
import pickle

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Provide root path to dataset
dataset_root = '/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/NER/'

### Data Pre processing

In [4]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Add Kaggle validation dataset
file_list = glob.glob(dataset_root + '/Kaggle/kaggle_subsample_*.csv')

dfs = []
for file in file_list:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
val_reviews_df = pd.concat(dfs, ignore_index=True)
# filter relevant columns
val_reviews_df = val_reviews_df[['Restaurant', 'Review', 'Rating', 'ner_results']]
# Drop reviews with missing annotations
val_reviews_df = val_reviews_df.dropna(subset=['ner_results', 'Review'])
val_reviews_df.head()

Unnamed: 0,Restaurant,Review,Rating,ner_results
0,PourHouse7,Pocket-friendly place for a quick round of dri...,3.0,"{'food': ['Chicken', 'Fish', 'veg'], 'drink': ..."
1,PourHouse7,Visited the clubbing section of the resturant ...,5.0,"{'food': ['Chinken Nazakat', 'butter garlic pr..."
2,PourHouse7,Went for a friend’s birthday celebration. The ...,4.0,"{'food': ['veg white sauce pasta', 'biryani'],..."
4,PourHouse7,Ambience was great.. food was good too.. servi...,5.0,"{'food': ['food was good too'], 'drink': ['dri..."
5,PourHouse7,"Loud music, average food and untrained staff s...",2.0,"{'food': ['veg hakka noodle', 'special pizza']..."


In [6]:
# Function to convert text into (word, pos_tag, ner_label) tuples
def convert_text_to_tuples(text, ner_results):

    # print(f"review: {text}")
    # print(f"NER: {ner_results}")

    # lowercasing text
    text = text.lower()
    # Tokenize the text
    words = word_tokenize(text, language='english')
    # removing punctuation
    words = [word for word in words if word not in string.punctuation]

    # stop word removal
    stop_words = stopwords.words('english')
    words = [word for word in words if word not in stop_words]


    # Get POS tags for the words
    pos_tags = pos_tag(words)

    # Initialize list to store (word, pos_tag, ner_label) tuples
    word_tuples = []

    # Convert ner_results from string to dictionary
    ner_results_dict = literal_eval(ner_results)
    if type(ner_results_dict) != dict:
        print(f"caught unexpected NER type: {ner_results}")
        return word_tuples

    # Check only ('food', 'drink', 'None') exist in NER lables
    valid_NER_Lables = ['food', 'drink', 'None']
    for lable in ner_results_dict.keys():
        if lable not in valid_NER_Lables:
            # print(ner_results_dict)
            return word_tuples

    # Iterate through words and POS tags
    i = 0
    while i < len(words):
        # Check if the word is a named entity (food/drink item)
        ner_label = None
        for label, entities in ner_results_dict.items():
            # catch unexpected NER entities
            try:
                # lower case valid food/drink terms for word match
                entities_lower = [item.lower() for item in entities]
            except:
                print(f"caught unexpected NER: {ner_results}")
                return word_tuples

            # loop through each valid food/drink item
            for food_drink_item in entities:
                # check if given word is part of food/drink item
                possible_food_drink = ' '.join(words[i:i+len(food_drink_item.split())]).lower()
                # print(f"possible food/drink: {possible_food_drink}")
                if possible_food_drink in entities_lower:
                    # print(f"found a match {possible_food_drink}")
                    match_food_drink = possible_food_drink.split()
                    ner_label = label
                    # Add (word, pos_tag, ner_label) tuple for each word in the named entity
                    for j in range(len(match_food_drink)):
                        word_tuples.append((words[i+j], pos_tags[i+j][1], ner_label))
                    i += len(match_food_drink) - 1  # Skip the next words as they are part of the named entity
                    break
        if ner_label is None:
            # If not a named entity, add (word, pos_tag, None) tuple
            word_tuples.append((words[i], pos_tags[i][1], 'None'))
        i += 1
    return word_tuples


In [7]:
# Apply above to validation set!
val_reviews_df['word_tuples'] = val_reviews_df.apply(lambda row: convert_text_to_tuples(row['Review'], row['ner_results']), axis=1)

# remove empty word tuples
val_reviews_df = val_reviews_df[val_reviews_df['word_tuples'].apply(lambda x: len(x) != 0)]


### Feature Engineering

In [8]:
# Load word, tag lookups
with open(dataset_root + '/Model/word2idx.pickle', 'rb') as f:
    word2idx = pickle.load(f)

with open(dataset_root + '/Model/tag2idx.pickle', 'rb') as f:
    tag2idx = pickle.load(f)

with open(dataset_root + '/Model/idx2word.pickle', 'rb') as f:
    idx2word = pickle.load(f)

with open(dataset_root + '/Model/idx2tag.pickle', 'rb') as f:
    idx2tag = pickle.load(f)

In [9]:
# Get reviews in only (word, pos_tag, ner_tag) format from train/test set only
reviews_text = val_reviews_df['word_tuples'].values


In [10]:
# Convert words to indices for X
X_var = [[word2idx[w[0]] for w in s] for s in reviews_text]
tensor_reviews = [torch.as_tensor(x) for x in X_var]

### LSTM Model

In [11]:
class NER_LSTM(nn.Module):
    def __init__(self, input_dim, output_dim, embedding_dim=128, hidden_dim=100, dropout=0.1):
        super(NER_LSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # create embeddings (vector representations for each word in text)
        # of dimesion: embedding_dim for num_words words a.k.a input_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # dropout layer during training
        self.dropout = nn.Dropout(p=dropout)

        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=dropout)

        # Fully connected layer
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # sentence shape: [sentence_length, batch_size]
        embedded = self.dropout(self.embedding(text))

        # LSTM input shape should be (sequence_length, batch_size, input_size)
        lstm_out, (hidden, cell) = self.lstm(embedded)

        #print(f"hidden layer shape: {hidden.shape}")
        #print(f"lstm_out layer shape: {lstm_out.shape}")
        #print(f"hidden 0 layer shape: {hidden[0].shape}")

        # Fully connected layer
        # predictions = self.linear(self.dropout(lstm_out))
        predictions = self.linear(lstm_out)

        return predictions

### Inference

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [13]:
torch.manual_seed(0)

# Load the trained model
if torch.cuda.is_available():
    model = torch.load(dataset_root + '/Model/entire_model.pt')
else:
    model = torch.load(dataset_root + '/Model/entire_model.pt', map_location=torch.device('cpu'))

# Load the model state dictionary
#loaded_model = NER_LSTM(45049, 3)
#loaded_model.load_state_dict(torch.load(dataset_root + 'Model/model.pth'))

model.to(device)

NER_LSTM(
  (embedding): Embedding(45049, 128)
  (dropout): Dropout(p=0.4, inplace=False)
  (lstm): LSTM(128, 100, batch_first=True, dropout=0.4)
  (linear): Linear(in_features=100, out_features=3, bias=True)
)

In [14]:
# perform NER inference
def Extract_NER(review_id, model, verbose_log = False):
    review_processed = []
    for word_id in tensor_reviews[review_id]:
        idx = word_id.item()
        review_processed.append(idx2word[idx])

    if verbose_log:
        print("Original Review:")
        print(val_reviews_df.iloc[review_id].Review)
        print("\n")
        print("Filtered Review:")
        print(" ".join(review_processed))
        print("\n")

    # eval model
    model.eval()

    # NER tagging
    predictions = model(tensor_reviews[review_id])
    # get class with highest probability
    preds_softmax = torch.log_softmax(predictions, dim = 1)
    pred_tags = torch.argmax(preds_softmax, dim = 1)

    result = {'food':[], 'drink':[]}

    for i, tag_id in enumerate(pred_tags):
        idx = tag_id.item()
        tag = idx2tag[idx]
        # print(tag)
        word_ind = tensor_reviews[review_id][i].item()
        if tag == 'food':
            if verbose_log:
                print(f"found food: {idx2word[word_ind]}")
            result['food'].append(idx2word[word_ind])
        elif tag == 'drink':
            if verbose_log:
                print(f"found drink: {idx2word[word_ind]}")
            result['drink'].append(idx2word[word_ind])

    if verbose_log:
        print("===============================\n")

    return result

# Get Top Dishes for a restaurant
def TopNDish(restaurant, model):
    filt_df = val_reviews_df[(val_reviews_df["Restaurant"] == restaurant)]
    food_score = {}
    drink_score = {}
    for idx in filt_df.index:
        # extract tags for given review
        ner_tags = Extract_NER(idx, model)
        # update food score
        for food in ner_tags['food']:
            if food in food_score:
                food_score[food] += 1
            else:
                food_score[food] = 1

        # update drink score
        for drink in ner_tags['drink']:
            if drink in drink_score:
                drink_score[drink] += 1
            else:
                drink_score[drink] = 1
    return { 'food_score' : food_score, 'drink_score' : drink_score}

In [15]:
# Get top 5 mentioned food items for Restaurant: Arena Eleven
dishes = TopNDish("Arena Eleven", model)

# sort food by count
top_food = sorted(dishes['food_score'].items(), key=lambda x: x[1], reverse=True)[:5]
# sort drink by count
top_drinks = sorted(dishes['drink_score'].items(), key=lambda x: x[1], reverse=True)[:5]

print("Top 5 mentioned Dishes:")
for dish, count in top_food:
    print(f"{dish}: {count}")

print("\nTop 5 mentioned Drinks:")
for drink, count in top_drinks:
    print(f"{drink}: {count}")

Top 5 mentioned Dishes:
donuts: 49
burger: 14
chicken: 8
burgers: 8
chocolate: 7

Top 5 mentioned Drinks:
coffee: 15
lemonade: 1
iced: 1
tea: 1


In [16]:
most_reviews_df = val_reviews_df[val_reviews_df["Restaurant"] == "Arena Eleven"]
# Get Reviews from Arena Eleven Restaurant and extract NER labels

for idx in most_reviews_df.index[30:50]:
    Extract_NER(idx, model, verbose_log=True)

Original Review:
Good


Filtered Review:
good



Original Review:
I've always loved Dunkin donuts.
We stopped by for a quick grab. Ordered for a combo. Cappuccino, cold coffee, iced tea , potato wedges with dip and 2 varieties of jelly filled donuts.

It was good and loved the beverages too.


Filtered Review:
've always loved dunkin donuts stopped quick grab ordered combo cappuccino cold coffee iced tea potato wedges dip 2 varieties jelly filled donuts good loved beverages


found food: donuts
found food: cappuccino
found drink: coffee
found drink: iced
found drink: tea
found food: potato

Original Review:
Awful burger!


Filtered Review:
awful burger


found food: burger

Original Review:
potato was not pealed it was not that good burger was good


Filtered Review:
potato pealed good burger good


found food: potato
found food: burger

Original Review:
worst food ever I had


Filtered Review:
worst food ever



Original Review:
Stopped in for a quick bite to tide us over till dinner.

### Failure Analysis

In [20]:
# Extract NER tags for Review no. 32
Extract_NER(32, model, verbose_log=True)

Original Review:
Here is my Review!

LOCATION:-
Very easy to locate, it is very near to IKEA, a prominent landmark in Hyderabad. Its on the 4th floor, also rooftop option is available in the evening time. 

AMBIENCE :-

Once you enter the Pub, the interior will blow you away. The unique point about PH7 is the view it provides through big windows of the hustle of Gachibowli and high-rise buildings around.

FOOD :-
I tried lot many things here ! Here is the detailed review of every item:

MOCKTAILS:
-
PH7 SPECIAL MOCKTAIL- It literally holds the name! The most colorful mojito I ever came across, the flavors will tickle your taste buds!

BLUE SEA- Its so beautiful to look at! The blue shade darkens down the bottom which contains lichi cream and makes this mojito special for sweet lovers.

PINA COLADA- You might wanna skip this one, I found the flavors not complementing each other.

FLAVORED VIRGIN MOJITO - Most recommended drink for hot summers. Refreshing, minty and relaxing !

SALADS:-


{'food': ['pizza', 'pizza'], 'drink': []}

Model should have labeled ‘SICILIAN’ and ‘cheese’ as food as well.


In [21]:
# Extract NER tags for Review no. 4001
Extract_NER(4001, model, verbose_log=True)

Original Review:
Services jayanta are wonderful. Starters are very good to start with mango flavour prawns adds classic mango taste. Wish pizza is also good for different pizza lovers. Overall a fantastic experience.


Filtered Review:
services jayanta wonderful starters good start mango flavour prawns adds classic mango taste wish pizza also good different pizza lovers overall fantastic experience


found food: mango
found food: mango
found food: pizza
found food: pizza



{'food': ['mango', 'mango', 'pizza', 'pizza'], 'drink': []}

Model should have labeled ‘prawns’ as a food item as well.


In [22]:
# Extract NER tags for Review no. 6777
Extract_NER(6777, model, verbose_log=True)

Original Review:
We orderd banjara kebab which was extremely soft, juicy and outstandingly tasty.full marks to this starter that melts in the mouth.This was followed by Zafrani murg Biryani which tasted good but not out of the way tasty.Both dishes were moderately spiced and i am happy about it. Great value for money and good service.


Filtered Review:
orderd banjara kebab extremely soft juicy outstandingly tasty.full marks starter melts mouth.this followed zafrani murg biryani tasted good way tasty.both dishes moderately spiced happy great value money good service





{'food': [], 'drink': []}

Model should have labeled ‘banjara kebab’ as a food item.
