In [None]:
!pip install python-crfsuite

Collecting python-crfsuite
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.10


In [None]:
import pandas as pd
import numpy as np

# NLP imports
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import pycrfsuite
from sklearn.metrics import classification_report

# misc imports
from ast import literal_eval
import glob

In [None]:
# Make Google Drive reachable from this Colab runtime
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Provide Base path to Yelp Dataset
# Keep the trailing slash: other cells build file paths from this value by plain
# string concatenation (e.g. dataset_root + 'yelp_NER_sample_*.csv').
dataset_root = '/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/NER/Yelp/'

# CRF (Conditional Random Fields)
References:

1. https://python-crfsuite.readthedocs.io/en/latest/
2. https://medium.com/ml2vec/overview-of-conditional-random-fields-68a2a20fa541
3. https://towardsdatascience.com/conditional-random-field-tutorial-in-pytorch-ca0d04499463
4. https://dev.to/fferegrino/conditional-random-fields-in-python-sequence-labelling-part-4-5ei2

In [None]:
# Fetch the NLTK data used below: tokenizer model, POS tagger and stop-word list
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Data Preprocessing

In [None]:
# Get all NER datasets in root directory.
# glob returns matches in arbitrary order, so sort them: the row order of the
# combined frame feeds the seeded train/test split and must be stable for that
# split to be reproducible across runs and machines.
file_list = sorted(glob.glob(dataset_root + 'yelp_NER_sample_*.csv'))
if not file_list:
    # Fail with an actionable message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(
        f"No 'yelp_NER_sample_*.csv' files found under {dataset_root!r}"
    )

# Read every sample file into its own DataFrame
dfs = [pd.read_csv(file) for file in file_list]

# Concatenate all DataFrames into a single DataFrame
crf_df = pd.concat(dfs, ignore_index=True)

# Drop reviews with missing annotations
crf_df = crf_df.dropna(subset=['ner_results'])


In [None]:
# Function to convert text into (word, pos_tag, ner_label) tuples
def convert_text_to_tuples(text, ner_results):
    """Turn one review into a list of ``(word, pos_tag, ner_label)`` tuples.

    The review is lowercased, tokenized, and stripped of punctuation-only
    tokens and English stop words. Entity names from ``ner_results`` are
    normalized with the same pipeline so that they are comparable to the
    review tokens, then matched against the token sequence. Tokens that are
    part of a matched entity get that entity's label; all others get 'None'.

    Args:
        text: Raw review text.
        ner_results: String representation of a dict (or an already parsed
            dict) mapping a label ('food', 'drink' or 'None') to a list of
            entity-name strings.

    Returns:
        A list with one ``(word, pos_tag, ner_label)`` tuple per remaining
        token. An empty list is returned when the review cannot be used:
        non-string text, unparsable or non-dict annotations, unknown labels,
        or entity lists that are not lists of strings.
    """
    if not isinstance(text, str):
        print(f"caught unexpected review text: {text!r}")
        return []

    # Convert ner_results from string to dictionary
    if isinstance(ner_results, str):
        try:
            ner_results_dict = literal_eval(ner_results)
        except (ValueError, SyntaxError):
            print(f"caught unparsable NER: {ner_results}")
            return []
    else:
        ner_results_dict = ner_results
    if not isinstance(ner_results_dict, dict):
        print(f"caught unexpected NER type: {ner_results}")
        return []

    # Check only ('food', 'drink', 'None') exist in NER labels
    valid_ner_labels = ('food', 'drink', 'None')
    if any(label not in valid_ner_labels for label in ner_results_dict):
        return []

    # Set membership keeps stop-word filtering cheap for long reviews.
    stop_words = set(stopwords.words('english'))

    # Validate and normalize every entity once, up front. Validating here
    # (rather than while walking the tokens) guarantees an invalid annotation
    # yields an empty result instead of a partially labelled sequence.
    entity_patterns = []
    for label, entities in ner_results_dict.items():
        if not isinstance(entities, (list, tuple)) or not all(
            isinstance(entity, str) for entity in entities
        ):
            print(f"caught unexpected NER: {ner_results}")
            return []
        for entity in entities:
            entity_tokens = _normalize_tokens(entity, stop_words)
            # Entities that normalize to nothing (empty string, stop words
            # only) can never match and would otherwise stall the scan.
            if entity_tokens:
                entity_patterns.append((label, entity_tokens))

    # Prefer the longest entity at each position so a short entity does not
    # shadow a longer one that starts with the same word. The sort is stable,
    # so equally long entities keep their annotation order.
    entity_patterns.sort(key=lambda pattern: len(pattern[1]), reverse=True)

    words = _normalize_tokens(text, stop_words)
    # Get POS tags for the words
    pos_tags = pos_tag(words)

    word_tuples = []
    i = 0
    while i < len(words):
        for label, entity_tokens in entity_patterns:
            span = len(entity_tokens)
            if words[i:i + span] == entity_tokens:
                # Label every word of the entity, then resume after it. Only
                # the first matching pattern is applied to a position.
                word_tuples.extend(
                    (words[i + j], pos_tags[i + j][1], label) for j in range(span)
                )
                i += span
                break
        else:
            # Not part of any named entity
            word_tuples.append((words[i], pos_tags[i][1], 'None'))
            i += 1
    return word_tuples


def _normalize_tokens(text, stop_words):
    """Lowercase and tokenize ``text``, dropping punctuation-only tokens and stop words."""
    punctuation_chars = set(string.punctuation)
    tokens = word_tokenize(text.lower(), language='english')
    return [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
        and token not in stop_words
    ]


In [None]:
# Build the (word, pos_tag, ner_label) sequence for every review, row by row
crf_df['word_tuples'] = [
    convert_text_to_tuples(review_text, review_ner)
    for review_text, review_ner in zip(crf_df['text'], crf_df['ner_results'])
]

# Keep only the reviews that produced at least one tuple
has_tuples = crf_df['word_tuples'].map(len) > 0
crf_df = crf_df[has_tuples]

caught unexpected NER type: [{'type': 'food', 'name': 'Mac n cheese'}, {'type': 'drink', 'name': 'beer'}]
caught unexpected NER: {'food': [{'item': 'Tuna Tartare', 'rating': 4.5}, {'item': 'Spinach and Asparagus', 'rating': 4.5}, {'item': 'Chilean Sea Bass', 'rating': 4.5}, {'item': 'Crab Cake', 'rating': 4.5}], 'drink': []}
caught unexpected NER: {'food': [{'name': 'Lynchburg Basil Lemonade', 'description': 'Jack Daniels, Fresh Basil, Homemade Southern Lemonade'}, {'name': 'Black Cherry Gimlet', 'description': 'Black Cherry Vodka, Vanilla, Lime, maraschino Cherry'}], 'drink': [{'name': 'Lynchburg Basil Lemonade', 'description': 'Jack Daniels, Fresh Basil, Homemade Southern Lemonade'}, {'name': 'Black Cherry Gimlet', 'description': 'Black Cherry Vodka, Vanilla, Lime, maraschino Cherry'}]}
caught unexpected NER: {'food': [{'name': 'Foie Gras Soup', 'description': 'Perfect combination of richness and spice, and just awesome.', 'rating': 5}, {'name': 'Pork Belly in Chili Oil', 'descriptio

In [None]:
# Hold out 20% of the reviews for testing; fixed seed keeps the shuffle repeatable
crf_df_train, crf_df_test = train_test_split(
    crf_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Report the size of each partition
for caption, frame in (
    ("Total dataset", crf_df),
    ("Train set", crf_df_train),
    ("Test set", crf_df_test),
):
    print(f"{caption} size: {len(frame)}")

Total dataset size: 18748
Train set size: 14998
Test set size: 3750


### Extract features

In [None]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of tuples whose first two items are the word and its
            POS tag.
        i: Index of the token to describe.

    Returns:
        A dict with features of the token itself, of its left and right
        neighbours when they exist, and 'BOS' / 'EOS' markers when the token
        has no left / right neighbour.
    """
    current = sent[i]
    token, tag = current[0], current[1]

    # Features of the token itself
    features = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': tag,
        'postag[:2]': tag[:2],
    }

    # Left context, or a beginning-of-sequence marker
    if i <= 0:
        features['BOS'] = True
    else:
        left = sent[i - 1]
        left_token, left_tag = left[0], left[1]
        features['-1:word.lower()'] = left_token.lower()
        features['-1:word.istitle()'] = left_token.istitle()
        features['-1:word.isupper()'] = left_token.isupper()
        features['-1:postag'] = left_tag
        features['-1:postag[:2]'] = left_tag[:2]

    # Right context, or an end-of-sequence marker
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        right = sent[i + 1]
        right_token, right_tag = right[0], right[1]
        features['+1:word.lower()'] = right_token.lower()
        features['+1:word.istitle()'] = right_token.istitle()
        features['+1:word.isupper()'] = right_token.isupper()
        features['+1:postag'] = right_tag
        features['+1:postag[:2]'] = right_tag[:2]

    return features

#################################################################

import string
# Individual ASCII punctuation characters, kept in a set for fast membership tests
punctuation = {symbol for symbol in string.punctuation}


def is_punctuation(token):
    """Return True when ``token`` is one of the characters in ``string.punctuation``."""
    found = token in punctuation
    return found

def is_numeric(token):
    """Return True when ``token`` parses as a float once commas are removed.

    Commas are stripped before parsing, so thousands separators such as
    "1,000" are accepted. Anything ``float`` itself accepts counts as numeric.

    Args:
        token: Token to test; normally a string.

    Returns:
        True if the comma-free token can be converted with ``float``,
        otherwise False. Non-string tokens without a usable ``replace``
        method also yield False.
    """
    try:
        float(token.replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures mean "not numeric"; interrupts and other
        # unrelated errors must propagate instead of being reported as False.
        return False
    return True

def word2features_1(sentence_frame, i):
    """Alternative feature extractor for the token at position ``i``.

    Args:
        sentence_frame: Sequence of tuples whose first two items are the
            token and its POS tag.
        i: Index of the token to describe.

    Returns:
        A dict with lexical features of the token itself, lexical and POS
        features of its neighbours when they exist, and 'BOS' / 'EOS'
        markers when the token has no left / right neighbour.
    """
    token = sentence_frame[i][0]
    postag = sentence_frame[i][1]  # read here, but not emitted as a feature

    # Features of the token itself
    features = {
        'bias': True,
        'word.lower': token.lower(),
        'word.istitle': token.istitle(),
        'word.isdigit': is_numeric(token),
        'word.ispunct': is_punctuation(token),
    }

    # Left neighbour, or a beginning-of-sequence marker
    if i <= 0:
        features['BOS'] = True
    else:
        left = sentence_frame[i - 1]
        left_token, left_pos = left[0], left[1]
        features['-1:word.lower'] = left_token.lower()
        features['-1:word.istitle'] = left_token.istitle()
        features['-1:word.isdigit'] = is_numeric(left_token)
        features['-1:word.ispunct'] = is_punctuation(left_token)
        features['-1:postag'] = left_pos

    # Right neighbour, or an end-of-sequence marker
    if i >= len(sentence_frame) - 1:
        features['EOS'] = True
    else:
        right = sentence_frame[i + 1]
        right_token, right_pos = right[0], right[1]
        features['+1:word.lower'] = right_token.lower()
        features['+1:word.istitle'] = right_token.istitle()
        features['+1:word.isdigit'] = is_numeric(right_token)
        features['+1:word.ispunct'] = is_punctuation(right_token)
        features['+1:postag'] = right_pos

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent``, built with ``word2features``."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third item) of every ``(token, postag, label)`` triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first item) of every ``(token, postag, label)`` triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [None]:
%%time
# Extract features from reviews
# X_train = crf_df_train.apply(lambda row: sent2features(row['word_tuples']), axis=1)
# y_train = crf_df_train.apply(lambda row: sent2labels(row['word_tuples']), axis=1)
X_train = [sent2features(s) for s in list(crf_df_train['word_tuples'])]
y_train = [sent2labels(s) for s in list(crf_df_train['word_tuples'])]

X_test = [sent2features(s) for s in list(crf_df_test['word_tuples'])]
y_test = [sent2labels(s) for s in list(crf_df_test['word_tuples'])]



CPU times: user 4.74 s, sys: 1.82 s, total: 6.56 s
Wall time: 6.69 s


### Train Model

In [None]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 200,

    'feature.possible_transitions': True
})

CPU times: user 210 µs, sys: 0 ns, total: 210 µs
Wall time: 280 µs


In [None]:
# Register every training review with the trainer as a (features, labels) pair
for review_features, review_labels in zip(X_train, y_train):
    trainer.append(review_features, review_labels)

In [None]:
%%time
trainer.train(dataset_root + '../Model/crf_train_model_ner.crfsuite')

CPU times: user 2min 25s, sys: 819 ms, total: 2min 26s
Wall time: 2min 27s


In [None]:
!ls '/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/NER/Yelp/../Model/crf_train_model_ner.crfsuite'

'/content/drive/MyDrive/UMich Milestone II Project/Final_Code_Submission/Dataset/NER/Yelp/../Model/crf_train_model_ner.crfsuite'


### Inference

In [None]:
# Load the trained model into a tagger for inference
crf_tagger = pycrfsuite.Tagger()
trained_model_file = dataset_root + '../Model/crf_train_model_ner.crfsuite'
crf_tagger.open(trained_model_file)

<contextlib.closing at 0x78626f5b8d60>

In [None]:
def exact_match_indices(tagger, feature_seqs, label_seqs):
    """Return the indices of sequences the tagger labels exactly as expected.

    Optional debugging aid; defining it runs no tagging and prints nothing.

    Args:
        tagger: Object with a ``tag(feature_seq)`` method returning a list of
            labels, e.g. the opened CRF tagger.
        feature_seqs: Sequence of per-review feature sequences (e.g. X_test).
        label_seqs: Sequence of per-review expected label lists (e.g. y_test).

    Returns:
        List of positions ``i`` where ``tagger.tag(feature_seqs[i])`` equals
        ``label_seqs[i]``.
    """
    return [
        index
        for index, (feature_seq, expected) in enumerate(zip(feature_seqs, label_seqs))
        if tagger.tag(feature_seq) == expected
    ]

'\nfor i in range(len(X_test)):\n  predicted_tags = crf_tagger.tag(X_test[i])\n  if predicted_tags == y_test[i]:\n    print(i)\n    '

In [None]:
# Examine a single held-out review: print its text, its expected annotation,
# and every token the CRF tagged as food or drink.
REV_IDX = 574
review = crf_df_test.iloc[REV_IDX]
print(f"Review text: {review['text']} \n")
print(f"Expected NER label: {review['ner_results']}")

# X_test was built from crf_df_test in row order, so the same position
# addresses the same review in both.
review_features = X_test[REV_IDX]
response = crf_tagger.tag(review_features)

for token_features, tag in zip(review_features, response):
    if tag in ('food', 'drink'):
        print(f"found {tag} item: {token_features['word.lower()']}")


Review text: - TIP:  Their black garlic wings are some of the best wings in the city.

- CON:  Recent happy hour addition was downright disappointing.  Quality of dishes wasn't up to Cheu's standard.  Service was curt.  

- BOTTOM LINE:  It pains me to give this place 3 stars.  A year ago, I emphetically would proclaim it to be my favorite restaurant in the city.  But they've lost their way.  Hand torn noodles are gone from the menu.  Service has been shoddy (it used to be extremely welcoming).  Food quality has been inconsistent.  It's a shame, because I want to keep on loving this place. 

Extected NER label: {'food': ['black garlic wings'], 'drink': []}
found food item: black
found food item: garlic
found food item: wings
found food item: wings


### Evaluation

In [None]:
# Flatten expected and predicted labels across all test reviews, then score per token
all_true = [label for review_labels in y_test for label in review_labels]
all_pred = [
    tag
    for review_features in X_test
    for tag in crf_tagger.tag(review_features)
]

print(classification_report(all_true, all_pred))

              precision    recall  f1-score   support

        None       0.96      0.98      0.97    203007
       drink       0.57      0.31      0.40      2404
        food       0.65      0.47      0.54     13746

    accuracy                           0.94    219157
   macro avg       0.73      0.59      0.64    219157
weighted avg       0.93      0.94      0.94    219157

