In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import enchant
import re
from nltk.tokenize import  word_tokenize

In [2]:
raw_data = pd.read_csv('nearby_crf.csv')
len(raw_data)

143354

In [3]:
lemmatizer = WordNetLemmatizer()

entity_dictionary_unigram = ['atm','medical','store', 'petrol','pump', 'beauty', 'salon', 'bus','stop','pump'
                            'club', 'railway', 'station', 'gym', 'bank', 'post', 'office', 'airport'
                            'restaurant', 'movie', 'theatre', 'hotel', 'college', 'cinema', 'parlour'
                            'zoo', 'pub', 'bar', 'school','night']
entity_dictionary_unigram_lemmas = [lemmatizer.lemmatize(word) for word in entity_dictionary_unigram]

In [4]:
entity_dictionary_bigram = ['petrol pump', 'railway station', 'beauty parlour', 'medical store', 'post office'
                           'night club', 'bus stop', 'beauty salon','chemist']
entity_dictionary_bigram_lemmas = [lemmatizer.lemmatize(word) for word in entity_dictionary_bigram]

In [5]:
def dictionary_replacement(body, entity_dictionary_unigram, entity_dictionary_bigram):
    lemmatizer = WordNetLemmatizer()
    body['lemmas'] = body['body'].apply(lambda body : " ".join([lemmatizer.lemmatize(tokens) for tokens in word_tokenize(str(body).lower())]))
    x =list(body['lemmas'])
    for i in range(len(x)):
        for word in entity_dictionary_unigram:
            if word in x[i]:
                x[i] = x[i].replace(word, '_'+word+'_')
                
    return x     
        
        
        
                                    
                                
    

In [6]:
raw_data['tagged'] = dictionary_replacement(raw_data, entity_dictionary_unigram_lemmas, entity_dictionary_bigram_lemmas)

In [7]:
def entity_tagger(tagged_data):
    x = list(tagged_data['tagged'])
    tagged_list = []
    
    for sentence in x:
        tagged_words = []
        for tokens in word_tokenize(sentence):
            if( "_" in tokens):
                tagged_words.append((tokens.replace('_',''), 'N'))
            else:
                tagged_words.append((tokens, 'I'))
        tagged_list.append(tagged_words)

    return tagged_list
            

In [8]:
docs = entity_tagger(raw_data)

In [9]:
import nltk
def post_tag(docs):
    data = []
    for i, doc in enumerate(docs):

        # Obtain the list of tokens in the document
        tokens = [t for t, label in doc]

        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)

        # Take the word, POS tag, and its label
        data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
    return data

In [10]:
data = post_tag(docs)

In [11]:
def is_nearest(x):
    if ('near' in x):
        return True
    return False

def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]
    
    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1,
            '-1:is_near=' + str(bool('near' in word1)),
            '-1:is_close=' + str(bool('close' in word1))
            
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [12]:
from sklearn.model_selection import train_test_split

# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [13]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 1,

    # coefficient for L2 penalty
    'c2': 1,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf2.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 29955
Seconds required: 2.327

L-BFGS optimization
c1: 100.000000
c2: 100.000000
num_memories: 6
max_iterations: 1000
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 388203.220351
Feature norm: 1.000000
Error norm: 357649.163578
Active features: 1103
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 0.895

***** Iteration #2 *****
Loss: 353189.107330
Feature norm: 0.907354
Error norm: 331397.582344
Active features: 893
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.497

***** Iteration #3 *****
Loss: 332256.345722
Feature norm: 0.556953
Error norm: 816647.154684
Active features: 710
Line search trials: 2
Line search step: 0.500000
Seconds requir

***** Iteration #63 *****
Loss: 34520.711445
Feature norm: 8.552606
Error norm: 865.749326
Active features: 309
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.442

***** Iteration #64 *****
Loss: 34518.656097
Feature norm: 8.560668
Error norm: 639.730612
Active features: 307
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.441

***** Iteration #65 *****
Loss: 34517.348564
Feature norm: 8.562562
Error norm: 694.189380
Active features: 303
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.442

***** Iteration #66 *****
Loss: 34515.850349
Feature norm: 8.569846
Error norm: 605.071407
Active features: 303
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.440

***** Iteration #67 *****
Loss: 34514.504866
Feature norm: 8.571224
Error norm: 574.593903
Active features: 303
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #144 *****
Loss: 34468.968828
Feature norm: 8.559473
Error norm: 342.576750
Active features: 298
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.904

***** Iteration #145 *****
Loss: 34468.554249
Feature norm: 8.559022
Error norm: 169.488704
Active features: 298
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.901

***** Iteration #146 *****
Loss: 34468.529122
Feature norm: 8.559553
Error norm: 369.137483
Active features: 298
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.917

***** Iteration #147 *****
Loss: 34468.124229
Feature norm: 8.558828
Error norm: 184.305896
Active features: 298
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.948

***** Iteration #148 *****
Loss: 34468.066074
Feature norm: 8.559466
Error norm: 358.111077
Active features: 298
Line search trials: 2
Line search step: 0.500000
Seconds requir

In [14]:
tagger = pycrfsuite.Tagger()
tagger.open('crf2.model')

y_pred = [tagger.tag(xseq) for xseq in X_test]


In [15]:
i=5
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))



more (I)
option (I)
{ (I)
attempt (I)
: (I)
2 (I)
, (I)
apiname (N)
: (I)
nearbyapi (N)
, (I)
placestype (N)
: (I)
atm (N)
} (I)


In [16]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))


             precision    recall  f1-score   support

          I       1.00      1.00      1.00    330083
          N       0.99      0.96      0.98     18634

avg / total       1.00      1.00      1.00    348717



In [17]:
test =pd.DataFrame()
test['body'] = ['the nearest wine shop','where is the nearest atm bro']
#print(test['body'])
test['tagged'] = dictionary_replacement(test,entity_dictionary_unigram_lemmas,entity_dictionary_bigram_lemmas)
ps = post_tag(entity_tagger(test))
X = [extract_features(doc) for doc in ps]
y = [get_labels(doc) for doc in ps]

In [18]:
y_pred = [tagger.tag(xseq) for xseq in X]

In [21]:
i=0

for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X[i]]):
    print("%s (%s)" % (y, x))



the (I)
nearest (I)
wine (I)
shop (I)


In [20]:
'near' in 'nearest'

True