# NER Tagging for FAQ Model

Part-of-speech (POS) and inside-outside-beginning (IOB) chunk tagging of FAQ data

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

# Loading data

Importing necessary modules

In [None]:
import pandas as pd

Reading data

In [None]:
# Load the FAQ training sheet and discard its "Remarks" column in one chained step.
faq_data = pd.read_excel("faq_train_updated.xlsx").drop(columns="Remarks")

# Defining tagging functions

Importing necessary modules

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk import ne_chunk
# Fetch the NLTK data packages used by the tagging cells below.
# "punkt" is included because nltk.word_tokenize relies on the Punkt tokenizer
# models; without it a fresh environment fails with a LookupError.
# NOTE(review): recent NLTK releases may ship these resources under different
# package names -- confirm against the installed NLTK version.
for resource in (
    "punkt",
    "maxent_ne_chunker",
    "words",
    "averaged_perceptron_tagger",
):
    nltk.download(resource)

Defining truecase function

In [None]:
def truecase(sent):
    """Tokenize *sent* and upper-case the first letter of every noun token.

    The sentence is split with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; tokens tagged NN, NNS, NNP or NNPS get their first
    character upper-cased, all other tokens are returned untouched.

    Args:
        sent: Raw sentence text.

    Returns:
        The list of tokens, in sentence order.
    """
    noun_tags = {"NN", "NNP", "NNS", "NNPS"}
    tokens = nltk.word_tokenize(sent)
    return [
        # Only the first character is changed: str.capitalize() would also
        # lower-case the remainder and flatten acronyms ("GST" -> "Gst").
        token[:1].upper() + token[1:] if tag in noun_tags else token
        for token, tag in nltk.pos_tag(tokens)
    ]

Function to find POS Tag

In [None]:
def pos_tagger(sent):
    """Tokenize *sent* and return NLTK's (token, POS tag) pairs for it."""
    return nltk.pos_tag(nltk.word_tokenize(sent))

Function to find IOB Tag

In [None]:
def iob_tagger(sent):
    """Return the CoNLL-style tags that NLTK's NE chunker yields for *sent*.

    The sentence is first passed through ``truecase``; the resulting tokens
    are POS-tagged, chunked with ``ne_chunk`` and flattened with
    ``tree2conlltags``.
    """
    tagged_tokens = nltk.pos_tag(truecase(sent))
    return tree2conlltags(ne_chunk(tagged_tokens))

# Tagging data

List to store Data frames

In [None]:
# Accumulator: the tagging loop appends one DataFrame per question title to this list.
df = []

Creating a DataFrame for each sentence

In [None]:
# Build one DataFrame per question title: one row per token, carrying the
# token text, its POS tag and its IOB tag.
for i, title in enumerate(faq_data["Question Title"]):
    # Run each tagger once per sentence instead of once per token.
    pos_tags = pos_tagger(title)
    iob_tags = iob_tagger(title)
    frame = pd.DataFrame(
        {
            "Sentence": "",
            "Question Title": [word for word, _ in pos_tags],
            "POS Tag": [tag for _, tag in pos_tags],
            "IOB Tag": [iob for _, _, iob in iob_tags],
        }
    )
    # Only the first token row carries the sentence label. A title that yields
    # no tokens has no row 0, so it is left unlabelled instead of raising.
    if not frame.empty:
        # .at writes into the frame itself; the chained form
        # frame["Sentence"][0] = ... may assign to a temporary copy.
        frame.at[0, "Sentence"] = "Sentence: {}".format(i + 1)
    df.append(frame)

Concatenating all data frames

In [None]:
# Stack the per-sentence frames into a single table. ignore_index is not
# passed, so pandas' default index handling applies to the combined frame.
faq_df = pd.concat(df)

Exporting the DataFrame as an Excel sheet

In [None]:
# Write the combined tag table to "faq_ner2.xlsx" (a relative path).
faq_df.to_excel("faq_ner2.xlsx")

# Analyzing tags

Finding POS tag distribution

In [None]:
# Row count per POS tag, returned as a two-column frame ("POS Tag", "counts").
(
    faq_df
    .groupby("POS Tag")
    .size()
    .reset_index(name="counts")
)

Finding IOB tag distribution

In [None]:
# Row count per IOB tag, returned as a two-column frame ("IOB Tag", "counts").
(
    faq_df
    .groupby("IOB Tag")
    .size()
    .reset_index(name="counts")
)