Removing Named Entities

Importing Corpus From Directory

In [1]:
import pandas as pd
import os, glob
# Directory that holds the plain-text case documents.
folder = "C:\\Users\\ASUS\\AILA Practice\\casedocs_idf"
os.chdir(folder)  # the glob pattern and open() calls below are relative to this folder
files = glob.glob("*.txt")  # names of every .txt file in the folder

# Map each file name to the full text of that file.
corpus_dict = {}
for txt_name in files:
    with open(txt_name, 'r') as handle:
        corpus_dict[txt_name] = handle.read()

# Document texts, in the same order as `files`.
corpus = [corpus_dict[txt_name] for txt_name in files]

In [2]:
# Shallow copy of the document list: `new_corpus` has its entries replaced by
# entity-stripped text in a later cell, while `corpus` keeps the raw texts.
new_corpus = corpus.copy()

In [3]:
# Drop the first loaded document from `corpus`; its tokens (tokenized_corpus[0])
# are used as the query later, so `corpus` now holds only the documents to rank.
# NOTE: this mutates `corpus` in place -- re-running the cell removes another document.
del corpus[0]

Identify named entities using the spaCy library and remove them from the corpus

In [4]:
def remove_multiple_strings(cur_string, replace_list):
    """Delete one occurrence of each listed substring, then lower-case the result.

    Parameters
    ----------
    cur_string : str
        Text to clean.
    replace_list : iterable of str
        Substrings to delete. They are processed in order, and each entry
        removes only the left-most match still present in the text, so a
        substring listed twice removes two occurrences.

    Returns
    -------
    str
        The cleaned text, lower-cased.
    """
    remaining = cur_string
    for fragment in replace_list:
        # count=1: only the first remaining match of this fragment is removed.
        remaining = remaining.replace(fragment, '', 1)
    return remaining.lower()

import spacy
nlp = spacy.load('en_core_web_md')

# Replace every document in `new_corpus` with a lower-cased copy from which the
# text of each spaCy-detected named entity has been removed.
# `enumerate` supplies the slot to overwrite directly; looking the position up
# with `new_corpus.index(doc)` rescans the list (comparing whole documents) on
# every iteration and can return the wrong slot when an earlier, already
# cleaned entry happens to equal the current text.
# NOTE(review): removal is by text match (first occurrence per entity mention),
# not by character offset, so a match inside a longer word can be removed instead.
for i, doc in enumerate(new_corpus):
    processed_doc = nlp(doc)
    entities = [ent.text for ent in processed_doc.ents]
    new_corpus[i] = remove_multiple_strings(doc, entities)

Split the corpus into tokens and remove punctuation and stop words using spaCy

In [5]:
# One token list per document: keep the verbatim text of every token that is
# not punctuation, not whitespace and not a stop word.
tokenized_corpus = []
for cleaned_text in new_corpus:
    parsed = nlp(cleaned_text)
    kept_tokens = [
        tok.orth_
        for tok in parsed
        if not (tok.is_punct or tok.is_space or tok.is_stop)
    ]
    tokenized_corpus.append(kept_tokens)

In [6]:
# Inspect the tokens of the first document (the one used as the query later).
tokenized_corpus[0]

['appellant',
 'detained',
 'managing',
 'director',
 'company',
 'registered',
 'incorporated',
 'companyc1',
 'exporter',
 'held',
 'valid',
 'licence',
 'therefor',
 'company',
 'export',
 'products',
 'alloy',
 'steel',
 'exporting',
 'alloy',
 'steel',
 'entitled',
 'credits',
 'scheme',
 'introduced',
 'object',
 'encouraging',
 'exports',
 'allegedly',
 'value',
 'description',
 'goods',
 'procuring',
 'fake',
 'false',
 'bills',
 'said',
 'said',
 'operating',
 'firms',
 'companyc2',
 'companyc3',
 'companyc4',
 'allegedly',
 'found',
 'non',
 'alloy',
 'steel',
 'bars',
 'rods',
 'etc',
 'value',
 'ranging',
 '15/-',
 '17/-',
 'kg',
 'exported',
 'guise',
 'alloy',
 'steel',
 'forgings',
 'bars',
 'rods',
 'etc',
 'declaring',
 'value',
 'thereof',
 '110/-',
 'kg',
 'export',
 'proceeds',
 'actual',
 'price',
 'routed',
 'officers',
 'dri',
 'searched',
 'factory',
 'residential',
 'premises',
 'appellant',
 'incriminating',
 'documents',
 'recovered',
 'appellant',
 'said',
 

------GENSIM GITHUB CODE START------

In [7]:
import logging
import math
from six import iteritems
from six.moves import range
from functools import partial
from multiprocessing import Pool

# Default BM25 hyper-parameters; see BM25.__init__ for what each one controls.
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25

logger = logging.getLogger(__name__)


class BM25(object):
    """Implementation of Best Matching 25 ranking function.
    Attributes
    ----------
    corpus_size : int
        Size of corpus (number of documents).
    avgdl : float
        Average length of document in `corpus` (0 when the corpus is empty).
    doc_freqs : list of dicts of int
        Dictionary with terms frequencies for each document in `corpus`. Words used as keys and frequencies as values.
    idf : dict
        Dictionary with inversed documents frequencies for whole `corpus`. Words used as keys and frequencies as values.
    average_idf : float
        Mean of the raw idf values over the vocabulary (0.0 when the vocabulary is empty).
    doc_len : list of int
        List of document lengths.
    """

    def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus.
        k1 : float
            Constant used for influencing the term frequency saturation. After saturation is reached, additional
            presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
            that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
            the type of documents or queries.
        b : float
            Constant used for influencing the effects of different document lengths relative to average document length.
            When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
            [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
            depends on factors such as the type of documents or queries.
        epsilon : float
            Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts
            negative idf values. Negative idf implies that adding a very common term to a document penalize the overall
            score (with 'very common' meaning that it is present in more than half of the documents). That can be
            undesirable as it means that an identical document would score less than an almost identical one (by
            removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
            different documents) to receive an extra score.
        """

        self.k1 = k1
        self.b = b
        self.epsilon = epsilon

        self.corpus_size = 0
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        # Set here so the attribute exists even when the vocabulary is empty.
        self.average_idf = 0.0
        self._initialize(corpus)

    def _initialize(self, corpus):
        """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies.

        An empty corpus, or a corpus whose documents are all empty, leaves `idf` empty and
        `average_idf` at 0.0 instead of failing with a division by zero.
        """
        nd = {}  # word -> number of documents with word
        num_doc = 0  # total number of tokens over all documents
        for document in corpus:
            self.corpus_size += 1
            self.doc_len.append(len(document))
            num_doc += len(document)

            # term -> number of occurrences inside this document
            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            # each distinct term counts once per document
            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

        if self.corpus_size == 0:
            logger.warning('BM25 was built from an empty corpus; every query will get an empty score list.')
            return

        self.avgdl = float(num_doc) / self.corpus_size
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        # collect words with negative idf to set them a special epsilon value.
        # idf can be negative if word is contained in more than half of documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)

        if not self.idf:
            # Every document was empty: there is no vocabulary to average over.
            return
        self.average_idf = float(idf_sum) / len(self.idf)

        if self.average_idf < 0:
            logger.warning(
                'Average inverse document frequency is less than zero. Your corpus of {} documents'
                ' is either too small or it does not originate from natural text. BM25 may produce'
                ' unintuitive results.'.format(self.corpus_size)
            )

        # Words found in more than half of the documents get a floor value instead of a negative idf.
        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def get_score(self, document, index):
        """Computes BM25 score of given `document` in relation to item of corpus selected by `index`.
        Parameters
        ----------
        document : list of str
            Document to be scored.
        index : int
            Index of document in corpus selected to score with `document`.
        Returns
        -------
        float
            BM25 score.
        """
        score = 0.0
        doc_freqs = self.doc_freqs[index]
        if not doc_freqs:
            # An empty indexed document matches nothing. Returning early also avoids
            # dividing by `avgdl`, which is 0 when every indexed document is empty.
            return score
        numerator_constant = self.k1 + 1
        denominator_constant = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl)
        for word in document:
            if word in doc_freqs:
                tf = doc_freqs[word]  # occurrences of `word` in the indexed document
                idf = self.idf[word]
                score += (idf * tf * numerator_constant) / (tf + denominator_constant)
        return score

    def get_scores(self, document):
        """Computes and returns BM25 scores of given `document` in relation to
        every item in corpus.
        Parameters
        ----------
        document : list of str
            Document to be scored.
        Returns
        -------
        list of float
            BM25 scores, one per indexed document, in corpus order.
        """
        scores = [self.get_score(document, index) for index in range(self.corpus_size)]
        return scores

    

---------GENSIM GITHUB CODE END--------

Applying gensim BM25 to the corpus to obtain scores

In [8]:
# Build the BM25 index over the candidate documents only: tokenized_corpus[0]
# is held out as the query, and the next len(corpus) entries line up
# one-to-one with `corpus`.
# A slice replaces the manual index loop, and the variable is no longer called
# `list`, which hid the builtin for every later cell in the kernel.
candidate_docs = tokenized_corpus[1:len(corpus) + 1]
p1 = BM25(candidate_docs)

In [9]:
# The tokens of the first (held-out) document serve as the query.
query = tokenized_corpus[0]
# One BM25 score per document indexed by `p1`, in index order.
resultant_scores = p1.get_scores(query)
resultant_scores

[86.07006747305974,
 142.4776971150876,
 182.02682049327385,
 121.14036507210946,
 79.22396098391599,
 92.20639167325,
 123.89941059582694,
 116.78680785519305,
 114.85388519627251,
 120.98354765663551,
 134.81437286116997,
 79.55794426146704,
 125.19935023857747,
 233.66608488209977,
 75.09752103115817,
 87.13961257379286,
 155.51655753842175,
 140.16069849947405,
 156.06387935045126,
 102.5598344376015,
 116.74509029415792,
 77.1339004580865,
 85.48698923764408,
 132.49267835924664,
 150.3318540489649,
 179.637623326596,
 142.9186194081685,
 147.79178679887224,
 118.16837435701402,
 145.4119671500149,
 36.44348260575132,
 173.8565370813644,
 108.13431399652387,
 81.2302885833642,
 79.25618631101989,
 167.04897264633652,
 96.17336113903099,
 117.68081931975989,
 188.11207005286994,
 125.3322984763336,
 123.12518073315897,
 120.69637349415547,
 124.02523553570207,
 112.03161621598659,
 137.0510539156831,
 112.9850348773116,
 117.93404330390783,
 107.37645072939132,
 109.78256604118323,

SORTING RESULTANT SCORES IN DESCENDING ORDER

In [10]:
import numpy as np
# Convert the score list to a NumPy array so it can be arg-sorted and indexed by position arrays.
res_list = np.array(resultant_scores)
res_list

array([ 86.07006747, 142.47769712, 182.02682049, ...,  64.79649369,
       141.76232867,  93.0438769 ])

In [11]:
# argsort is ascending, so the reversed result lists document positions from
# highest to lowest score. Despite the name, `top_n` covers every document.
top_n = np.argsort(res_list)[::-1]
# Raw document texts in ranked order.
result = [corpus[i] for i in top_n]

In [12]:
# Scores reordered to match `top_n`, i.e. in descending order.
scores = res_list[top_n]
scores

array([306.04869137, 292.22700471, 290.13547883, ...,  21.37731277,
        16.10440955,  15.84622789])

FINDING FILENAME CORRESPONDING TO SCORES

In [13]:
# Recover the file name of every ranked document by position instead of by
# comparing full texts against every entry of `corpus_dict`.
# `corpus` was filled in the same order as `files` and then had documents
# removed from the front only, so corpus[k] was read from
# files[k + number_of_removed_documents].
# This yields exactly one name per ranked document even when two files contain
# identical text (content matching would list both names for each of them and
# shift every later rank), and it avoids the quadratic scan.
first_candidate = len(files) - len(corpus)
output = [files[first_candidate + doc_pos] for doc_pos in top_n]

In [14]:
# Turn ranked file names into document ids by cutting off the '.txt' extension.
# Names that do not end with the extension are left out.
sstring = '.txt'
new_output = [name[:-len(sstring)] for name in output if name.endswith(sstring)]

In [15]:
# Ranked document ids, best match first.
new_output

['C1824',
 'C2017',
 'C622',
 'C91',
 'C730',
 'C73',
 'C375',
 'C2172',
 'C177',
 'C1655',
 'C2211',
 'C418',
 'C2164',
 'C405',
 'C1490',
 'C1734',
 'C2730',
 'C2241',
 'C913',
 'C2811',
 'C175',
 'C1642',
 'C2506',
 'C1042',
 'C2158',
 'C199',
 'C366',
 'C1251',
 'C1138',
 'C101',
 'C136',
 'C161',
 'C2852',
 'C2667',
 'C2131',
 'C272',
 'C1547',
 'C732',
 'C372',
 'C1486',
 'C2787',
 'C150',
 'C1940',
 'C2463',
 'C1531',
 'C1908',
 'C2328',
 'C2666',
 'C759',
 'C996',
 'C1483',
 'C2351',
 'C2426',
 'C2374',
 'C128',
 'C1457',
 'C1947',
 'C1478',
 'C1916',
 'C1479',
 'C1354',
 'C360',
 'C2468',
 'C1050',
 'C909',
 'C162',
 'C2742',
 'C2700',
 'C24',
 'C1658',
 'C648',
 'C14',
 'C105',
 'C2257',
 'C457',
 'C1202',
 'C1853',
 'C140',
 'C1787',
 'C1858',
 'C2266',
 'C1521',
 'C23',
 'C2818',
 'C374',
 'C2897',
 'C1887',
 'C2414',
 'C1300',
 'C2',
 'C316',
 'C737',
 'C2366',
 'C2544',
 'C2610',
 'C2717',
 'C174',
 'C2593',
 'C1772',
 'C671',
 'C798',
 'C1084',
 'C2712',
 'C1206',
 'C21'

WRITING TO A FILE FOR TREC-EVAL

In [16]:
# Append one line per ranked document to the run file:
#   <query id> Q0 <document id> <rank> <score> <run tag>
# The `with` block closes the file even if formatting or indexing raises
# part-way through; previously the handle stayed open in that case.
# The file is opened in append mode, so re-running this cell adds the same
# lines again.
with open("C:\\Users\\ASUS\\AILA Practice\\run-NER-bm25.txt", "a") as f:
    for i in range(len(corpus)):
        f.write("AILA_Q27 Q0 {} {} {} Default\n".format(new_output[i], (i+1), scores[i]))

In [17]:
# Number of ranked document ids; the run-file cell writes len(corpus) lines, so the two should match.
len(new_output)

2914