## **Import packages**

In [0]:
import csv 
import requests 
import xml.etree.ElementTree as ET 
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.text import Text

In [0]:
# If executing in Google Colab
from google.colab import drive
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## **Import Data**

In [0]:
temp = pd.read_pickle("/content/drive/My Drive/Text_Summarization_Code/Data/data.pkl")
temp.shape

(4946, 3)

In [0]:
((temp.iloc[1293])['Full'])

'5. from the judgment and order dated 11.8.1987 of the punjab and haryana high court in civil revision no. 1048 of 1986, a.k gupta for the appellants. vikram mahajan, gopi chand and k.k. gupta for the respondent. the judgment of the court was delivered by verma, j. the suit premises in chandigarh was let out by the appellant to the respondent, smt. satya bhalla on 1.11.1974 on a monthly rent of rs. 550 solely for residential purpose. however, the respondent\'s husband, a lawyer established his office in a part of the suit premises and started using the same for that purpose. the appellant-landlord filed a petition before the rent controller in february, 1983 seeking eviction of the respondent-tenant on several grounds including the ground contained in section 13(2) (ii) (b) of the east punjab urban rent restriction act, 1949 i.e. the use of the building for a purpose other than that for which it was leased. the rent controller made an order of eviction of the respondent-tenant on the g

In [0]:
temp["col_len"] = temp.apply(lambda row:len(row.Full),axis = 1,result_type = "expand")
temp.drop(["Doc"],axis = 1,inplace = True)

In [0]:
temp

Unnamed: 0,Summary,Full,col_len
0,proceedings under section 145(1) of the crimi...,civil appeal nos. 587-696 & 598-600 of 1976. ...,6931
1,when a tenant has neither paid nor tendered t...,civil appeal no. 966 of 1976. (appeal by spec...,5106
2,by a writ petition under article 226 of the c...,civil appeal no. 1654 of1967. appeal from the...,21719
3,the appellant is the mahant of emhar math of ...,civil appeal no. 1 770 of 1972. appeal by spe...,16939
4,the respondent assessee maintains accounts re...,civil appeal nos. 894-896 of 1971. from the j...,13293
...,...,...,...
4941,the appellant and the respondents applied for...,civil appeal nos. 16 16-17 of 1990. from the ...,34289
4942,"under an agreement dated december 8, 1933, th...",civil appeal no. 183 of 1956. appeal from the...,18683
4943,in assessment proceedings for the year 1949-5...,"civil appeal nos. 736 to 739, 91-3 and 1621 o...",5957
4944,"two persons, b and c, formed a partnership fi...",civil appeals nos. 317 to 320 of 1957. appeal...,28086


## **Text Cleaning and Vocab building**

### **Run the code in sequence below**


### **1.Install Prerequisites**

In [0]:
## pip install spacy
## other prerequisites to be added
import spacy
from spacy.lang.en import English
nlp = English()
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 21787990
# nlp.add_pipe(nlp.create_pipe('sentencizer'))

nltk.download('stopwords')
nltk.download('punkt')
is_remove_stopwords = True
if is_remove_stopwords:
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### **2.Run the functions and declarations below**


In [0]:
ENTITY_ENUM = {
    '': '',
    'PERSON': 'person',
    'NORP': 'nationality',
    'FAC': 'facility',
    'ORG': 'organization',
    'GPE': 'country',
    'LOC': 'location',
    'PRODUCT': 'product',
    'EVENT': 'event',
    'WORK_OF_ART': 'artwork',
    'LANGUAGE': 'language',
    'DATE': 'date',
    'TIME': 'time',
    'PERCENT': 'number',
    'MONEY': 'number',
    'QUANTITY': 'number',
    'ORDINAL': 'number',
    'CARDINAL': 'number',
    'LAW': 'law'
}

NUMERIC_TYPES = set([
    'DATE',
    'TIME',
    'PERCENT',
    'MONEY',
    'QUANTITY',
    'ORDINAL',
    'CARDINAL',
])

**Basic text cleaning**

In [0]:
import re

def clean_string(text):
    text = str(text)
  # Replace weird chars in text
    text = re.sub("’", "'", text) # special single quote
    text = re.sub("`", "'", text) # special single quote
    text = re.sub("“", '"', text) # special double quote
    text = re.sub("？", "?", text) 
    text = re.sub("…", " ", text) 
    text = re.sub("é", "e", text)
  
    # Clean shorthands 
    text = re.sub("\'s", " ", text) # we have cases like "Sam is" or "Sam's" (i.e. his) these two cases aren't separable, I choose to compromise are kill "'s" directly
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub(r"(\W|^)([0-9]+)[kK](\W|$)", r"\1\g<2>000\3", text) # better regex provided by @armamut
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)
    text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)
    
    # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word "number"
    
    text = re.sub('[0-9]+\.[0-9]+', " 87 ", text)
    
    # remove comma between numbers, i.e. 15,000 -> 15000
    
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)

    # indian rupees
    text = re.sub("(?<=[0-9])rs ", " rs ", text, flags=re.IGNORECASE)
    text = re.sub(" rs(?=[0-9])", " rs ", text, flags=re.IGNORECASE)

    # the single 's' in this stage is 99% of not clean text, just kill it
    text = re.sub(' s ', " ", text)
    
    # reduce extra spaces into single spaces
    text = re.sub('[\s]+', " ", text)
    text = text.strip()

    return text


**Enity dictionary genetation using vote dictionary**

In [0]:
def vote_dictionary_generation(doc,vote_dict):

  # construct vote dictionary
  for token in doc:
    if token.lower_ not in vote_dict:
      vote_dict[token.lower_] = {}
    if token.ent_type_ not in vote_dict[token.lower_]:
      vote_dict[token.lower_][token.ent_type_] = 0
      # if the token has_vector is True, maybe we shouldn't record its 
      vote_dict[token.lower_][token.ent_type_] += 1 # TODO: not sure if storing in lowercase form is safe ?

def entity_lookup_generation(vote_dict,word_ent_type_dict,word_ent_type_second_dict):
  threshold_of_second_type = 3
  # vote for what should the type be   
  for key in vote_dict:
    # non-type has lower priority
    if '' in vote_dict[key]:
      vote_dict[key][''] = vote_dict[key][''] - 0.1

    ents = list(vote_dict[key].keys())
    bi_list = [
              ents,
              [vote_dict[key][ent] for ent in ents]
              ]
    # if several ent_type_ have same count, just let it go, making them share same ent_type_ is enough
    # TODO: if have time, can design a better metric to deal with second graded type
    sorted_idx = np.argsort(bi_list[1])
    if sorted_idx.shape[0]>1:
      best_idx = sorted_idx[-1]
      second_idx = sorted_idx[-2]
      word_ent_type_dict[key] = bi_list[0][best_idx]
      if bi_list[1][second_idx]>threshold_of_second_type:
        word_ent_type_second_dict[key] = bi_list[0][second_idx]
    else:
      best_idx = sorted_idx[-1]
      word_ent_type_dict[key] = bi_list[0][best_idx]

**Function to check type of token using Entity dictionary**

In [0]:
## This function is used in our case - Checks entity type of a token
def token_type_lookup(token, report_detail=False):
    
    if type(token)==str:
        token = nlp(token)[0]
        
    key = token.lower_
    
    try:
        if report_detail:
            print(ENTITY_ENUM[word_ent_type_dict[key]], ' <= ', {ENTITY_ENUM[ent_t] : vote_dict[key][ent_t] for ent_t in vote_dict[key]} )

        return word_ent_type_dict[key]
    
    except KeyError:
        return ''

def is_token_has_second_type(token):
    
    if type(token)==str:
        token = nlp(token)[0]
        
    key = token.lower_
    
    try:
        return key in word_ent_type_second_dict
    except KeyError:
        return False

def token_second_type_lookup(token, report_detail=False):
    
    if type(token)==str:
        token = nlp(token)[0]
        
    key = token.lower_
    
    try:
        if report_detail:
            print(ENTITY_ENUM[word_ent_type_second_dict[key]], ' <= ', {ENTITY_ENUM[ent_t] : vote_dict[key][ent_t] for ent_t in vote_dict[key]} )

        return word_ent_type_second_dict[key]
    except KeyError:
        return ''

**Advanced text cleaning**

In [0]:
exception_list =  set(['need']) # spaCy identifies need's lamma as 'ne', which is not we want
numeric_types = set(['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'])

def process_text_with_spacy(spacy_obj, debug=False, show_fail=False, idx=None):

    def not_alpha_or_digit(token):
        ch = token.text[0]
        return not (ch.isalpha() or ch.isdigit())
    
    result_word_list = []
    res = ''
    
    # for continuous entity type string, we need only single term. 
    # EX: "2017-01-01"
    # => "time time time" (X)
    # => "time" (O)
    previous_ent_type = None
    
    is_a_word_parsed_fail = False
    fail_words = []
    
    for token in spacy_obj:
        
        global_ent_type = token_type_lookup(token)
        
        # problematic token, use its base form
        if token.text in exception_list:
            
            previous_ent_type = None
            result_word_list.append(token.text)
            
        # skip none words tokens
        elif not_alpha_or_digit(token) or token.text==' ' or token.text=='s':
            previous_ent_type = None
            if debug: print(token.text, ' : remove punc or special chars')
            
            
        # if the "remove stop word" flag is set to True
        elif is_remove_stopwords and token.lemma_ in stopwords:
            previous_ent_type = None
            if debug: print(token.text, ' : remove stop word')
        
        
        # contiguous same type, skip it
        elif global_ent_type==previous_ent_type or token.ent_type_==previous_ent_type:
            if debug: print('contiguous same type')
        elif global_ent_type in NUMERIC_TYPES and previous_ent_type in NUMERIC_TYPES:
            if debug: print('contiguous numeric')
        elif token.ent_type_ in NUMERIC_TYPES and previous_ent_type in NUMERIC_TYPES:
            if debug: print('contiguous numeric')
                
        
        # number without an ent_type_
        elif token.text.isdigit():
            
            if debug: print(token.text, 'force to be number')
                
            if previous_ent_type in NUMERIC_TYPES:
                pass
            else:
                previous_ent_type = 'CARDINAL' # any number type would be okay
                result_word_list.append('number')

    
        # replace proper nouns into name entities. 
        # EX:
        # Original : Taiwan is next to China
        # Result   : country is next to country 
        elif global_ent_type!='':
            
            result_word_list.append(ENTITY_ENUM[global_ent_type])
            previous_ent_type = global_ent_type
            if debug: print(token.text, ' : sub ent_type:', ENTITY_ENUM[global_ent_type])
            
            
        # Identify if a word is proper noun or not, if it is a proper noun, we'll try to use second highest rated ent_type_
        #
        # A proper noun has following special patterns:
        #     1. its lemma_ (base form) returned by spaCy is just its lowercase form
        #     2. if one of its character except the first character is uppercase, it is a propernoun (in most cases)
        # except the special cases like "I LOVE YOU", we cal say that if (1.) and (2.), then the token is proper noun
        #
        # for cases like "Tensorflow", we have no good rule to identify it is a proper noun or not ... let's just move on
        elif token.lower_==token.lemma_ and token.text[1:]!=token.lemma_[1:] and is_token_has_second_type(token):
            second_type = token_second_type_lookup(token)
            result_word_list.append(ENTITY_ENUM[second_type])
            if debug: print(token.text, ' : use second ent_type:', ENTITY_ENUM[second_type])
            previous_ent_type = second_type
        
        
        # words arrive here are either "extremely common" or "extremely rare and has no method to deal with"
        else:
            # A weird behavior of SpaCy, it substitutes [I, my, they] into '-PRON-'
            if token.lemma_=='-PRON-':
                result_word_list.append(token.lower_)
                res = token.lower
                previous_ent_type = None
            
            # the lemma can be identified by GloVe
            elif nlp(token.lemma_)[0].has_vector:
                result_word_list.append(token.lemma_)
                res = token.lemma_
                previous_ent_type = None
            
            # the lemma cannot be identified, very probably a proper noun
            elif is_token_has_second_type(token):
                second_type = token_second_type_lookup(token)
                result_word_list.append(ENTITY_ENUM[second_type])
                res = ENTITY_ENUM[second_type]
                previous_ent_type = second_type
                if debug: print(token.text, ' : use second ent_type in else :', ENTITY_ENUM[second_type])
            
            # the lemma is not in glove and Spacy can't identify if it is a proper noun, last try, 
            #      if the word itself can be identified by GloVe or not
            elif nlp(token.lower_)[0].has_vector:
                result_word_list.append(token.lower_)
                res = token.lower_
                previous_ent_type = None
                if debug: print(token.text, ' : the token itself can be identified :', token.lower_)
            elif token.has_vector:
                result_word_list.append(token.text)
                res = token.text
                previous_ent_type = None
                if debug: print(token.text, ' : the token itself can be identified :', token.text)
                
            # Damn, I have totally no idea what's going on
            # You got to deal with it by yourself
            # In my case, I use fasttext to deal with it
            else:
                is_a_word_parsed_fail = True
                fail_words.append(token.text)
                previous_ent_type = None
                
                if debug: print(token.text, ' : can\'t identify, replace with "something"')
                
    
    if show_fail and is_a_word_parsed_fail:
        if idx!=None:
            print('At qid=', idx)
        print('Fail words: ', fail_words)
        print('Before:', spacy_obj.text)
        print('After: ', ' '.join(result_word_list))
        print('====================================================================')
    
    return np.array(result_word_list)

### **3.Input data (with NER) for Feature Engineering**


Entity dictionary generation

In [0]:
len(temp)

4946

In [0]:
## Generates vote dictionary which is used for generating Entity dictionary
import json
import time

# STEP 2: fill batch number according to your convinience

batch_size = 30

# STEP 3: batch wise processing
# batch no to be filled inside the range value
# tabulate the batch number according to your allocation of document to be processed
for i in range(0, 10):
  vote_dict = {}
  word_ent_type_dict = {}
  word_ent_type_second_dict = {} 
  max_length = 0

  start = i * batch_size
  end = start + batch_size - 1
  start_time = time.time()
  print("Start Target: ", start)
  print("End Target  : ", end)

  for i,row in temp.iterrows():
    if(i>=start and i <= end):
      print("vote dict, doc no: ", i, "of length: ", row['col_len'])
      for col in temp.columns:
        doc = nlp(clean_string(row[col]).lower())
        vote_dictionary_generation(doc, vote_dict)
  
  entity_lookup_generation(vote_dict, word_ent_type_dict, word_ent_type_second_dict)

  # STEP 4: change the path accordingly
  # would suggest not to change the naming convention
  with open('/content/drive/My Drive/Output/Dictionaries/word_ent_type_dict_' + str(start) + '_to_' + str(end) +'.json', 'w') as fp1:
      json.dump(word_ent_type_dict, fp1)
  
  end_time = time.time()
  print("\nword_ent_type_dict Write Successful in time : ", (end_time - start_time)/60 , " min")

  # try:
  df = pd.DataFrame()

  start_time = time.time()
  for i,row in temp.iterrows():
    summary = []
    full = []
    ori_full = []

    # For Summary
    if(i>=start and i <= end):
        print("summary, doc no: ", i, "of length: ", len((row["Summary"])[:-110]))
        for sent in sent_tokenize(clean_string((row["Summary"])[:-110]).lower()): 
            doc = nlp(sent)
            s = process_text_with_spacy(doc) # Advanced Text Cleaning
            res = " ".join(s)
            summary.append(res)
            
    # For Full text
    if(i>=start and i <= end):
        print("full, doc no   : ", i, "of length: ", len((row["Full"])[56:-468]))
        for sent in sent_tokenize(clean_string((row["Full"])[56:-468]).lower()):
            doc = nlp(sent)
            s = process_text_with_spacy(doc)
            res = " ".join(s)
            full.append(res)
            ori_full.append(sent)
      
    sum_join = "*".join(summary)
    full_join = "*".join(full)
    ori_full_join = "*".join(ori_full)

    df_obj = pd.DataFrame({"summary":[sum_join], "full":[full_join], "full_orignal":[ori_full_join], "Id":[i]})
    df = pd.concat([df,df_obj])

  # STEP 5: change path to the destination folder accordingly
  # would suggest not to change the naming convention
  (df.iloc[start:end+1]).to_pickle('/content/drive/My Drive/Output/A_Output/dataframe_processed_' + str(start) + '_to_' + str(end) +'.pickle')

  end_time = time.time()
  print("\nDataframe Write to Disk Successful in time: ", (end_time - start_time)/60 , " min")

Start Target:  3300
End Target  :  3329
vote dict, doc no:  3300 of length:  1054530
vote dict, doc no:  3301 of length:  205354
vote dict, doc no:  3302 of length:  56659
vote dict, doc no:  3303 of length:  938904
vote dict, doc no:  3304 of length:  47921
vote dict, doc no:  3305 of length:  234880
vote dict, doc no:  3306 of length:  379051
vote dict, doc no:  3307 of length:  1381096
vote dict, doc no:  3308 of length:  450390
vote dict, doc no:  3309 of length:  107397
vote dict, doc no:  3310 of length:  362229
vote dict, doc no:  3311 of length:  149925
vote dict, doc no:  3312 of length:  339443
vote dict, doc no:  3313 of length:  110654
vote dict, doc no:  3314 of length:  101069
vote dict, doc no:  3315 of length:  386371
vote dict, doc no:  3316 of length:  496528
vote dict, doc no:  3317 of length:  276797
vote dict, doc no:  3318 of length:  54437
vote dict, doc no:  3320 of length:  382891
vote dict, doc no:  3321 of length:  69176
vote dict, doc no:  3322 of length:  7

In [0]:
vote_dict

{'"': {'': 0.9},
 "'": {'': 0.9},
 '(': {'': 0.9},
 ')': {'': 0.9},
 ',': {'': 0.9, 'DATE': 1},
 '-': {'': 0.9},
 '.': {'': 0.9, 'GPE': 1},
 '..........': {'': 0.9},
 '.1973': {'': 0.9},
 '0': {'': 0.9},
 '1': {'': 0.9},
 '106': {'LAW': 1},
 '14': {'LAW': 1},
 '14(1': {'CARDINAL': 1, 'LAW': 1},
 '14(2': {'DATE': 1, 'LAW': 1},
 '148': {'CARDINAL': 1},
 '14th': {'DATE': 1},
 '15': {'LAW': 1},
 '15/-': {'': 0.9, 'CARDINAL': 1},
 '1882': {'DATE': 1},
 '1973': {'DATE': 1},
 '1974': {'DATE': 1},
 '1975': {'DATE': 1},
 '1976': {'DATE': 1},
 '2': {'CARDINAL': 1},
 '3': {'CARDINAL': 1},
 '4': {'CARDINAL': 1},
 '44': {'CARDINAL': 1},
 '45': {'DATE': 1},
 '46': {'CARDINAL': 1},
 '5': {'': 0.9},
 '6th': {'ORDINAL': 1},
 '87': {'CARDINAL': 1},
 '966': {'CARDINAL': 1},
 ':': {'': 0.9},
 ';': {'': 0.9},
 '[': {'': 0.9},
 ']': {'': 0.9},
 '^': {'': 0.9},
 'a': {'': 0.9},
 'a-54': {'': 0.9},
 'above': {'': 0.9},
 'absent': {'': 0.9},
 'accept': {'': 0.9},
 'accordance': {'': 0.9},
 'account': {'': 0.9}

### **4. Input data (without NER) for Feature Engineering**



In [0]:
# Just doing basic cleaning on text like removing weird characters, expanding contractions, removing spaces

try:
  df = pd.DataFrame()
  for i,row in temp.iterrows():
    if i % 500 == 0:
      print(i)
    summary = []
    full = []
    # For Summary
    for sent in sent_tokenize(clean_string(row["summary"]).lower()): # Clean_string - Basic Text Cleaning
        summary.append(sent)
    # For Full text
    for sent in sent_tokenize(clean_string(row["full"]).lower()):
        full.append(sent)
    
    sum_join = "*".join(summary)
    full_join = "*".join(full)

    df_obj = pd.DataFrame({"summary":[sum_join],"full":[full_join],"Id":[i]})
    # This data frame consists of Text whose format is like raw original dataframe ( summary and full ) 
    # but is processed to clean format
    df = pd.concat([df,df_obj])
except Exception as e:
  print("saving df of length",len(df))
  print(e)

0
500
1000
1500
2000
2500


In [0]:
df.head(1)

Unnamed: 0,summary,full,Id
0,s and b were sons of two brothers respectively...,civil appeal no.*8 of 1951. appeal from the ju...,0


In [0]:
df.to_csv("/content/drive/My Drive/Data/Anadi&Srijans_data.csv")