# Importing Libraries

In [2]:
#importing spacy
import spacy
# NOTE(review): en_core_web_sm is not referenced by name in the visible cells;
# the model is loaded through spacy.load below.
import en_core_web_sm
# Shared pipeline object: the helper functions and demo cells below all call nlp(...).
nlp = spacy.load('en_core_web_sm')

In [3]:
#importing neuralcoref
import neuralcoref
# Build the coreference component from the pipeline's vocabulary and register it
# on `nlp` under the name 'neuralcoref'. Later cells read doc._.coref_clusters
# and doc._.coref_resolved from documents produced by `nlp`.
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

# Splitting sentences that contain 'that'

In [3]:
#importing spacy symbols for nouns and verbs
from spacy.symbols import nsubj, VERB
#function to check whether the sentence contains that followed by a verb
def checkThat(text):
    """Tell whether `text` contains the word 'that' directly followed by a verb.

    The text is parsed with the shared `nlp` pipeline and scanned token by
    token. The comparison with 'that' is case-sensitive.

    Returns True as soon as a token tagged VERB comes right after a 'that'
    token, otherwise False.
    """
    after_that = False          # True while the previous token was 'that'
    for token in nlp(text):
        if token.text == 'that':
            after_that = True
            continue
        if after_that and token.pos == VERB:
            return True
        # any other token breaks the "'that' + verb" adjacency
        after_that = False
    return False

In [4]:
#function to break sentence containing 'that' into simpler sentences
def resolveThat(text):
    """Break a sentence containing 'that' into two simpler sentences.

    The words before 'that' form the first sentence (terminated with '.').
    The noun chunk closest before 'that' takes the place of 'that' as the
    start of the second sentence, e.g.
    'He bought the shoes that were in the shop window' ->
    ' He bought the shoes. the shoes were in the shop window'.

    Parameters:
        text: the sentence (or short text) to rewrite.

    Returns:
        The rewritten text. The result starts with a single blank and has its
        whitespace collapsed to single spaces. `text` is returned unchanged
        when it has no standalone 'that' token or no noun chunk before it.
    """
    doc = nlp(text)

    # Work on spaCy tokens so that 'that' inside another word (e.g. 'thatched')
    # is never taken for the relative pronoun.
    that_tokens = [tok for tok in doc if tok.text == 'that']
    if not that_tokens:
        return text

    # Prefer the 'that' which is directly followed by a verb, i.e. the one
    # checkThat() reacts to; fall back to the first 'that' otherwise.
    pivot = that_tokens[0]
    for tok in that_tokens:
        if tok.i + 1 < len(doc) and doc[tok.i + 1].pos_ == 'VERB':
            pivot = tok
            break

    # noun_chunks are yielded in document order, so the last chunk that ends
    # before 'that' is the closest one.
    antecedent = None
    for chunk in doc.noun_chunks:
        if chunk.end_char <= pivot.idx:
            antecedent = chunk.text
    if antecedent is None:
        # nothing to substitute for 'that'; leave the sentence alone
        return text

    # Split only at the chosen 'that'; any later 'that' stays in the second part.
    head_words = text[:pivot.idx].split()
    tail_words = text[pivot.idx + len(pivot.text):].split()

    sent1 = ''.join(' ' + word for word in head_words) + '.'
    sent2 = antecedent + ''.join(' ' + word for word in tail_words)
    return sent1 + ' ' + sent2

In [5]:
text='He bought the shoes that were in the shop window. They were very pretty'
# Fall back to the untouched text so that `sent` is always defined, even when
# the sentence does not contain "'that' + verb" (previously a NameError, or a
# stale value left over from an earlier run).
sent=resolveThat(text) if checkThat(text) else text
print(sent)
doc=nlp(sent)
doc._.coref_clusters

 He bought the shoes. the shoes were in the shop window. They were very pretty


[the shoes: [the shoes, the shoes, They]]

# Removing some stopwords(Generally,However,So)

In [6]:
import string
#function to remove the stopwords
def remStopwords(text):
    """Remove the filler tokens 'So,', 'Generally,' and 'However,' from `text`.

    The text is split on whitespace and rejoined with single blanks, so runs of
    whitespace are collapsed and leading/trailing whitespace disappears.
    Matching is exact: case and the trailing comma both matter.
    """
    fillers = ('So,', 'Generally,', 'However,')
    kept = []
    for token in text.split():
        if token in fillers:
            continue
        kept.append(token)
    return ' '.join(kept)

In [7]:
#function to check for the existence of exactly two commas in the text
#However, it may also match (and so break) sentences containing a list of items separated by commas
def checkMultipleComma(text):
    """Return True when `text` contains exactly two commas, otherwise False."""
    return text.count(',') == 2

In [8]:
def removeExtraInfo(text):
    """Drop the text enclosed between pairs of commas, commas included.

    Commas act as a toggle: everything from an odd-numbered comma up to the
    next comma is discarded, and the commas themselves are never kept.
    Splitting on ',' and keeping the even-indexed pieces gives that result.
    """
    pieces = text.split(',')
    return ''.join(pieces[::2])


In [9]:
# Demo: strip the filler word first, then drop the clause enclosed by the two
# commas, and finally look at the coreference clusters of what is left.
text=remStopwords('Generally, swap space is allocated as a chunk of disk, separate from the file system, so that its use is as fast as possible.')
if checkMultipleComma(text):
    text=removeExtraInfo(text)
doc=nlp(text)
doc._.coref_clusters

[swap space: [swap space, its]]

# Resolving 'Because', 'As', ... at the beginning of the sentence

In [10]:
#function to check if the sentence starts with 'Because','As',etc
# Words treated as the opener of an introductory clause (matched case-sensitively).
# NOTE(review): 'Awhile' may have been meant as 'While' - confirm before changing.
intro_words=['Because','However','Although','As','After','Awhile']
def checkIntroductoryWords(text):
    """Return True when the first whitespace-separated word of `text` is in intro_words.

    Empty or whitespace-only text yields False.
    """
    tokens = text.split()
    if not tokens:
        return False
    return tokens[0] in intro_words

In [11]:
#function to remove the introductory words (e.g. 'Because') and turn every comma into a full stop
def remIntroductoryWords(sent):
    """Drop every word listed in intro_words and turn each ',' into '.'.

    The sentence is split on whitespace; the surviving words are written back
    each followed by one blank, so the result ends with a trailing space
    whenever at least one word is kept.
    """
    kept = [token for token in sent.split() if token not in intro_words]
    rebuilt = ''.join(token + ' ' for token in kept)
    # the comma that closed the introductory clause now ends a sentence
    return rebuilt.replace(',', '.')

In [12]:
# Demo: split the introductory 'Because ...' clause off into its own sentence
# before asking for the coreference clusters.
text='Because my coffee was too cold, I heated it in the microwave. '
text=remIntroductoryWords(text) if checkIntroductoryWords(text) else text
doc=nlp(text)
doc._.coref_clusters

[my coffee: [my coffee, it]]

# Reading the data and processing it

In [21]:
#opening file and reading from it
# The context manager closes the file as soon as it has been read; the bare
# open() used before left the handle open for the lifetime of the kernel.
with open('os_chap1.txt','r') as f:
    text=f.read()

In [22]:
#splitting into paragraphs
# One entry per line of the file; blank lines end up as empty strings in `paras`.
paras=text.split('\n')
#print(paras)

In [23]:
#splitting each para into sentences and processing it
# NOTE(review): sent_tokenize is imported but not used; sentences are obtained
# with a plain split on '.', which also splits at abbreviations and decimals.
from nltk.tokenize import sent_tokenize
processed_paras=[]
for para in paras:
    sentences=para.split('.')
    print("\n")
    processed=[]            #rewritten sentences of this paragraph
    #processing the individual sentences in a paragraph
    #(the loop variable is not called `text`, so the file contents held in the
    # notebook-level `text` are no longer overwritten by the last sentence)
    for sentence in sentences:
        #skip empty and whitespace-only fragments, which would otherwise
        #show up as a stray ' .' in the output
        if not sentence.strip():
            continue
        sentence=remStopwords(sentence)
        if not sentence:
            #nothing left once the filler word has been removed
            continue
        #at most one rewrite is applied per sentence, in this order of priority
        if checkThat(sentence):
            sentence=resolveThat(sentence)
        elif checkMultipleComma(sentence):
            sentence=removeExtraInfo(sentence)
        elif checkIntroductoryWords(sentence):
            sentence=remIntroductoryWords(sentence)
        processed.append(sentence+'.')
    #every sentence is preceded by one blank, as before
    new_para=''.join(' '+sentence for sentence in processed)
    print(new_para)
    processed_paras.append(new_para)



  An operating system is a program. a program manages a computer’s hardware. It also provides a basis for application programs and acts as an intermediary between the computer user and the computer hardware. An amazing aspect of operating systems is how they vary in accomplishing these tasks. Mainframe operating systems are designed primarily to optimize utilization of hardware. Personal computer (PC) operating systems support complex games and everything in between. Operating systems for mobile com- puters provide an environment in which a user can easily interface with the computer to execute programs. Thus, some operating systems are designed to be convenient, others to be efficient, and others to be some combination of the two.


 Before we can explore the details of computer system operation, we need to know something about system structure. We thus discuss the basic functions of system startup and storage early in this chapter.  We also describe the basic computer architecture.

In [24]:
#coref resolution after processing the text
# Concatenation of all processed paragraphs, analysed as one document further down.
full_text=''.join(processed_paras)
# Clusters paragraph by paragraph; `doc` keeps the last paragraph's document afterwards.
for para in processed_paras:
    doc=nlp(para)
    print(doc._.coref_clusters)
    #print("\n")

[a program: [a program, It], operating systems: [operating systems, they]]
[we: [we, we, We, We], the basic computer architecture: [the basic computer architecture, it]]
[ an operating system: [ an operating system, it, the system], we: [we, we]]
[We: [We, our]]
[the system: [the system, The operating system, its]]
[a computer system: [a computer system, The operating system], An operating system: [An operating system, it, itself, It]]
[]
[]
[]
[]
[]
[Other users: [Other users, These users]]
[users: [users, These users, their, they, their], Most mobile computers: [Most mobile computers, they]]
[]
[ Some computers: [ Some computers, they, their]]
[A computer system: [A computer system, The operating system, the operating system, it, the computer system], many resources: [many resources, these resources], numerous and possibly conflicting requests for resources: [numerous and possibly conflicting requests for resources, them]]
[]
[]
[]


In [25]:
# `doc` is whatever the per-paragraph loop in the previous cell left behind,
# i.e. the document of the last paragraph only - not the whole text.
doc._.coref_resolved

''

In [26]:
# Run the pipeline once over the concatenated processed paragraphs (full_text)
# instead of paragraph by paragraph.
doc=nlp(full_text)
print(doc._.coref_clusters)

[a program: [a program, It], operating systems: [operating systems, they], we: [we, we, We, We], this chapter: [this chapter, this chapter], the basic computer architecture: [the basic computer architecture, it], an operating system: [an operating system, it, the system], we: [we, we, We, our], the operating system: [the operating system, the operating system, The operating system, its, The operating system, the operating system, the system], A computer system: [A computer system, the system], An operating system: [An operating system, it, itself, It], Such a system: [Such a system, its], Other users: [Other users, These users, These users, their, they, their], Most mobile computers: [Most mobile computers, they], the system: [the system, the operating system], Some computers: [Some computers, they, their], the computer: [the computer, the computer], A computer system: [A computer system, The operating system, the operating system, it, the computer system], many resources: [many resour

In [27]:
# Resolved text for the document built from full_text in the previous cell.
doc._.coref_resolved

'  An operating system is a program. a program manages a computer’s hardware. a program also provides a basis for application programs and acts as an intermediary between the computer user and the computer hardware. An amazing aspect of operating systems is how operating systems vary in accomplishing these tasks. Mainframe operating systems are designed primarily to optimize utilization of hardware. Personal computer (PC) operating systems support complex games and everything in between. Operating systems for mobile com- puters provide an environment in which a user can easily interface with the computer to execute programs. Thus, some operating systems are designed to be convenient, others to be efficient, and others to be some combination of the two. Before we can explore the details of computer system operation, we need to know something about system structure. we thus discuss the basic functions of system startup and storage early in this chapter.  we also describe the basic comput

In [19]:
#coref resolution without processing the text
# Baseline: the raw file contents go through the pipeline without any of the
# sentence rewrites above. The context manager closes the file after reading;
# the bare open() used before never closed the handle.
with open('os_chap1.txt','r') as f:
    text=f.read()
doc=nlp(text)
doc._.coref_clusters

[An operating system: [An operating system, It],
 operating systems: [operating systems, they],
 we: [we, we, We, We],
 this chapter: [this chapter, this chapter],
 an operating system: [an operating system, it, the system],
 we: [we, we, We, our],
 the operating system: [the operating system, the operating system, The operating system, its, The operating system, the operating system, the system, the operating system],
 A computer system: [A computer system, the system],
 An operating system: [An operating system, it, itself, It],
 Such a system: [Such a system, its],
 Other users: [Other users, These users, These users, their, they, their],
 O: [O, her],
 Most mobile computers: [Most mobile computers, they],
 embedded computers in home devices and automobiles: [embedded computers in home devices and automobiles, they, their],
 the computer: [the computer, the computer],
 A computer system: [A computer system, The operating system, the operating system, it, the computer system],
 many 

In [20]:
# Resolved text for the unprocessed document created in the previous cell.
doc._.coref_resolved

'An operating system is a program that manages a computer’s hardware. An operating system also provides a basis for application programs and acts as an intermediary between the computer user and the computer hardware. An amazing aspect of operating systems is how operating systems vary in accomplishing these tasks. Mainframe operating systems are designed primarily to optimize utilization of hardware. Personal computer (PC) operating systems support complex games, business applications, and everything in between. Operating systems for mobile com- puters provide an environment in which a user can easily interface with the computer to execute programs. Thus, some operating systems are designed to be convenient, others to be efficient, and others to be some combination of the two.\nBefore we can explore the details of computer system operation, we need to know something about system structure. we thus discuss the basic functions of system startup, I/O, and storage early in this chapter.  

# Resolving text containing 'it'

In [21]:
#function to check whether a sentence uses an impersonal 'it' (weather, weekday or clock expressions)
def checkImpersonalVerb(sent):
    """Return True when `sent` contains a weather, weekday or clock word.

    Such sentences use an impersonal 'it' ('It is raining', 'It is Sunday',
    "It is 7 o'clock"), so the pronoun must not be resolved.

    Words are compared case-insensitively and with surrounding punctuation
    removed, so 'Raining', 'sunday,' and "o'clock." are all recognised.

    Parameters:
        sent: the sentence to inspect.

    Returns:
        True if any word of the sentence is one of the trigger words,
        otherwise False (also for an empty sentence).
    """
    i_verb=frozenset((
        'raining','snowing','cloudy','sunny','clock',"o'clock",
        'sunday','monday','tuesday','wednesday','thursday','friday','saturday',
    ))
    for word in sent.split():
        # strip punctuation glued to the word, then ignore case
        if word.strip('.,;:!?"\'()').lower() in i_verb:
            return True     # such sentences are not resolved
    return False

In [42]:
def checkEmphasizer(sent):
    """Return True for an emphasising 'It ... who ...' sentence.

    The sentence is split on single blanks. It qualifies when its very first
    piece is exactly 'It' and the word 'who' occurs among the pieces.
    """
    pieces = sent.split(' ')
    if pieces[0] != 'It':
        return False
    return 'who' in pieces

In [38]:
def checkProvisionalSubject(sent):
    """Return True when the word 'to' occurs in `sent` (split on single blanks).

    Used as a marker for sentences such as 'It is not easy to defeat him',
    where 'It' is a provisional subject.
    """
    return 'to' in sent.split(' ')

In [50]:
def checkObj(sent):
    """Return the first word of `sent` that names a known animal/baby/object.

    The returned word is what 'it' in the following sentence gets replaced
    with. When no listed word occurs, the string 'no' is returned.
    """
    objects=('baby','dog','cow','cat','horse','bottle','glass','meat','jar','box','meat','mug','milk')  #there can be more such words
    # first match in sentence order, or the sentinel 'no'
    return next((token for token in sent.split() if token in objects), 'no')

In [54]:
text='I saw a baby crying.It must be hungry'
sentences=text.split('.')
new_text=''                                  #to store the new text after replacing 'it'
for i,sent in enumerate(sentences):          #i is the position of sent, used to reach the previous sentence
    if sent=='':                             #empty fragments (e.g. after a trailing '.') add nothing
        continue
    body=sent.lstrip()                       #ignore the blank that usually follows a full stop
    lead=sent[:len(sent)-len(body)]          #leading whitespace, kept for the output
    words=body.split(' ')                    #splitting the sentence into words
    new_sent=sent+'.'                        #default: keep the sentence as it is

    if words[0]=='It':                       #only sentences starting with 'It' are candidates
        keep_it=False                        #set when 'It' must stay untouched

        if checkImpersonalVerb(body):        #e.g. It is raining, It is 7 o'clock, It is Sunday
            print('Impersonal verb,No need to resolve')
            keep_it=True

        if checkProvisionalSubject(body):    #e.g. It is not easy to defeat him
            print('Provisional subject,No need to resolve')
            keep_it=True

        if checkEmphasizer(body):            #e.g. It was Suzie who stole the bread
            print('Emphasizer,No need to resolve')
            keep_it=True

        #Replace 'It' only when none of the checks above fired and a previous
        #sentence exists (for i==0, sentences[i-1] would be the LAST sentence).
        if not keep_it and i>0:
            obj=checkObj(sentences[i-1])     #animal/baby/object mentioned in the previous sentence, or 'no'
            if obj!='no':
                new_sent=lead+'The '+obj+' '+' '.join(words[1:])+'.'   #replacing 'It' with the reference found

    new_text=new_text+new_sent               #add the new (or unchanged) sentence to the result
print(new_text)

I saw a baby crying.The baby must be hungry.
