In [49]:
import spacy
from nltk import tokenize, word_tokenize, sent_tokenize
from nltk.util import ngrams
import pandas as pd
import stanza
from spacy.symbols import nsubj, VERB
from spacy import displacy
import textacy
from nltk.lm.preprocessing import padded_everygram_pipeline


nlp = spacy.load("en_core_web_sm")
nlp_stanford = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

2022-02-16 12:59:36 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

2022-02-16 12:59:36 INFO: Use device: cpu
2022-02-16 12:59:36 INFO: Loading: tokenize
2022-02-16 12:59:36 INFO: Loading: pos
2022-02-16 12:59:36 INFO: Loading: lemma
2022-02-16 12:59:36 INFO: Loading: depparse
2022-02-16 12:59:38 INFO: Done loading processors!


In [164]:
with open('test.txt') as test:
    sentences = test.readlines()

for content in sentences:
    content.replace('-', ' ')
    print(content)

split_content = tokenize.sent_tokenize(content)
#print(split_content)
tokenized_content = word_tokenize(content)


## preprocessing some small things, such as punctuation inbetween a word. Widely-spoken causes the problem that spacy wants to split this, which results in problems when adding it to the dataframe.

# create dataframe where new data can be stored:
new_df = pd.DataFrame(data=tokenized_content, columns=['Tokens'])

## Token bigram extractiong:
text = 'test.txt'

with open (text, 'r') as infile:
    text = infile.readlines()
    #print(text)
    for sentences in text:
        #print(type(sentences))
        word_tokenize(sentences)
              
        tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                          for sent in sent_tokenize(sentences)]
              
# Set the n-gram size
#n = 2

# Let us check what is happening on a subset: 
ngram_data, padded = padded_everygram_pipeline(2, tokenized_text)

# What is happening during padding? 
# print("PADDING:")
# print(list(padded))

# What kind of ngrams do we get? 
bigram_complete = []
#print("\n\nNGRAMS:")
for ngrams in ngram_data: 
    ngram_list = list(ngrams)
    ngram_list = [x for x in ngram_list if len(x)==2 ]
    bigram_complete.append(ngram_list)

bigram_complete_l = [x[:-1] for x in bigram_complete]    
bigram_complete_r = [x[1:] for x in bigram_complete]

#print(bigram_complete)
#     print(ngram_list)
#     print()
    
bigram_token_previous = []
for item in bigram_complete_l:
    #print(item) 
    for ele in item:
        #print(ele)
        bigram_token_previous.append(ele)
        
bigram_token_next = []
for item in bigram_complete_r:
    #print(item) 
    for ele in item:
        #print(ele)
        bigram_token_next.append(ele)
#print(bigram_token)


new_df['bigram token previous'] = bigram_token_previous
new_df['bigram token next'] = bigram_token_next


## Token trigram extraction:

text = 'test.txt'
with open (text, 'r') as infile:
    text = infile.readlines()
    #print(text)
    for sentences in text:
        #print(type(sentences))
        word_tokenize(sentences)
              
        tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                          for sent in sent_tokenize(sentences)]
              
# Let us check what is happening on a subset: 
ngram_data, padded = padded_everygram_pipeline(3, tokenized_text)

# What is happening during padding? 
# print("PADDING:")
# print(list(padded))

# What kind of ngrams do we get? 
trigram_complete = []
for ngrams in ngram_data: 
    ngram_list = list(ngrams)
    ngram_list = [x for x in ngram_list if len(x)==3]
    trigram_complete.append(ngram_list)

    
trigram_complete_l = [x[:-2] for x in trigram_complete]   
trigram_complete_r = [x[2:] for x in trigram_complete]

#print(bigram_complete)
#     print(ngram_list)
#     print()
    
trigram_token_previous = []
for item in trigram_complete_l:
    #print(item) 
    for ele in item:
        #print(ele)
        trigram_token_previous.append(ele)
        
trigram_token_next = []
for item in trigram_complete_r:
    #print(item) 
    for ele in item:
        #print(ele)
        trigram_token_next.append(ele)

new_df['trigram token previous'] = trigram_token_previous
new_df['trigram token next'] = trigram_token_next


# Extract dependencies:
d_dep = []
for word in split_content:
    doc = nlp(word)
    dependency_list = []
    for item in doc:
        dependency_list.append(item.dep_)
    d_dep.append(dependency_list)


list_dep = []
for item in d_dep:
    for dep in item:
        list_dep.append(dep)

## extraction of previous and next bigrams for dependency:

ngram_data, padded = padded_everygram_pipeline(2, d_dep)

# What is happening during padding? 
# print("PADDING:")
# print(list(padded))

# What kind of ngrams do we get? 
bigram_complete = []
for ngrams in ngram_data: 
    ngram_list = list(ngrams)
    ngram_list = [x for x in ngram_list if len(x)==2 ]
    bigram_complete.append(ngram_list)

bigram_complete_l = [x[:-1] for x in bigram_complete]    
bigram_complete_r = [x[1:] for x in bigram_complete]

#print(bigram_complete)
#     print(ngram_list)
#     print()
    
bigram_dep_previous = []
for item in bigram_complete_l:
    #print(item) 
    for ele in item:
        #print(ele)
        bigram_dep_previous.append(ele)
        
bigram_dep_next = []
for item in bigram_complete_r:
    #print(item) 
    for ele in item:
        #print(ele)
        bigram_dep_next.append(ele)
#print(bigram_token)

new_df['Dependency'] = list_dep #--> store dependencies in the new_df dataframe.
new_df['bigram dep previous'] = bigram_dep_previous
new_df['bigram dep next'] = bigram_dep_next


# extraction of previous and next trigrams for dependency:

# Let us check what is happening on a subset: 
ngram_data, padded = padded_everygram_pipeline(3, d_dep)

# What is happening during padding? 
# print("PADDING:")
# print(list(padded))

# What kind of ngrams do we get? 
trigram_complete = []
for ngrams in ngram_data: 
    ngram_list = list(ngrams)
    ngram_list = [x for x in ngram_list if len(x)==3]
    trigram_complete.append(ngram_list)

    
trigram_complete_l = [x[:-2] for x in trigram_complete]   
trigram_complete_r = [x[2:] for x in trigram_complete]

#print(bigram_complete)
#     print(ngram_list)
#     print()
    
trigram_dep_previous = []
for item in trigram_complete_l:
    #print(item) 
    for ele in item:
        #print(ele)
        trigram_dep_previous.append(ele)
        
trigram_dep_next = []
for item in trigram_complete_r:
    #print(item) 
    for ele in item:
        #print(ele)
        trigram_dep_next.append(ele)

new_df['trigram dep previous'] = trigram_dep_previous
new_df['trigram dep next'] = trigram_dep_next

# Extract the head word of each node:
d = []
for word in split_content:
    doc = nlp(word)
    dependency_list = []
    for item in doc:
        dependency_list.append(item.head)
    d.append(dependency_list)

list_parent = []
for item in d:
    #print(item)
    for parent in item:
        list_parent.append(parent)
new_df['Direct Parent'] = list_parent


#Extract the POS tag of each head of the token, to show in what kind of phrasal structure it is.

d = []
for word in split_content:
    doc = nlp(word)
    dependency_list = []
    for item in doc:
        dependency_list.append(item.head.pos_)
    d.append(dependency_list)

list_parent = []
for item in d:
    #print(item)
    for parent in item:
        list_parent.append(parent)
new_df['POS Parent'] = list_parent


## Extracting Pos tag of the token itself:

d_pos = []
for word in split_content:
    doc = nlp(word)
    pos_list = []
    for item in doc:
        pos_list.append(item.pos_)
    d_pos.append(pos_list)

list_pos = []
for item in d_pos:
    #print(item)
    for pos in item:
        list_pos.append(pos)
new_df['POS Token'] = list_pos

## extraction of previous and next bigrams for pos tag:

ngram_data, padded = padded_everygram_pipeline(2, d_pos)

# What is happening during padding? 
# print("PADDING:")
# print(list(padded))

# What kind of ngrams do we get? 
bigram_complete = []
for ngrams in ngram_data: 
    ngram_list = list(ngrams)
    ngram_list = [x for x in ngram_list if len(x)==2 ]
    bigram_complete.append(ngram_list)

bigram_complete_l = [x[:-1] for x in bigram_complete]    
bigram_complete_r = [x[1:] for x in bigram_complete]

#print(bigram_complete)
#     print(ngram_list)
#     print()
    
bigram_pos_previous = []
for item in bigram_complete_l:
    #print(item) 
    for ele in item:
        #print(ele)
        bigram_pos_previous.append(ele)
        
bigram_pos_next = []
for item in bigram_complete_r:
    #print(item) 
    for ele in item:
        #print(ele)
        bigram_pos_next.append(ele)
#print(bigram_token)
new_df['bigram pos previous'] = bigram_pos_previous
new_df['bigram pos next'] = bigram_pos_next


# extraction of previous and next trigrams for pos:

# Let us check what is happening on a subset: 
ngram_data, padded = padded_everygram_pipeline(3, d_pos)

# What is happening during padding? 
# print("PADDING:")
# print(list(padded))

# What kind of ngrams do we get? 
trigram_complete = []
for ngrams in ngram_data: 
    ngram_list = list(ngrams)
    ngram_list = [x for x in ngram_list if len(x)==3]
    trigram_complete.append(ngram_list)

    
trigram_complete_l = [x[:-2] for x in trigram_complete]   
trigram_complete_r = [x[2:] for x in trigram_complete]

#print(bigram_complete)
#     print(ngram_list)
#     print()
    
trigram_pos_previous = []
for item in trigram_complete_l:
    #print(item) 
    for ele in item:
        #print(ele)
        trigram_pos_previous.append(ele)
        
trigram_pos_next = []
for item in trigram_complete_r:
    #print(item) 
    for ele in item:
        #print(ele)
        trigram_pos_next.append(ele)

new_df['trigram pos previous'] = trigram_pos_previous
new_df['trigram pos next'] = trigram_pos_next


#Extract the children of each token. Will provide an empty list when the token has no child beneath it.
d = []
for word in split_content:
    doc = nlp(word)
    children_list = [list(t.children) for t in doc]
    #print(children_list)
    d.append(children_list)
    
list_children = []
for item in d:
    #print(item)
    for child in item:        
        list_children.append(child)
new_df['Dependents'] = list_children

#WE GET THE FULL CONSTITUENTS OF EACH HEAD

d = []
for word in split_content:
    doc = nlp(word)
    constituents_list = [list(t.subtree) for t in doc]
    d.append(constituents_list)

list_constituents = []
for item in d:
    #print(item)
    for constituent in item:        
        list_constituents.append(constituent)
new_df['Constituents'] = list_constituents


display(new_df.head(20))

# for word in split_content:
#     doc = nlp(word)
#     displacy.serve(doc, style='dep')

# convert pandas df to .conll file (uncomment when ready to use)
outputfile = "test_dependency.conll"
new_df.to_csv(f'c:/Users/desir/Desktop/text_mining/applied TM/nlp technology/{outputfile}', sep='\t', header=True, quotechar='|', index=False)





A language is a structured system of communication used by humans. Languages can be based on speech and gesture, sign, or writing. The structure of language is its grammar and the free components are its vocabulary. Many languages, including the most widely spoken ones, have writing systems that enable sounds or signs to be recorded for later reactivation. Human language is unique among the known systems of animal communication in that it is not dependent on a single mode of transmission, is highly variable between cultures and across time, and affords a much wider range of expression than other systems.


Unnamed: 0,Tokens,bigram token previous,bigram token next,trigram token previous,trigram token next,Dependency,bigram dep previous,bigram dep next,trigram dep previous,trigram dep next,Direct Parent,POS Parent,POS Token,bigram pos previous,bigram pos next,trigram pos previous,trigram pos next,Dependents,Constituents
0,A,"(<s>, a)","(a, language)","(<s>, <s>, a)","(a, language, is)",det,"(<s>, det)","(det, nsubj)","(<s>, <s>, det)","(det, nsubj, ROOT)",language,NOUN,DET,"(<s>, DET)","(DET, NOUN)","(<s>, <s>, DET)","(DET, NOUN, AUX)",[],[A]
1,language,"(a, language)","(language, is)","(<s>, a, language)","(language, is, a)",nsubj,"(det, nsubj)","(nsubj, ROOT)","(<s>, det, nsubj)","(nsubj, ROOT, det)",is,AUX,NOUN,"(DET, NOUN)","(NOUN, AUX)","(<s>, DET, NOUN)","(NOUN, AUX, DET)",[A],"[A, language]"
2,is,"(language, is)","(is, a)","(a, language, is)","(is, a, structured)",ROOT,"(nsubj, ROOT)","(ROOT, det)","(det, nsubj, ROOT)","(ROOT, det, amod)",is,AUX,AUX,"(NOUN, AUX)","(AUX, DET)","(DET, NOUN, AUX)","(AUX, DET, ADJ)","[language, system, .]","[A, language, is, a, structured, system, of, c..."
3,a,"(is, a)","(a, structured)","(language, is, a)","(a, structured, system)",det,"(ROOT, det)","(det, amod)","(nsubj, ROOT, det)","(det, amod, attr)",system,NOUN,DET,"(AUX, DET)","(DET, ADJ)","(NOUN, AUX, DET)","(DET, ADJ, NOUN)",[],[a]
4,structured,"(a, structured)","(structured, system)","(is, a, structured)","(structured, system, of)",amod,"(det, amod)","(amod, attr)","(ROOT, det, amod)","(amod, attr, prep)",system,NOUN,ADJ,"(DET, ADJ)","(ADJ, NOUN)","(AUX, DET, ADJ)","(ADJ, NOUN, ADP)",[],[structured]
5,system,"(structured, system)","(system, of)","(a, structured, system)","(system, of, communication)",attr,"(amod, attr)","(attr, prep)","(det, amod, attr)","(attr, prep, pobj)",is,AUX,NOUN,"(ADJ, NOUN)","(NOUN, ADP)","(DET, ADJ, NOUN)","(NOUN, ADP, NOUN)","[a, structured, of]","[a, structured, system, of, communication, use..."
6,of,"(system, of)","(of, communication)","(structured, system, of)","(of, communication, used)",prep,"(attr, prep)","(prep, pobj)","(amod, attr, prep)","(prep, pobj, acl)",system,NOUN,ADP,"(NOUN, ADP)","(ADP, NOUN)","(ADJ, NOUN, ADP)","(ADP, NOUN, VERB)",[communication],"[of, communication, used, by, humans]"
7,communication,"(of, communication)","(communication, used)","(system, of, communication)","(communication, used, by)",pobj,"(prep, pobj)","(pobj, acl)","(attr, prep, pobj)","(pobj, acl, agent)",of,ADP,NOUN,"(ADP, NOUN)","(NOUN, VERB)","(NOUN, ADP, NOUN)","(NOUN, VERB, ADP)",[used],"[communication, used, by, humans]"
8,used,"(communication, used)","(used, by)","(of, communication, used)","(used, by, humans)",acl,"(pobj, acl)","(acl, agent)","(prep, pobj, acl)","(acl, agent, pobj)",communication,NOUN,VERB,"(NOUN, VERB)","(VERB, ADP)","(ADP, NOUN, VERB)","(VERB, ADP, NOUN)",[by],"[used, by, humans]"
9,by,"(used, by)","(by, humans)","(communication, used, by)","(by, humans, .)",agent,"(acl, agent)","(agent, pobj)","(pobj, acl, agent)","(agent, pobj, punct)",used,VERB,ADP,"(VERB, ADP)","(ADP, NOUN)","(NOUN, VERB, ADP)","(ADP, NOUN, PUNCT)",[humans],"[by, humans]"


FileNotFoundError: [Errno 2] No such file or directory: 'c:/Users/desir/Desktop/text_mining/applied TM/nlp technology/test_dependency.conll'

In [11]:

# new_d = []
# for sentences in split_content:
#     #print(sentences)
#     doc_stanford= nlp_stanford(sentences)
#     #print(doc_stanford)
#     new_d.append(doc_stanford)
# #print(new_d)



#for word in new_d:
   # print(word)

In [148]:
# text = 'test.txt'
# with open (text, 'r') as infile:
#     text = infile.readlines()
#     #print(text)
#     for sentences in text:
#         #print(type(sentences))
#         #word_tokenize(sentences)
              
#         tokenized_text = [list(map(str.lower, word_tokenize(sent)))
#                           for sent in sent_tokenize(sentences)]

# print(type(tokenized_text))

# d = []
# for sentence in tokenized_text:
#     print(sentence)
#     doc=nlp(sentence)
#     for word in sentence:
#         print(word)
#         doc = nlp(word)
#         dependency_list=[]
#         for item in doc:
#             print(item)
#             dependency = item.dep_
#             print(dependency)
#             dependency_list.append(dependency)
#             print(dependency_list)
#     d.append(dependency_list)
# print(d)


# list_dep = []
# for item in d:
#     print(item)
#     for dep in item:
#         list_dep.append(dep)

# # Let us check what is happening on a subset: 
# ngram_data, padded = padded_everygram_pipeline(3, list_dep)

# # What is happening during padding? 
# # print("PADDING:")
# # print(list(padded))

# # What kind of ngrams do we get? 
# trigram_complete = []
# print("\n\nNGRAMS:")
# for ngrams in ngram_data: 
#     ngram_list = list(ngrams)
#     ngram_list = [x for x in ngram_list if len(x)==3]
#     trigram_complete.append(ngram_list)
    
# trigram_complete_l = [x[:-2] for x in trigram_complete]   
# trigram_complete_r = [x[2:] for x in trigram_complete]

# #print(bigram_complete)
# #     print(ngram_list)
# #     print()
    
# trigram_token_previous = []
# for item in trigram_complete_l:
#     #print(item) 
#     for ele in item:
#         #print(ele)
#         trigram_token_previous.append(ele)
        
# trigram_token_next = []
# for item in trigram_complete_r:
#     #print(item) 
#     for ele in item:
#         #print(ele)
#         trigram_token_next.append(ele)

# new_df['trigram token previous'] = trigram_token_previous
# new_df['trigram token next'] = trigram_token_next

# display(new_df)


<class 'list'>
['a', 'language', 'is', 'a', 'structured', 'system', 'of', 'communication', 'used', 'by', 'humans', '.']


TypeError: Argument 'string' has incorrect type (expected str, got list)