In [1]:
#FIRST, WE IMPORT EVERYTHING WE NEED

from nltk import tokenize, word_tokenize
import pandas as pd
import stanza
import spacy
from spacy.symbols import nsubj, VERB
from spacy import displacy

In [2]:
#WE LOAD THE LANGUAGE MODEL FOR SPACY
#The model is looked up by name ("en_core_web_sm"), so that package has to be
#available in the environment before this cell runs.
#The resulting `nlp` object is called on every sentence in the cells below to
#obtain the parsed document whose tokens are inspected.

nlp = spacy.load("en_core_web_sm")

In [3]:
#WE READ THROUGH OUR TEST FILE
#Each element of `sentences` is one line of the file, line ending included.

# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could decode differently (or
# fail to decode) on another machine.
# NOTE(review): assumes test.txt is saved as UTF-8 -- adjust if it is not.
with open('test.txt', encoding='utf-8') as test:
    sentences = test.readlines()

In [4]:
#WE GET THE SENTENCES AND THE TOKENS FROM THE TEST FILE
#`split_content` collects the sentences and `tokenized_content` the word tokens
#of EVERY line of the file, in file order.

# Both lists are created before the loop and filled inside it. Tokenising
# after the loop would only see the last line read (all earlier lines would be
# silently dropped) and would fail with a NameError on an empty file.
split_content = []
tokenized_content = []
for content in sentences:
    # Preprocessing: replace dashes with blank spaces. Dashes are difficult
    # for the parser to handle.
    content = content.replace('-', ' ')
    print(content)
    split_content.extend(tokenize.sent_tokenize(content))
    tokenized_content.extend(word_tokenize(content))

A language is a structured system of communication used by humans. Languages can be based on speech and gesture (spoken language), sign, or writing. The structure of language is its grammar and the free components are its vocabulary. Many languages, including the most widely spoken ones, have writing systems that enable sounds or signs to be recorded for later reactivation. Human language is unique among the known systems of animal communication in that it is not dependent on a single mode of transmission, is highly variable between cultures and across time, and affords a much wider range of expression than other systems.


In [5]:
#WE CREATE A DATAFRAME TO DISPLAY THE FEATURES
#It starts with a single 'Tokens' column holding one word token per row; the
#following cells attach one feature column each.

new_df = pd.DataFrame(
    tokenized_content,
    columns=['Tokens'],
)

In [6]:
#WE GET THE DEPENDENCY RELATION OF EACH TOKEN TO ITS HEAD AND WE ADD IT TO THE DATAFRAME

# One list of dependency labels per sentence, in sentence order.
d = [[token.dep_ for token in nlp(sentence)] for sentence in split_content]

# Flatten the per-sentence lists into one token-level list for the column.
list_dep = [label for sentence_labels in d for label in sentence_labels]
new_df['relation_to_head_spacy'] = list_dep

In [7]:
#WE GET THE DIRECT HEAD OF EACH TOKEN

d = []            # heads grouped per sentence
list_parent = []  # the same heads, flattened to one entry per token
for sentence in split_content:
    sentence_heads = [token.head for token in nlp(sentence)]
    d.append(sentence_heads)
    list_parent.extend(sentence_heads)

new_df['Parent'] = list_parent

In [8]:
#WE GET THE PoS TAG OF THE DIRECT HEAD OF EACH TOKEN
#This is meant to capture the phrasal structure better

# Per sentence: the part-of-speech tag of every token's head.
d = [[token.head.pos_ for token in nlp(sentence)] for sentence in split_content]

# Concatenate the per-sentence tag lists into one token-level list.
list_parent = []
for sentence_head_tags in d:
    list_parent.extend(sentence_head_tags)

new_df['POS Head'] = list_parent

In [9]:
#WE GET THE CHILD(REN) OF EACH TOKEN, IF EXISTING
#This is only the dependency, not the constituent

d = []
for sentence in split_content:
    parsed = nlp(sentence)
    # One (possibly empty) list of children per token of the sentence.
    d.append([list(token.children) for token in parsed])

# One entry per token across all sentences; each entry is that token's list
# of children.
list_children = [
    token_children
    for sentence_children in d
    for token_children in sentence_children
]
new_df['Children'] = list_children

In [10]:
#WE GET THE FULL CONSTITUENTS OF EACH HEAD

d = []                  # subtrees grouped per sentence
list_constituents = []  # the same subtrees, one entry per token
for sentence in split_content:
    # For every token, the list of tokens in its subtree.
    sentence_subtrees = [list(token.subtree) for token in nlp(sentence)]
    d.append(sentence_subtrees)
    list_constituents.extend(sentence_subtrees)

new_df['Constituents'] = list_constituents

In [11]:
#WE DISPLAY THE RESULTS IN A DATAFRAME
#Only the first 30 rows are requested, which keeps the rendered table short.

display(new_df.head(30))

Unnamed: 0,Tokens,relation_to_head_spacy,Parent,POS Head,Children,Constituents
0,A,det,language,NOUN,[],[A]
1,language,nsubj,is,AUX,[A],"[A, language]"
2,is,ROOT,is,AUX,"[language, system, .]","[A, language, is, a, structured, system, of, c..."
3,a,det,system,NOUN,[],[a]
4,structured,amod,system,NOUN,[],[structured]
5,system,attr,is,AUX,"[a, structured, of]","[a, structured, system, of, communication, use..."
6,of,prep,system,NOUN,[communication],"[of, communication, used, by, humans]"
7,communication,pobj,of,ADP,[used],"[communication, used, by, humans]"
8,used,acl,communication,NOUN,[by],"[used, by, humans]"
9,by,agent,used,VERB,[humans],"[by, humans]"


In [12]:
#WE EXPORT THE PANDAS DATAFRAME TO A TAB-SEPARATED FILE WITH A .conll EXTENSION (EXPORT CURRENTLY DISABLED)

#outputfile = "test_dependency.conll"
#new_df.to_csv(f'c:/Users/desir/Desktop/text_mining/applied TM/{outputfile}', sep='\t', header=True, quotechar='|', index=False)