In [2]:
import spacy
from collections import Counter

#!python -m spacy download en_core_web_sm -q
nlp = spacy.load("en_core_web_sm")
text = "Narendra Modi was born in Vadnagar, Gujarat. He served as the 14th Prime Minister of India."

In [5]:
def extract_features(sentence):
    """
    Extracts various NLP features from a sentence.

    Args:
    sentence (str): The input sentence.

    Returns:
    dict: A dictionary containing POS tags, named entity tags, and dependency features and additional feattures for each word.
    The dictionary has the following structure:
    {   - pos_tags: POS tag of the word.
        - named_entity_tags: Named entity tag of the word.
        - headword: The headword of the token.
        - dependency_relation: The dependency relation of the token.
        - path_to_root: The path to the root of the token.
        - leftmost_child_pos: The POS of the leftmost child of the token.
        - rightmost_child_pos: The POS of the rightmost child of the token.
        - num_ancestors: The number of ancestors of the token.
        - num_children: The number of children of the token.
        - polarity: The polarity of the token.
        - word_frequency: The frequency of a word in text.
        - shape: A transformation of the token text into a form that generalizes the case of each character.
        - current_word: current word.
        - next_word: next word (bigram).
    }
    """
    doc = nlp(sentence)
    features = {}

    # Count word frequencies in the sentence
    word_freq = Counter([token.text for token in doc])
    
    for i,token in enumerate(doc):

        next_word = doc[i + 1].text if i + 1 < len(doc) else None

        features[token.text] = {
            'pos_tag': token.pos_,
            'named_entity_tag': token.ent_type_ if token.ent_type_ else 'O', # 'O' means 'Other' or no named entity tag
            'headword': token.head.text,
            'dependency_relation': token.dep_,
            'path_to_root': [t.text for t in token.ancestors],
            'leftmost_child_pos': token.left_edge.pos_ if token.left_edge else None,
            'rightmost_child_pos': token.right_edge.pos_ if token.right_edge else None,
            'num_ancestors': len(list(token.ancestors)),
            'num_children': len(list(token.children)),
            'polarity': token.sentiment,
            'full_constituent': [child.text for child in token.children],
            'lemma': token.lemma_,
            'is_stop': token.is_stop,
            'is_punct': token.is_punct,
            'word_frequency': word_freq[token.text], 
            'shape': token.shape_,
            'current_word': token.text,
            'next_word': next_word   
        }
    return features

res = extract_features(text)
print(res)

{'Narendra': {'pos_tag': 'PROPN', 'named_entity_tag': 'PERSON', 'headword': 'Modi', 'dependency_relation': 'compound', 'path_to_root': ['Modi', 'born'], 'leftmost_child_pos': 'PROPN', 'rightmost_child_pos': 'PROPN', 'num_ancestors': 2, 'num_children': 0, 'polarity': 0.0, 'full_constituent': [], 'lemma': 'Narendra', 'is_stop': False, 'is_punct': False, 'word_frequency': 1, 'shape': 'Xxxxx', 'current_word': 'Narendra', 'next_word': 'Modi'}, 'Modi': {'pos_tag': 'PROPN', 'named_entity_tag': 'PERSON', 'headword': 'born', 'dependency_relation': 'nsubjpass', 'path_to_root': ['born'], 'leftmost_child_pos': 'PROPN', 'rightmost_child_pos': 'PROPN', 'num_ancestors': 1, 'num_children': 1, 'polarity': 0.0, 'full_constituent': ['Narendra'], 'lemma': 'Modi', 'is_stop': False, 'is_punct': False, 'word_frequency': 1, 'shape': 'Xxxx', 'current_word': 'Modi', 'next_word': 'was'}, 'was': {'pos_tag': 'AUX', 'named_entity_tag': 'O', 'headword': 'born', 'dependency_relation': 'auxpass', 'path_to_root': ['bor