In [27]:
import spacy
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
# Load the spaCy pipeline named "en_core_web_lg"; the resulting object is passed to get_pos below.
nlp = spacy.load("en_core_web_lg")

In [28]:
def get_pos(str_to_tag, nlp):
    """Tag a string with a spaCy model and return one row per token.

    Inputs:
        str_to_tag: string to tag
        nlp: spaCy NLP model (called as nlp(str_to_tag); the result is
            iterated and each item must expose text, lemma_, pos_ and tag_)

    Returns:
        A pandas DataFrame where each row is a token found by the spaCy
        tokenizer. The columns are always present, in this order, even when
        no tokens are found:
            text:  the text associated with the token
            lemma: the base form of the token
            pos:   broad part of speech
            tag:   the narrow part of speech tag (Penn Treebank)
    """
    columns = ['text', 'lemma', 'pos', 'tag']
    doc = nlp(str_to_tag)
    rows = [
        {'text': token.text, 'lemma': token.lemma_, 'pos': token.pos_, 'tag': token.tag_}
        for token in doc
    ]
    # Passing the column list explicitly keeps the schema stable: an empty
    # token list would otherwise yield a frame with no columns (so a caller's
    # `df.tag` would fail), and the column order would depend on the pandas
    # version.
    return pd.DataFrame(rows, columns=columns)

In [29]:
# Quick check: tag one sample sentence and display the resulting per-token table.
get_pos('what did the angry bus driver say?', nlp)

Unnamed: 0,lemma,pos,tag,text
0,what,PRON,WP,what
1,do,AUX,VBD,did
2,the,DET,DT,the
3,angry,ADJ,JJ,angry
4,bus,NOUN,NN,bus
5,driver,NOUN,NN,driver
6,say,VERB,VB,say
7,?,PUNCT,.,?


In [32]:
# finding adjectives in a CSV of utterances
utterances = pd.read_csv('providence_long_utts.csv')

adjectives = []
# Slice the frame first so only the sampled rows are converted to dicts.
# NOTE(review): the range starts at position 1, so the first record is
# skipped and 9 rows are processed -- confirm this is intended (use [:10]
# for the first ten).
for utterance in utterances.iloc[1:10].to_dict('records'):
    text = utterance['text']
    if not isinstance(text, str):
        # Blank CSV cells are read as NaN (a float), not a string; there is
        # nothing to tag, so skip the row instead of crashing.
        continue

    tagged_utterance = get_pos(text, nlp)
    if tagged_utterance.empty:
        # No tokens at all (e.g. an empty string): nothing to filter, and the
        # frame may not even have a 'tag' column.
        continue

    # Could be more than one adjective in each utterance.
    # NOTE(review): 'JJ' matches plain adjectives only; comparative (JJR) and
    # superlative (JJS) forms are excluded -- confirm that is intended.
    # .assign returns a new frame, so the metadata is "propagated" without
    # writing into a filtered slice of tagged_utterance.
    adjs_in_utterance = tagged_utterance[tagged_utterance.tag == 'JJ'].assign(
        utterance=text,
        utterance_id=utterance['id'],
    )
    adjectives.append(adjs_in_utterance)

if adjectives:
    adjectives_df = pd.concat(adjectives)
else:
    # pd.concat raises on an empty list; still write a header-only CSV.
    adjectives_df = pd.DataFrame(
        columns=['text', 'lemma', 'pos', 'tag', 'utterance', 'utterance_id']
    )
adjectives_df.to_csv('found_adjectives.csv', index=False)