# HW08: Parsing

Remember that these homework work as a completion grade. **You can skip one section of this homework.**

In [1]:
import pandas as pd
import nltk
df = pd.read_csv('train.csv')

df.columns = ["label", "title", "lead"]
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
	return label_map[x]
df["label"] = df["label"].apply(replace_label) 
df["text"] = df["title"] + " " + df["lead"]
df = df.sample(n=10000) # # only use 10K datapoints
df.head()

Unnamed: 0,label,title,lead,text
32500,world,Bush Surveys Hurricane Damage in Florida,"PENSACOLA, Fla. - President Bush on Sunday got...",Bush Surveys Hurricane Damage in Florida PENSA...
78570,world,Chavez allies and opposition vie for key posts...,Allies of President Hugo Chavez and opposition...,Chavez allies and opposition vie for key posts...
46351,business,Court ruling due in F1 dispute,A judge is set to deliver a ruling in a boardr...,Court ruling due in F1 dispute A judge is set ...
36741,sport,Dalmiya blames Zee and ESS,"Jagmohan Dalmiya, the president of the Indian ...","Dalmiya blames Zee and ESS Jagmohan Dalmiya, t..."
3486,business,SEC gives Google approval for public offering,The initial public offering once touted as the...,SEC gives Google approval for public offering ...


In [2]:
#TODO preprocess the corpus using spacy or load the pre-processed corpus

import spacy
nlp = spacy.load('en_core_web_sm')

df["docs"] = df["text"].apply(nlp)
print(df["docs"].head())



  from .autonotebook import tqdm as notebook_tqdm


32500    (Bush, Surveys, Hurricane, Damage, in, Florida...
78570    (Chavez, allies, and, opposition, vie, for, ke...
46351    (Court, ruling, due, in, F1, dispute, A, judge...
36741    (Dalmiya, blames, Zee, and, ESS, Jagmohan, Dal...
3486     (SEC, gives, Google, approval, for, public, of...
Name: docs, dtype: object


2022-04-29 17:27:31.439232: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-29 17:27:31.439356: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


### Information Extraction

In [3]:
doc = df["docs"].iloc[0]
print(doc)
spacy.displacy.render(doc, style="dep")


Bush Surveys Hurricane Damage in Florida PENSACOLA, Fla. - President Bush on Sunday got a firsthand look at Hurricane Ivan's devastation during his third trip to Florida to assess the damage from this summer's fierce storms in the South...


In [4]:
from collections import Counter
def flatten(t):
    return [item for sublist in t for item in sublist]

In [5]:
##TODO extract the subject-verbs pairs and print the result for the first document
def extract_subject_verb_pairs(doc):
    pairs = []
    for sent in doc.sents:
        subjs = [w for w in sent if w.dep_ == "nsubj"]
        pairs += [(w.lemma_.lower(), w.head.lemma_.lower()) for w in subjs]
    return pairs

df["svpairs"] = df["docs"].apply(extract_subject_verb_pairs)
print('subject-verb pairs of first doc:')
print(df["svpairs"].iloc[0])

##TODO create a list ranking the most common pairs and print the first 10 items
counter = Counter(flatten(df["svpairs"]))

print('\nmost common subject-verb pairs and frequency:')
for idx, (pair, count) in enumerate(counter.most_common(10), 1):
    print(f'{idx}. n={count}: {pair}')


subject-verb pairs of first doc:
[('bush', 'get')]

most common subject-verb pairs and frequency:
1. n=264: ('it', 'be')
2. n=180: ('official', 'say')
3. n=90: ('he', 'be')
4. n=76: ('that', 'be')
5. n=61: ('company', 'say')
6. n=48: ('they', 'be')
7. n=43: ('what', 'be')
8. n=39: ('who', 'be')
9. n=35: ('they', 'have')
10. n=34: ('profit', 'rise')


In [6]:
##TODO do the same for object-verb pairs ('dobj')
def extract_object_verb_pairs(doc):
    pairs = []
    for sent in doc.sents:
        objs = [w for w in sent if w.dep_ == "dobj"]
        pairs += [(w.lemma_.lower(), w.head.lemma_.lower()) for w in objs]
    return pairs

df["ovpairs"] = df["docs"].apply(extract_object_verb_pairs)
print('object-verb pairs of first doc:')
print(df["ovpairs"].iloc[0])

##TODO create a list ranking the most common pairs and print the first 10 items
counter = Counter(flatten(df["ovpairs"]))

print('\nmost common object-verb pairs and frequency:')
for idx, (pair, count) in enumerate(counter.most_common(10), 1):
    print(f'{idx}. n={count}: {pair}')

object-verb pairs of first doc:
[('look', 'get'), ('damage', 'assess')]

most common object-verb pairs and frequency:
1. n=94: ('people', 'kill')
2. n=73: ('=', 'fullquote.aspx?ticker')
3. n=42: ('point', 'score')
4. n=36: ('job', 'cut')
5. n=35: ('million', 'pay')
6. n=25: ('deal', 'sign')
7. n=21: ('agreement', 'reach')
8. n=20: ('profit', 'post')
9. n=19: ('contract', 'win')
10. n=19: ('record', 'set')


In [7]:
##TODO do the same for adjective-noun pairs ('amod')
def extract_adjective_noun_pairs(doc):
    pairs = []
    for sent in doc.sents:
        adjs = [w for w in sent if w.dep_ == "amod"]
        pairs += [(w.lemma_.lower(), w.head.lemma_.lower()) for w in adjs]
    return pairs

df["anpairs"] = df["docs"].apply(extract_adjective_noun_pairs)
print('adjective-noun pairs of first doc:')
print(df["anpairs"].iloc[0])

##TODO create a list ranking the most common pairs and print the first 10 items
counter = Counter(flatten(df["anpairs"]))

print('\nmost common adjective-noun pairs and frequency:')
for idx, (pair, count) in enumerate(counter.most_common(10), 1):
    print(f'{idx}. n={count}: {pair}')

adjective-noun pairs of first doc:
[('firsthand', 'look'), ('third', 'trip'), ('fierce', 'storm')]

most common adjective-noun pairs and frequency:
1. n=132: ('third', 'quarter')
2. n=112: ('next', 'year')
3. n=97: ('last', 'week')
4. n=95: ('first', 'time')
5. n=88: ('last', 'night')
6. n=72: ('open', 'source')
7. n=68: ('last', 'year')
8. n=64: ('chief', 'executive')
9. n=62: ('such', 'as')
10. n=59: ('presidential', 'election')


### Exploring cross label dependencies

In [8]:
##TODO extract all the subject-verbs and verbs-object pairs for the verb "win"
reference_verb = 'win'

filtered_svpairs = []
for subject, verb in flatten(df['svpairs']):
    if verb == reference_verb:
        filtered_svpairs.append((subject, verb))

filtered_ovpairs =[]
for object, verb in flatten(df['ovpairs']):
    if verb == reference_verb:
        filtered_ovpairs.append((subject, verb))


print('\nmost common subject-win pairs and frequency:')
counter = Counter(filtered_svpairs)
for idx, (pair, count) in enumerate(counter.most_common(10), 1):
    print(f'{idx}. n={count}: {pair}')

print('\nmost common object-win pairs and frequency:')
counter = Counter(filtered_ovpairs)
for idx, (pair, count) in enumerate(counter.most_common(10), 1):
    print(f'{idx}. n={count}: {pair}')



most common subject-win pairs and frequency:
1. n=11: ('he', 'win')
2. n=10: ('it', 'win')
3. n=7: ('who', 'win')
4. n=6: ('phelps', 'win')
5. n=5: ('bush', 'win')
6. n=4: ('wins', 'win')
7. n=4: ('managers', 'win')
8. n=4: ('sharapova', 'win')
9. n=4: ('they', 'win')
10. n=4: ('corp.', 'win')

most common object-win pairs and frequency:
1. n=419: ('dell', 'win')


In [9]:
##TODO for each label create a list ranking the most common subject-verbs pairs and one for the most common verbs-object pairs
labelwise_pairs = df[['label', 'svpairs', 'ovpairs']].groupby('label').sum()
labelwise_pairs = labelwise_pairs.applymap(Counter)
labelwise_pairs.head(10)

Unnamed: 0_level_0,svpairs,ovpairs
label,Unnamed: 1_level_1,Unnamed: 2_level_1
business,"{('which', 'lead'): 2, ('sec', 'give'): 1, ('i...","{('ruling', 'deliver'): 1, ('control', 'lose')..."
sci/tech,"{('browser', 'be'): 1, ('move', 'be'): 1, ('re...","{('offering', 'add'): 1, ('employee', 'add'): ..."
sport,"{('dalmiya', 'blame'): 1, ('life', 'live'): 1,...","{('zee', 'blame'): 1, ('dalmiya', 'blame'): 1,..."
world,"{('bush', 'get'): 1, ('ally', 'compete'): 1, (...","{('look', 'get'): 4, ('damage', 'assess'): 1, ..."


In [10]:
##TODO print the 10 most common pairs for each of the two lists for the labels "sport" and "business"
for label in ['business', 'sport']:
    for pairtype in ['subject', 'object']:
        print(f'\nMost common {pairtype}-verb pairs of label "{label}" with frequency:')
        
        counter = labelwise_pairs.loc[label, f'{pairtype[0]}vpairs']
        for idx, (pair, count) in enumerate(counter.most_common(10), 1):
            print(f'{idx}. n={count}: {pair}')



Most common subject-verb pairs of label "business" with frequency:
1. n=36: ('it', 'be')
2. n=33: ('profit', 'rise')
3. n=30: ('company', 'say')
4. n=25: ('official', 'say')
5. n=24: ('price', 'fall')
6. n=23: ('inc.', 'say')
7. n=19: ('corp.', 'say')
8. n=18: ('inc.', 'fullquote.aspx?ticker')
9. n=17: ('it', 'agree')
10. n=17: ('that', 'be')

Most common object-verb pairs of label "business" with frequency:
1. n=73: ('=', 'fullquote.aspx?ticker')
2. n=31: ('million', 'pay')
3. n=29: ('job', 'cut')
4. n=18: ('rate', 'raise')
5. n=17: ('profit', 'post')
6. n=16: ('profit', 'report')
7. n=15: ('agreement', 'reach')
8. n=13: ('loss', 'post')
9. n=13: ('stake', 'buy')
10. n=12: ('share', 'send')

Most common subject-verb pairs of label "sport" with frequency:
1. n=104: ('it', 'be')
2. n=53: ('he', 'be')
3. n=24: ('they', 'be')
4. n=16: ('who', 'be')
5. n=16: ('that', 'be')
6. n=15: ('they', 'have')
7. n=14: ('this', 'be')
8. n=12: ('he', 'have')
9. n=9: ('it', 'take')
10. n=9: ('official'