In [4]:
from config import parameters
import pandas as pd

def match_opinion_words(content, opinion_word_lexicon):
    """Return the distinct tokens of ``content`` that appear in the lexicon.

    Args:
        content: Text to scan; it is tokenized with ``str.split()``, i.e. on
            runs of whitespace.
        opinion_word_lexicon: Iterable of opinion words. Matching is exact
            (case-sensitive, whole-token equality).

    Returns:
        A list of the matched tokens without duplicates, ordered by their
        first occurrence in ``content``.
    """
    # Hash-based membership avoids rescanning the whole text once per lexicon
    # entry; an already-built set is reused as-is.
    if isinstance(opinion_word_lexicon, (set, frozenset)):
        lexicon = opinion_word_lexicon
    else:
        lexicon = set(opinion_word_lexicon)

    # dict keys keep insertion order, so duplicates collapse while the result
    # stays reproducible across runs (unlike list(set(...))).
    matched = dict.fromkeys(
        token for token in content.split() if token in lexicon
    )
    return list(matched)

# Load the review sentences, flatten the lexicon table into a single word
# list, and annotate every row with the lexicon words found in its text.
df = pd.read_json(parameters.data_filepath)
opinion_word_lexicon = [
    word
    for lexicon_row in pd.read_json(parameters.lexicon_filepath).values
    for word in lexicon_row
]
df['opinion_words'] = df.apply(
    lambda row: match_opinion_words(row['content'], opinion_word_lexicon),
    axis=1,
)
df.head()

Unnamed: 0,content,raw_targets,filename,domain,opinion_words
0,"this is an edited review , now that i have had...",[],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[]
1,"while , there are flaws with the machine , the...",[affordability],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[flaws]
10,"for me , i 'll save that cash for another purc...",[],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[]
100,"the face-plate , which pops out for the batter...",[face plate],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[]
1000,after i reviewed the manual ( a pdf file inclu...,[],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,"[warned, problems, better]"


In [43]:
import stanfordnlp
from DependencyGraph import DependencyGraph
def next_focused_tokens(current_tokens, token2idx, nodes, dep_rel):
    """Advance one dependency-relation step away from the current tokens.

    For every position of every current token this collects:
      * the token of that node's governor, when the node's own relation
        (``.dep``) equals ``dep_rel``;
      * the tokens of all nodes governed by that position whose relation
        equals ``dep_rel``.

    Args:
        current_tokens: Iterable of token strings to start from.
        token2idx: Mapping from a token to the indices at which it occurs
            in ``nodes``.
        nodes: Index-addressable collection of nodes exposing ``.token``,
            ``.governor`` (index of the head node) and ``.dep``.
        dep_rel: Dependency relation label to follow.

    Returns:
        A set with the tokens reached through ``dep_rel``.
    """
    focused_tokens = set()
    for current_token in current_tokens:
        try:
            indices = token2idx[current_token]
        except KeyError:
            # The token does not occur in this graph (e.g. an opinion word
            # that belongs to another sentence of the same review): nothing
            # to follow from it.
            continue

        for current_token_idx in indices:
            current_node = nodes[current_token_idx]

            # Upward step: the current node hangs off its governor via dep_rel.
            if current_node.dep == dep_rel:
                focused_tokens.add(nodes[current_node.governor].token)

            # Downward step: dependents attached to the current node via dep_rel.
            for i in range(len(nodes)):
                candidate = nodes[i]
                if candidate.governor == current_token_idx and candidate.dep == dep_rel:
                    focused_tokens.add(candidate.token)
    return focused_tokens

def extract_targets_using_pattern(token2idx, nodes, opinion_words, dep_rels):
    """Follow a sequence of dependency relations starting at the opinion words.

    Each relation in ``dep_rels`` is applied in order through
    ``next_focused_tokens``; the tokens reached by one step are the starting
    points of the next.

    Args:
        token2idx: Token-to-indices lookup passed through to
            ``next_focused_tokens``.
        nodes: Node collection passed through to ``next_focused_tokens``.
        opinion_words: Tokens the walk starts from.
        dep_rels: Ordered relation labels making up the pattern.

    Returns:
        A set of the tokens reached after the last relation (the opinion
        words themselves when ``dep_rels`` is empty).
    """
    frontier = opinion_words
    for dep_rel in dep_rels:
        frontier = next_focused_tokens(frontier, token2idx, nodes, dep_rel)
    return set(frontier)

# Build the stanfordnlp pipeline with its default arguments. The saved log
# below shows it loading the en_ewt tokenize, pos, lemma and depparse models
# on CPU. `nlp` is the module-level parser used by `extract_targets`.
nlp = stanfordnlp.Pipeline()

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt.pr

In [24]:
import os
# Read the mined dependency patterns for the MP3-player domain from the
# project's output directory (a sibling of the current working directory).
filepath = os.path.join(
    os.path.dirname(os.getcwd()),
    'output',
    'preliminary',
    'sub',
    '[MP3 player]patterns_496.csv',
)
pattern_df = pd.read_csv(filepath)

# Keep only patterns observed at least `min_pattern_count` times.
min_pattern_count = 2
domain_pattern_counter = {
    pattern: count
    for pattern, count in zip(pattern_df['pattern'], pattern_df['count'])
    if count >= min_pattern_count
}

In [47]:
# Work on the first 100 rows of the first domain found in `df`.
# `.copy()` detaches the slice from `df`, so adding columns to `domain_df`
# later neither raises SettingWithCopyWarning nor risks touching `df`.
domain_df = df[df['domain'] == df['domain'].unique()[0]].iloc[:100].copy()

In [48]:
# Pick the first retained pattern and split its '-'-joined form back into
# the ordered list of dependency relations.
one_flattened_dep_rels = list(domain_pattern_counter)[0]
dep_rels = one_flattened_dep_rels.split('-')

def extract_targets(content, opinion_words):
    """Extract candidate target tokens for one review text.

    The text is parsed with the module-level ``nlp`` pipeline; for each parsed
    sentence a ``DependencyGraph`` is built and the module-level ``dep_rels``
    pattern is followed from the opinion words via
    ``extract_targets_using_pattern``.

    Args:
        content: Review text to parse.
        opinion_words: Opinion words found in ``content``; the starting
            points of the pattern walk.

    Returns:
        A list of the distinct tokens reached in any sentence (order is not
        specified). Empty when there are no opinion words.
    """
    # Without starting words the pattern walk cannot reach anything, so skip
    # the (expensive) dependency parse altogether.
    if len(opinion_words) == 0:
        return []

    doc = nlp(content)
    targets = set()
    for sentence_from_doc in doc.sentences:
        sentence_graph = DependencyGraph(sentence_from_doc)
        targets.update(
            extract_targets_using_pattern(
                sentence_graph.token2idx,
                sentence_graph.nodes,
                opinion_words,
                dep_rels,
            )
        )
    return list(targets)


# Run the pattern-based extractor on every row, using its text and the
# opinion words already matched for it.
domain_df['predicted_targets'] = domain_df.apply(
    lambda row: extract_targets(row['content'], row['opinion_words']),
    axis=1,
)



In [49]:
# Preview the first rows, including the new `predicted_targets` column.
# NOTE(review): the saved output shows `{}` in `predicted_targets`, while
# `extract_targets` returns a list — the output looks stale (produced by an
# earlier version of the function); re-run the notebook top to bottom to confirm.
domain_df.head()

Unnamed: 0,content,raw_targets,filename,domain,opinion_words,predicted_targets
0,"this is an edited review , now that i have had...",[],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[],{}
1,"while , there are flaws with the machine , the...",[affordability],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[flaws],{}
10,"for me , i 'll save that cash for another purc...",[],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[],{}
100,"the face-plate , which pops out for the batter...",[face plate],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,[],{}
1000,after i reviewed the manual ( a pdf file inclu...,[],Creative Labs Nomad Jukebox Zen Xtra 40GB,MP3 player,"[warned, problems, better]",{}
