In [1]:
import time, copy, os, pickle, glob, csv, ast
from config import parameters
from PatternHandler import PatternHandler
from DependencyGraphHandler import DependencyGraphHandler
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=parameters.num_cpus, progress_bar=True)

data_filepath = parameters.data_filepath
lexicon_filepath = parameters.lexicon_filepath
output_time_txt_filepath = parameters.output_time_txt_filepath
output_pattern_csv_filepath = parameters.output_pattern_csv_filepath
output_error_csv_filepath = parameters.output_error_csv_filepath
output_target_log_csv_filepath = parameters.output_target_log_csv_filepath
output_raw_df_pkl_filepath = parameters.output_raw_df_pkl_filepath
output_pattern_counter_pkl_filepath = parameters.output_pattern_counter_pkl_filepath
output_targets_dir = parameters.output_targets_dir
output_targets_concat_csv_filepath = parameters.output_targets_concat_csv_filepath
output_pattern_evaluation_csv_filepath = parameters.output_pattern_evaluation_csv_filepath

def match_opinion_words(content, opinion_word_lexicon):
    opinion_words = []
    for opinion in opinion_word_lexicon:
        for token in content.split():
            if token == opinion: opinion_words.append(token)
    return list(set(opinion_words))

def save_extracted_pattern_results(domain, pattern_counter, err_list):
    pattern_list = [tup for tup in pattern_counter.items()]
    pattern_df = pd.DataFrame(pattern_list, columns =['pattern', 'count'])  
    filepath = output_pattern_csv_filepath % (domain, len(pattern_df))
    pattern_df.to_csv(filepath, index = False, encoding='utf-8-sig')
    print('Created %s' % filepath)
    
    err_df = pd.DataFrame(err_list, columns =['content', 'current_opinion_word', 'current_target_word', 'parse_error', 'opinion_words', 'targets', 'raw_targets'])  
    filepath = output_error_csv_filepath % (domain, len(err_df[err_df['parse_error']==True]), len(err_df))
    err_df.to_csv(filepath, index = False, encoding='utf-8-sig')
    print('Created %s' % filepath)

def pattern_extraction(domain, df, pattern_handler, dependency_handler):
    pattern_counter, err_list = defaultdict(int), list()
    pattern_handler.extract_patterns(df, pattern_counter, err_list, dependency_handler)
    
    save_extracted_pattern_results(domain, pattern_counter, err_list)
    return pattern_counter

def merge_dfs(data_filepaths):
    dfs = []
    for data_filepath in data_filepaths:
        df = pd.read_csv(data_filepath)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

def calculate_true_positive(predicted_list, correct_list):
    tp = 0
    for predicted_compound_target in predicted_list:
        if predicted_compound_target in correct_list:   # 'screen' <- predicted 'screen'
            correct_list.remove(predicted_compound_target)
            tp += 1
            continue
        for correct_target in correct_list:
            if predicted_compound_target.find(correct_target) > -1:   # 'audio' <- predicted 'audio aspects'
                correct_list.remove(correct_target)
                tp += 1
                break
    return tp

def calculate_precision_recall(df):
    correct_targets_mul = list([item for sublist in df['targets'].values for item in sublist if item != ''])
    predicted_targets_mul = list([item for sublist in df['predicted_targets'].values for item in sublist if item != ''])
    tp_mul = calculate_true_positive(predicted_targets_mul, correct_targets_mul)
    
    if len(predicted_targets_mul) != 0: pre_mul = tp_mul / len(predicted_targets_mul)
    else: pre_mul = 0
    
    if len(correct_targets_mul) != 0: rec_mul = tp_mul / len(correct_targets_mul)
    else: rec_mul = 0
    
    correct_targets_dis = set([item for sublist in df['targets'].values for item in sublist if item != ''])
    predicted_targets_dis = set([item for sublist in df['predicted_targets'].values for item in sublist if item != ''])
    tp_dis = calculate_true_positive(predicted_targets_dis, correct_targets_dis)
    
    if len(predicted_targets_dis) != 0: pre_dis = tp_dis / len(predicted_targets_dis)
    else: pre_dis = 0
    
    if len(correct_targets_dis) != 0: rec_dis = tp_dis / len(correct_targets_dis)
    else: rec_dis = 0
    
    return pre_mul, rec_mul, pre_dis, rec_dis

def calculate_f1(precision, recall):
    denominator = precision + recall
    if denominator == 0: return 0
    return (2*precision*recall)/denominator

def save_pkl(item_to_save, filepath):
    with open(filepath, 'wb') as f:
        pickle.dump(item_to_save, f)
    print('Created %s' % filepath)

def load_pkl(filepath):
    with open(filepath, 'rb') as f:
        loaded_item = pickle.load(f)
    print('Loaded %s' % filepath)
    return loaded_item
    
def elapsed_time(start):
    end = time.time()
    elapsed_time = end - start
    elapsed_time_txt = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
    text_file = open(output_time_txt_filepath, "w", encoding='utf-8')
    content = 'Start: %s, End: %s => Elapsed time: %s\nCreated %s' % (time.strftime("%H:%M:%S", time.gmtime(start)), time.strftime("%H:%M:%S", time.gmtime(end)), elapsed_time_txt, output_time_txt_filepath)
    text_file.write(content)
    text_file.close()
    print('Created %s' % output_time_txt_filepath)

Created file: /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/output/preliminary/parameters.json
Parameters(
  base_dir = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon
  data_filepath = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/data/parsed/five-three_5995.json
  lexicon_filepath = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/data/parsed/lexicon_6788.json
  output_dir = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/output/preliminary
  output_targets_dir = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/output/preliminary/targets
  parameters_json_filepath = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/output/preliminary/parameters.json
  output_time_txt_filepath = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/output/preliminary/elapsed_time.txt
  output_pattern_evaluation_csv_filepath = /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/output/preliminary/sub/[%s]pattern_evaluation.csv
  errlog_filep

In [2]:
pattern_handler = PatternHandler()
dependency_handler = DependencyGraphHandler()

if os.path.exists(output_raw_df_pkl_filepath): raw_df = load_pkl(output_raw_df_pkl_filepath)
else:
    raw_df = pd.read_json(data_filepath)
    print('Matching opinion words..')
    opinion_word_lexicon = [item for sublist in pd.read_json(lexicon_filepath).values for item in sublist]
    raw_df['opinion_words'] = raw_df.parallel_apply(lambda x: match_opinion_words(x['content'], opinion_word_lexicon), axis=1)
    print('Converting document into nlp(doc)..')
    raw_df['doc'] = raw_df.progress_apply(lambda x: pattern_handler.nlp(x['content']), axis=1)

    print('Filtering targets using nlp(doc)..')
    raw_df['targets'] = raw_df.progress_apply(lambda x: pattern_handler.process_targets(x['content'], x['raw_targets']), axis=1) 
    save_pkl(raw_df, output_raw_df_pkl_filepath)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/dmlab/stanfordnlp_resources/en_ewt_models/en_ewt.pr

In [3]:
raw_df['domain'].unique()

array(['MP3 player', 'DVD player', 'Digital camera2', 'Cell phone',
       'Digital camera1', 'Wireless router', 'Speaker', 'Computer'],
      dtype=object)

# Wireless router

In [4]:
domain = 'Wireless router'
print('Processing [%s]..' % domain)
df = raw_df[raw_df['domain']==domain]

filepath = output_pattern_counter_pkl_filepath % domain
if os.path.exists(filepath): pattern_counter = load_pkl(filepath)
else: 
    pattern_counter = pattern_extraction(domain, df, pattern_handler, dependency_handler)
    save_pkl(pattern_counter, filepath)

Processing [Wireless router]..
Loaded /home/dmlab/jihye/GIT/domain-specific-sentiment-lexicon/output/preliminary/save/[Wireless router]pattern_counter.pkl


In [11]:
import re, stanfordnlp
from nltk import pos_tag
from DependencyGraph import DependencyGraph

special_char_pattern = re.compile('([,.+]+.?\d*)')
nlp = pattern_handler.nlp
noun = ['NN', 'NNS', 'NNP']
doublespace_pattern = re.compile('\s+')

def leave_noun_only(term_list):
    term_list = [doublespace_pattern.sub(' ', special_char_pattern.sub(' ', item)) for item in term_list if item != '']   # 'sound + quality'
    term_list = [term for term, pos in pos_tag(term_list) if pos in noun and len(term) > 1]
    return term_list
    
def extract_targets(predicted_targets, doc, opinion_words, dep_rels, dependency_handler):
    if len(predicted_targets) > 0: 
        targets = predicted_targets
    else: 
        targets = set()
        for sentence_from_doc in doc.sentences:
            sentence_graph = DependencyGraph(sentence_from_doc)
            targets.update(dependency_handler.extract_targets_using_pattern(sentence_graph.token2idx, sentence_graph.nodes, opinion_words, dep_rels))

        targets = list(targets)
        targets = leave_noun_only(targets)
        targets = list(set(targets))
    #print(opinion_words, predicted_targets, '->', dep_rels, '=>', targets)
    return targets

def evaluate_rule_set(rule_set):
    df['predicted_targets'] = df.apply(lambda x: list(), axis=1)
    for one_flattened_dep_rels in rule_set:
        dep_rels = one_flattened_dep_rels.split('-')
        df['predicted_targets'] = df.apply(lambda x: extract_targets(x['predicted_targets'], x['doc'], x['opinion_words'], dep_rels, dependency_handler), axis=1)

    pre_mul, rec_mul, pre_dis, rec_dis = calculate_precision_recall(df)
    f1_mul = calculate_f1(pre_mul,rec_mul)
    f1_dis = calculate_f1(pre_dis,rec_dis)
    #print('\nrule_set=%s'%str(rule_set) , 'f1_mul=%.2f'%calculate_f1(pre_mul,rec_mul), 'f1_dis=%.2f'%calculate_f1(pre_dis,rec_dis))
    #print('pre_mul=%.2f'%pre_mul, 'rec_mul=%.2f'%rec_mul, 'pre_dis=%.2f'%pre_dis, 'rec_dis=%.2f'%rec_dis)
    return f1_mul, f1_dis

DP를 적용하되 순서를 알맞게 고려하면 성능을 더 높일 수 있음   
=> `amod nmod advmod rcmod nsubj obl => 0.63 0.43`

In [20]:
rules_ = ['amod', 'nmod', 'advmod', 'rcmod', 'nsubj', 'obl']
print('%s => %.2f %.2f' % (' '.join(rules_), *evaluate_rule_set(rules_)))

rules_ = ['nsubj', 'amod', 'obl', 'nmod', 'advmod', 'rcmod']
print('%s => %.2f %.2f' % (' '.join(rules_), *evaluate_rule_set(rules_)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


amod nmod advmod rcmod nsubj obl => 0.63 0.43
nsubj amod obl nmod advmod rcmod => 0.61 0.43


f1_mul 기준으로 랭킹 매긴 결과 참고 -> DP보다 높은 성능  
`rule_set=['nsubj', 'amod', 'nsubj-compound-compound', 'obl'] f1_mul=0.65 f1_dis=0.47`   
높은 성능을 내는 적절한 개수가 있음.

In [21]:
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'nsubj-compound-compound']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'nsubj-compound-compound', 'obl']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'nsubj-compound-compound', 'obl', 'conj-conj-nsubj']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'nsubj-compound-compound', 'obl', 'conj-conj-nsubj', 'advcl-advcl-nsubj']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.61 0.47
0.62 0.48
0.65 0.47
0.65 0.47
0.65 0.47


# Categorical data ~ mutual information

단순 F1 내림차순

In [31]:
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'obl']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'obl', 'conj-nsubj']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'obl', 'conj-nsubj', 'advmod-nsubj']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.61 0.47
0.64 0.46
0.64 0.46
0.64 0.47


redundant하지 않은 패턴을 하나씩 선택하여 추가
* nsubj amod-obj amod obl

In [36]:
print('%.2f %.2f' % evaluate_rule_set(['nsubj']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod-obj']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod-obj', 'amod']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod-obj', 'amod', 'obl']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod-obj', 'amod', 'obl', 'conj-nsubj']))   # 끝

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.39 0.35
0.40 0.36
0.62 0.49
0.64 0.46
0.63 0.45


# Explore
## Possible combanitaions

In [14]:
from itertools import compress, product

def get_combinations(items):
    return ( set(compress(items,mask)) for mask in product(*[[0,1]]*len(items)) )

In [33]:
patterns = [pattern for pattern, count in pattern_counter.items() if count > 3]
combinations = list(get_combinations(patterns))
combinations.remove(set())
len(combinations)

255

`best_rule_set={'amod', 'conj-nsubj', 'amod-obj', 'nsubj', 'advmod-nsubj'}`   
f1_mul 내림차순으로 하면 nsubj, amod, obl인데 obl은 best rule set에 뽑히지도 않음

In [16]:
max_f1_mul = 0
best_rule_set = []
rule_set_evaluation_dict = dict()
for idx, comb in enumerate(combinations):
    if idx % 50 == 0 : print('Processing %dth..' % idx)
    f1_mul, f1_dis = evaluate_rule_set(comb)
    rule_set_evaluation_dict[' '.join(comb)] = (f1_mul, f1_dis)
    if f1_mul > max_f1_mul:
        max_f1_mul = f1_mul
        best_rule_set = comb
print('max_f1_mul=%.2f \t best_rule_set=%s' % (max_f1_mul, best_rule_set))

Processing 0th..


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Processing 50th..
Processing 100th..
Processing 150th..
Processing 200th..
Processing 250th..
max_f1_mul=0.65 	 best_rule_set={'amod', 'conj-nsubj', 'amod-obj', 'nsubj', 'advmod-nsubj'}


Best rule set 안에서는 순서 상관 없음. redundant하지 않은 rule들이 뽑힌 것 같음

In [22]:
print('%.2f %.2f' % evaluate_rule_set(['amod-obj', 'nsubj', 'conj-nsubj', 'amod', 'advmod-nsubj']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'conj-nsubj', 'advmod-nsubj', 'amod-obj']))
print('%.2f %.2f' % evaluate_rule_set(['nsubj', 'amod', 'amod-obj', 'conj-nsubj', 'advmod-nsubj']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.63 0.50
0.63 0.49
0.63 0.49
