In [1]:
import os
import sys
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
text_processing_scripts_path = os.path.abspath('TextProcessingScripts')
sys.path.append(text_processing_scripts_path)

scripts_path = os.path.abspath('Scripts')
sys.path.append(scripts_path)

In [3]:
from FeatureExtraction.VocabDict import create_vocab_dict
from FeatureExtraction.UnknownWordsProcessing import UnknownWordsProcessing
from FeatureExtraction.Word2VecUtilities import create_word2vector_model, create_embeddings_matrix, save_word2vector_model, load_word2vector_model
from TensorflowInputProcessing.SentenceProcessing import SentenceProcessing
from TensorflowInputProcessing.DocumentProcessing import DocumentProcessing
from TensorflowInputProcessing.MapWordToID  import MapWordToID 
from CommonUtilities.FileUtilities import return_file_content, save_pickle_file, load_pickle_file


In [4]:
from CustomNN import create_RNN, create_attention
from CustomRNN import CustomRNN
from LengthEstimation import estimate_sentences_and_document_lengths

In [5]:
data_path = os.path.abspath('data_copy')
train_data = pd.read_csv(os.path.join(data_path, 'train.csv'))
test_data = pd.read_csv(os.path.join(data_path, 'test.csv'))

In [6]:
# test_data_copy = pd.DataFrame([test_data['company1'],test_data['company2'], test_data['is_parent'],test_data['snippet']]).T
# test_data_copy.columns = ['company2','company1','is_parent','snippet']
# test_data = pd.concat([test_data,test_data_copy])

In [7]:
def preprocess_and_group_data(data):
    data = data.drop_duplicates()
    data.index = range(len(data))
    aliased_snippet = []
    for i in range(len(data)):
        aliased_snippet.append(data['snippet'][i].replace(data['company2'][i],' company2 ').replace(data['company1'][i],' company1 '))
    data['snippet'] = aliased_snippet

    data['snippet'] = data['snippet'].str.lower()

    grouped_data = data.groupby(['company1','company2'])['snippet'].apply(list)
    grouped_data = grouped_data.to_frame().reset_index()
    return data, grouped_data

def word_tokenizer(string):
    return string.split()

In [8]:
test_data, grouped_test_data = preprocess_and_group_data(test_data)

In [9]:
training_params = load_pickle_file(os.path.join(data_path, 'training_params.p'))

In [10]:
vocab_dict = training_params['vocab_dict'] 
rev_vocab_dict = training_params['rev_vocab_dict']
estimated_sent_len = training_params['estimated_sent_len']
estimated_doc_len = training_params['estimated_doc_len']
embedding_matrix = training_params['embedding_matrix']

In [11]:
sentence_processing = SentenceProcessing()
document_processing = DocumentProcessing()
map_word_to_id = MapWordToID(vocab_dict)
unknown_words_processing = UnknownWordsProcessing(vocab_list=vocab_dict.keys(), replace=False)


In [12]:
def return_X(grouped_snippets):
    tokenized_sentences_tokenized_words = [word_tokenizer(sent) for sent in grouped_snippets]
    tokenized_sentences_tokenized_words = unknown_words_processing.remove_or_replace_unkown_word_from_sentences(tokenized_sentences_tokenized_words)
    preprocessed_sentences_of_document = sentence_processing.pad_truncate_sent(tokenized_sentences_tokenized_words, estimated_sent_len,  dummy_token='my_dummy')
    preprocessed_document = document_processing.pad_truncate_document(preprocessed_sentences_of_document, estimated_doc_len, estimated_sent_len)
    id_array = np.asarray(map_word_to_id.word_lists_to_id_lists(preprocessed_document))
    return id_array

In [13]:
X_test = []
y_test = []
for index , row in grouped_test_data.iterrows():
    grouped_snippets = row['snippet']
    id_array = return_X(grouped_snippets)
    X_test.append(id_array)
    y_test.append(0)

In [14]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected

In [15]:
tf.reset_default_graph()

In [16]:
sess = tf.Session()
saver = tf.train.import_meta_graph(os.path.join(data_path,"consent.ckpt.meta"))
saver.restore(sess, os.path.join(data_path,"consent.ckpt"))
graph = tf.get_default_graph()

X = graph.get_operation_by_name('Inputs/X').outputs[0]
y = graph.get_operation_by_name('Inputs/y').outputs[0]
tf_keep_prob = graph.get_operation_by_name('Inputs/tf_keep_prob').outputs[0]
tf_sentences_length = graph.get_operation_by_name('Inputs/sentences_length').outputs[0]
tf_documents_length = graph.get_operation_by_name('Inputs/documents_length').outputs[0]
normalized_sentence_attentions = graph.get_operation_by_name('Attention-2/ExpandDims').outputs[0]
prob = graph.get_operation_by_name('Prediction/prob').outputs[0]


'model_variables' collection should be of type 'byte_list', but instead is of type 'node_list'.
INFO:tensorflow:Restoring parameters from /home/radoslav/git/team_a/vicky/data_copy/consent.ckpt


In [17]:
# validation_data = load_pickle_file(os.path.join(data_path, 'validation_data.p'))
# X_valid = validation_data['X_valid']
# y_valid = validation_data['y_valid']

In [18]:
X_valid_samples, y_valid_samples = np.asarray(X_test), np.asarray(y_test).reshape(len(y_test),1)
valid_sentences_length, valid_documents_length = estimate_sentences_and_document_lengths(X_valid_samples, vocab_dict['my_dummy'])

In [19]:
np_normalized_sentence_attentions, np_prob, np_y = sess.run([normalized_sentence_attentions, prob, y],
                                                            feed_dict={X:X_valid_samples, y:y_valid_samples,tf_sentences_length:valid_sentences_length, tf_documents_length:valid_documents_length,tf_keep_prob:1})
attention_scores = np.squeeze(np_normalized_sentence_attentions)

In [34]:
np_y = y_valid_samples
accuracy = sum((np_prob>0.5)==(np_y>0.5))/len(np_y)
accuracy

array([0.90377113])

In [21]:
_, grouped_test_data = preprocess_and_group_data(test_data)
grouped_test_data['prob'] = np_prob
grouped_test_data['imp_sent_num'] = np.argmax(attention_scores,1)

grouped_test_data_reduced = grouped_test_data[np_prob>0.98]
grouped_test_data_reduced.index= range(len(grouped_test_data_reduced))

In [37]:
grouped_test_data.shape

(769, 5)

In [39]:
grouped_test_data

Unnamed: 0,company1,company2,snippet,prob,imp_sent_num
0,ALFA_(Mexico),Axtel,"[ assets of its energy unit newpek, but that i...",2.156570e-04,0
1,ARIAD_Pharmaceuticals,Takeda_Pharmaceutical_Company,[japan's company2 a ready for fresh acquisitio...,6.837936e-04,18
2,AT&T,Boeing,"[ \n""immigrants or their children founded mor...",1.058349e-04,17
3,AT&T,Boost_Mobile,"[ they're on different carriers: company1 , v...",1.891391e-06,6
4,AT&T,Bright_House_Networks,[o) purchase of time warner cable and company...,1.733657e-03,0
5,AT&T,Buick,[\nthe sponsorship from monster would add to h...,3.907995e-06,0
6,AT&T,CBS_Corporation,"[ ( twx ) didn't have one either, until ceo je...",3.181220e-04,2
7,AT&T,CBS_Interactive,"[\n""that's crazy,"" said the company1 worker ...",9.958584e-01,0
8,AT&T,Calvin_Klein,[ they are industry professionals and have pro...,4.222701e-06,0
9,AT&T,Cricket_Wireless,"[ nevertheless, if you're interested, it'll wo...",4.575273e-05,2


In [38]:
grouped_test_data.to_csv("../Radi/onto-test/vicky-test.csv")

In [22]:
top_results = grouped_test_data.sort_values('prob', ascending=False)[:80]
top_results.index = range(len(top_results))

In [23]:
top_results

Unnamed: 0,company1,company2,snippet,prob,imp_sent_num
0,Danaher_Corporation,Pall_Corporation,[ company2 is being acquired by company1 (...,0.999997,0
1,Alibaba_Group,AutoNavi,[ but when users of the app attempt to use its...,0.999997,2
2,Berkshire_Hathaway,NV_Energy,[ buffett and musk have also competed in other...,0.999995,3
3,UniCredit,HypoVereinsbank,[ commerzbank has around 8 percent of the germ...,0.999995,2
4,Oracle_Corporation,MICROS_Systems,"[ company1 acquired micros in 2014, when it ...",0.999993,6
5,Boeing,Liquid_Robotics,[ company1 acquired undersea drone maker co...,0.999993,0
6,RightNow_Technologies,Oracle_Corporation,[ he sold company1 to company2 in 2012 for...,0.999990,0
7,Walmart,Walmart_de_México_y_Centroamérica,[ company1 owned company2 in mexico and th...,0.999989,16
8,Volkswagen_Group,Chevrolet,[ announced it would scrap a $1 billion invest...,0.999987,2
9,Boeing,Pfizer,"[ a group of major exporters, including compa...",0.999982,0


In [24]:
top_results.iloc[15]['company1'], top_results.iloc[15]['company2']

('Morgan_Stanley', 'YouTube')

In [25]:
def return_actual_parent (predicted_subsidiary_name):
    try:
        return (list(train_data[(train_data['company2']==predicted_subsidiary_name)&(train_data['is_parent'])==True]['company1'])[0])
    except:
        return 'No_parent'

In [26]:
count = 0
identified_pairs = []
top_results_fp_removed = []
for index, row in top_results.iterrows():
    actual_parent_name = return_actual_parent(row['company2'])
    if actual_parent_name=='No_parent' or actual_parent_name==row['company1']:
        if [row['company2'],row['company1']] not in identified_pairs:
            identified_pairs.append([row['company1'], row['company2']])
            top_results_fp_removed.append(row)

In [27]:
top_results_fp_removed = pd.DataFrame(top_results_fp_removed)
top_results_fp_removed.index = range(len(top_results_fp_removed))

In [28]:
top_results_fp_removed

Unnamed: 0,company1,company2,snippet,prob,imp_sent_num
0,Danaher_Corporation,Pall_Corporation,[ company2 is being acquired by company1 (...,0.999997,0
1,Alibaba_Group,AutoNavi,[ but when users of the app attempt to use its...,0.999997,2
2,Berkshire_Hathaway,NV_Energy,[ buffett and musk have also competed in other...,0.999995,3
3,UniCredit,HypoVereinsbank,[ commerzbank has around 8 percent of the germ...,0.999995,2
4,Oracle_Corporation,MICROS_Systems,"[ company1 acquired micros in 2014, when it ...",0.999993,6
5,Boeing,Liquid_Robotics,[ company1 acquired undersea drone maker co...,0.999993,0
6,RightNow_Technologies,Oracle_Corporation,[ he sold company1 to company2 in 2012 for...,0.99999,0
7,Walmart,Walmart_de_México_y_Centroamérica,[ company1 owned company2 in mexico and th...,0.999989,16
8,Boeing,Pfizer,"[ a group of major exporters, including compa...",0.999982,0
9,CareFusion,Becton_Dickinson,[\nacquisition of company1 corp. he oversaw ...,0.999916,2


In [29]:
print(top_results_fp_removed[['company1','company2']])

                           company1                           company2
0               Danaher_Corporation                   Pall_Corporation
1                     Alibaba_Group                           AutoNavi
2                Berkshire_Hathaway                          NV_Energy
3                         UniCredit                    HypoVereinsbank
4                Oracle_Corporation                     MICROS_Systems
5                            Boeing                    Liquid_Robotics
6             RightNow_Technologies                 Oracle_Corporation
7                           Walmart  Walmart_de_México_y_Centroamérica
8                            Boeing                             Pfizer
9                        CareFusion                   Becton_Dickinson
10               Unocal_Corporation                Chevron_Corporation
11                     Marathon_Oil                         U.S._Steel
12                        Citigroup                             Boeing
13    

In [30]:
text_num = 35

In [31]:
print('Predicted parent:',top_results_fp_removed['company1'][text_num])
print('Predicted subsidiary:',top_results_fp_removed['company2'][text_num])

Predicted parent: Textron
Predicted subsidiary: Boeing


In [32]:
print('Important Sentence')
print(top_results_fp_removed['snippet'][text_num][top_results_fp_removed['imp_sent_num'][text_num]])

Important Sentence
 instead, uber is building the software to manage networks in the sky of flying taxis, while relying on a stable of manufacturers, including aurora flight sciences, which was acquired by  company2  last month.
uber has also signed up embraer, mooney, bell helicopter - a unit of  company1  -, and pipistrel aircraft to build new vertical takeoff and landing aircraft. ###end###


In [33]:
print('Input snippets:')
for i, snippet in enumerate(top_results_fp_removed['snippet'][text_num]):
    print(i+1, snippet, end='\n\n')

Input snippets:
1 
the osprey is built by  company2  and  company1 's bell helicopter division. ###end###

2 
built by  company2  and bell, a unit of  company1 , the osprey program was nearly scrapped after a history of mechanical failures and two test crashes that killed 23 marines in 2000. ###end###

3 
mitch snyder, president & ceo, bell helicopter,  company1 
kevin mcallister, president and chief executive officer,  company2 , the  company2  co. ###end###

4 -india business council along with about 25 defense firms, including giants  company2 , lockheed martin and  company1 , defense news reports. ###end###

5 -india business council along with about 25 defense firms, including giants  company2 , lockheed martin and  company1 , defense news reports.  company2  will get some extra attention when parrikar stops by a  company2  facility near philadelphia that is churning out ch-47f chinook heavy-lift helicopters that india purchased last year as part of a $3. ###end###

6 
the osprey,