In [None]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

In [None]:
from bert_serving.client import BertClient
bc = BertClient()

In [None]:
import re
import os
import json
import nltk
import gensim
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

STOPWORDS = stopwords.words("english")

def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub(' ', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [None]:
import os
import pandas as pd
from nltk import sent_tokenize
count = 1

def extract_imperatives(reviews): #it takes a list of reviews and returns a list of imperatives
    imperatives = []
    
    for review in reviews:
        
        sentences = sent_tokenize(review)
    
        for sent in sentences: 
            result = nlp.annotate(sent,
                            properties={
                                'annotators': 'pos',
                                'outputFormat': 'json',
                                'timeout': 1000,
                            })
            try:
                if "VB" in result["sentences"][0]["tokens"][0]["pos"]:
                    imperatives.append(sent)
                
            except:
                pass
    return imperatives

In [None]:
def reminder(clean_reviews):
    words = []

    for sentence in clean_reviews:
        if len(sentence) == 0:
            continue
        temp = nltk.word_tokenize(sentence)
        words.extend(temp)
    combined_sentence = [""]
    combined_sentence[0] = " ".join(words)
    words_vec = bc.encode(words)
    sent_vec = bc.encode(combined_sentence)
    similarities = cosine_similarity(sent_vec, words_vec)[0]

    sorted_list = np.argsort(similarities)[::-1]
    words_bert = []

    for k in sorted_list:
        if words[k] in words_bert or len(words_bert) >= 20:
            continue
        words_bert.append(words[k])
        
    words_tfidf = []

    vectorizer = TfidfVectorizer()

    X = vectorizer.fit_transform(clean_reviews)
    X = np.array(X.todense())
    X = np.mean(X, axis = 0)

    X = np.argsort(X)[::-1]

    temp = vectorizer.get_feature_names()

    for i in X:
        words_tfidf.append(temp[i])
    clean_reviews_tokens = []

    for i in clean_reviews:
        clean_reviews_tokens.append(i.split())

    dictionary = gensim.corpora.Dictionary(clean_reviews_tokens)
    bow_corpus = [dictionary.doc2bow(doc) for doc in clean_reviews_tokens]

    lda_model =  gensim.models.LdaMulticore(bow_corpus, num_topics = 10, id2word = dictionary, passes = 10, workers = 2)
    words_lda = []

    for idx, topic in lda_model.print_topics(-1):
        temp = topic.split('"')
        for i in range(len(temp)):
            if i%2 == 0 or temp[i] in words_lda:
                continue
            words_lda.append(temp[i])
    final_list = list(set(words_bert) & set(words_tfidf) & set(words_lda))

    return words_tfidf, words_lda, words_bert

In [None]:
import os
import pandas as pd
count = 1


for file_name in os.listdir('data/Recent'):
    print(count, 'of', len(os.listdir('data/Recent')))
    count += 1
    if file_name in os.listdir('bert_lda_tfidf'):
        continue
    df = pd.read_csv('data/Recent/' + file_name)
    reviews = df.iloc[:, 1].values
    imperatives = extract_imperatives(reviews)
    clean_imperatives = list(map(clean_text, imperatives))
    words_tfidf, words_lda, words_bert = reminder(clean_imperatives)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    df3 = pd.DataFrame()
    
    df1['bert'] = words_bert
    df2['tfidf'] = words_tfidf
    df3['lda'] = words_lda

    df1.to_csv('bert_lda_tfidf/'+file_name[:-4]+'_bert.csv', index=False)
    df2.to_csv('bert_lda_tfidf/'+file_name[:-4]+'_lda.csv', index=False)
    df3.to_csv('bert_lda_tfidf/'+file_name[:-4]+'_tfidf.csv', index=False)

In [6]:
import os
import pickle
import pandas as pd
from fuzzywuzzy import process

def calculate_similarity(lst):
    new_lst = []
    for obj in lst:
        try:
            similar_obj, score = process.extract(obj, lst)[1]
        except:
            return lst
        #print(obj, similar_obj, score)
        large = obj
        smaller = similar_obj
        if len(similar_obj) > len(obj):
            large = similar_obj
            smaller = obj
        if score > 80 or smaller in large:
            new_lst.append(smaller)
            continue

        new_lst.append(obj)

    return list(set(new_lst))

In [7]:
total_bert = []
total_lda = []
total_tf_idf = []

all_objects = pd.read_csv('all_objects_clubbed.csv')

for file_name in os.listdir('bert_lda_tfidf'):
    df = pd.read_csv('bert_lda_tfidf/' + file_name)
    objects = list(df.iloc[:, 0].values)
    if len(objects) >= 10:
        objects = objects[:10]
    if file_name.endswith('bert.csv'):
        total_bert += objects
    elif file_name.endswith('lda.csv'):
        total_lda += objects
    elif file_name.endswith('tfidf.csv'):
        total_tf_idf += objects

In [None]:
objects = list(all_objects.iloc[:, 0].values)
labels = list(all_objects.iloc[:, 1].values)

In [None]:
score = 0
for obj in total_bert:
    obj, score= process.extract(obj, objects)[0]
    if score <= 75:
        continue
    if labels[objects.index(obj)] == 1:
        score += 1
    else:
        continue

print("Precision for bert", score/len(total_bert))

In [None]:
score = 0
for obj in total_lda:
    obj, score= process.extract(obj, objects)[0]
    if score <= 75:
        continue
    if labels[objects.index(obj)] == 1:
        score += 1
    else:
        continue

print("Precision for bert", score/len(total_lda))

In [None]:
score = 0
for obj in total_tf_idf:
    obj, score= process.extract(obj, objects)[0]
    if score <= 75:
        continue
    if labels[objects.index(obj)] == 1:
        score += 1
    else:
        continue

print("Precision for bert", score/len(total_tf_idf))

In [8]:
import ast

popular_mentions = []

for file_name in os.listdir('popular_mentions'):
    df = pd.read_csv('popular_mentions/' + file_name)
    objects = df.iloc[0, 1]
    objects = ast.literal_eval(objects)
    popular_mentions += objects[7:]

In [None]:
score = 0
for obj in popular_mentions:
    obj, score= process.extract(obj, objects)[0]
    if labels[objects.index(obj)] == 1:
        score += 1
    else:
        continue

print("Precision for popular_mentions", score/len(popular_mentions))

In [9]:
all_objects = []

for file_name in os.listdir('ner_objects'):
    df = pd.read_csv('ner_objects/' + file_name)
    objects = list(df.iloc[:, 0].values)
    all_objects += objects

In [None]:
score = 0
for obj in total_bert:
    obj, score= process.extract(obj, all_objects)[0]
    if score >= 75:
        score += 1
    else:
        continue

print("recall for bert", score/len(total_bert))

In [10]:
score = 0
count = 1
for obj in total_lda:
    print(count, 'of', len(total_lda))
    count += 1
    obj, score= process.extract(obj, all_objects)[0]
    if score >= 75:
        score += 1
    else:
        continue

print("recall for lda", score/len(total_lda))

 of 3319
1785 of 3319
1786 of 3319
1787 of 3319
1788 of 3319
1789 of 3319
1790 of 3319
1791 of 3319
1792 of 3319
1793 of 3319
1794 of 3319
1795 of 3319
1796 of 3319
1797 of 3319
1798 of 3319
1799 of 3319
1800 of 3319
1801 of 3319
1802 of 3319
1803 of 3319
1804 of 3319
1805 of 3319
1806 of 3319
1807 of 3319
1808 of 3319
1809 of 3319
1810 of 3319
1811 of 3319
1812 of 3319
1813 of 3319
1814 of 3319
1815 of 3319
1816 of 3319
1817 of 3319
1818 of 3319
1819 of 3319
1820 of 3319
1821 of 3319
1822 of 3319
1823 of 3319
1824 of 3319
1825 of 3319
1826 of 3319
1827 of 3319
1828 of 3319
1829 of 3319
1830 of 3319
1831 of 3319
1832 of 3319
1833 of 3319
1834 of 3319
1835 of 3319
1836 of 3319
1837 of 3319
1838 of 3319
1839 of 3319
1840 of 3319
1841 of 3319
1842 of 3319
1843 of 3319
1844 of 3319
1845 of 3319
1846 of 3319
1847 of 3319
1848 of 3319
1849 of 3319
1850 of 3319
1851 of 3319
1852 of 3319
1853 of 3319
1854 of 3319
1855 of 3319
1856 of 3319
1857 of 3319
1858 of 3319
1859 of 3319
1860 of 3319
186

In [11]:
score = 0
count = 1

for obj in total_tf_idf:
    print(count, 'of', len(total_lda))
    count += 1
    obj, score= process.extract(obj, all_objects)[0]
    if score >= 75:
        score += 1
    else:
        continue

print("recall for tf-idf", score/len(total_tf_idf))

 3319
1785 of 3319
1786 of 3319
1787 of 3319
1788 of 3319
1789 of 3319
1790 of 3319
1791 of 3319
1792 of 3319
1793 of 3319
1794 of 3319
1795 of 3319
1796 of 3319
1797 of 3319
1798 of 3319
1799 of 3319
1800 of 3319
1801 of 3319
1802 of 3319
1803 of 3319
1804 of 3319
1805 of 3319
1806 of 3319
1807 of 3319
1808 of 3319
1809 of 3319
1810 of 3319
1811 of 3319
1812 of 3319
1813 of 3319
1814 of 3319
1815 of 3319
1816 of 3319
1817 of 3319
1818 of 3319
1819 of 3319
1820 of 3319
1821 of 3319
1822 of 3319
1823 of 3319
1824 of 3319
1825 of 3319
1826 of 3319
1827 of 3319
1828 of 3319
1829 of 3319
1830 of 3319
1831 of 3319
1832 of 3319
1833 of 3319
1834 of 3319
1835 of 3319
1836 of 3319
1837 of 3319
1838 of 3319
1839 of 3319
1840 of 3319
1841 of 3319
1842 of 3319
1843 of 3319
1844 of 3319
1845 of 3319
1846 of 3319
1847 of 3319
1848 of 3319
1849 of 3319
1850 of 3319
1851 of 3319
1852 of 3319
1853 of 3319
1854 of 3319
1855 of 3319
1856 of 3319
1857 of 3319
1858 of 3319
1859 of 3319
1860 of 3319
1861 o

In [12]:
score = 0
count = 1

for obj in popular_mentions:
    print(count, 'of', len(total_lda))
    count += 1
    obj, score= process.extract(obj, all_objects)[0]
    if score >= 75:
        score += 1
    else:
        continue

print("recall for popular-mentions", score/len(popular_mentions))


10425 of 3319
10426 of 3319
10427 of 3319
10428 of 3319
10429 of 3319
10430 of 3319
10431 of 3319
10432 of 3319
10433 of 3319
10434 of 3319
10435 of 3319
10436 of 3319
10437 of 3319
10438 of 3319
10439 of 3319
10440 of 3319
10441 of 3319
10442 of 3319
10443 of 3319
10444 of 3319
10445 of 3319
10446 of 3319
10447 of 3319
10448 of 3319
10449 of 3319
10450 of 3319
10451 of 3319
10452 of 3319
10453 of 3319
10454 of 3319
10455 of 3319
10456 of 3319
10457 of 3319
10458 of 3319
10459 of 3319
10460 of 3319
10461 of 3319
10462 of 3319
10463 of 3319
10464 of 3319
10465 of 3319
10466 of 3319
10467 of 3319
10468 of 3319
10469 of 3319
10470 of 3319
10471 of 3319
10472 of 3319
10473 of 3319
10474 of 3319
10475 of 3319
10476 of 3319
10477 of 3319
10478 of 3319
10479 of 3319
10480 of 3319
10481 of 3319
10482 of 3319
10483 of 3319
10484 of 3319
10485 of 3319
10486 of 3319
10487 of 3319
10488 of 3319
10489 of 3319
10490 of 3319
10491 of 3319
10492 of 3319
10493 of 3319
10494 of 3319
10495 of 3319
10496