### Load Dataset

In [15]:
import pandas as pd
import numpy as np
import re
import nltk
import string

In [2]:
df = pd.read_csv("google_scholar.csv")
#load  extra stop words
stopwords_df = pd.read_csv('stopwords-en.csv', encoding = "ISO-8859-1", header=0)

In [3]:
df.shape

(52302, 8)

In [4]:
df.head()

Unnamed: 0,Index,Title,Citaions,Authors,Conference,Year,Conference Name,Abstract
0,1,Surveylance: Automatically Detecting Online Su...,0,"A Kharraz, W Robertson, E Kirda","39th S&P 2018:\r\nSan Francisco, CA, USA",2018,S&P,Online surveys are a popular mechanism for per...
1,2,EyeTell: Video-Assisted Touchscreen Keystroke ...,2,"Y Chen, T Li, R Zhang, Y Zhanga","39th S&P 2018:\r\nSan Francisco, CA, USA",2018,S&P,Keystroke inference attacks pose an increasing...
2,3,Understanding Linux Malware,4,"E Cozzi, M Graziano, Y Fratantonioa","39th S&P 2018:\r\nSan Francisco, CA, USA",2018,S&P,"For the past two decades, the security communi..."
3,4,SoK: Keylogging Side Channels,1,J Monaco,"39th S&P 2018:\r\nSan Francisco, CA, USA",2018,S&P,The first keylogging side channel attack was d...
4,5,"FuturesMEX: Secure, Distributed Futures Market...",2,"F Massacci, CN Ngo, J Nie, D Venturia","39th S&P 2018:\r\nSan Francisco, CA, USA",2018,S&P,"In a Futures-Exchange, such as the Chicago Mer..."


In [16]:
#load nltk stopwords
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cookiepoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
extra_stopwords = stopwords_df['stop_words'].tolist()
extra_stopwords.extend(['any','apply','applying','reapplying','given','papers','paper','about','results','result','real','world','page','article','present','takes','account', 'previous','work','propose','proposes','proposed','simply','simple','demonstrate','demonstrated','demonstrates','realworld','datasets','dataset','provide','important','research','researchers','experiments','experiment','unexpected','discovering','using','recent','collected','solve','columns','existing','traditional','final','consider','presented','provides','automatically','extracting','including','help','helps','explore','illustrate','achieve','better'])

In [18]:
# create a whole corpus of stop words
total_stop_words = stop_words + extra_stopwords
print(len(total_stop_words))

566


### Generate Phrases

Reference: https://kavita-ganesan.com/how-to-incorporate-phrases-into-word2vec-a-text-mining-approach/#.YGapXkhKhTY

In [43]:
# phrases to count
# text cleaning 
def phrase_to_counts(phrases):
    """ strip any white space and send back a count of 1"""
    clean_phrases = []

    for p in phrases:
        word = p.strip()

        # we only need to count phrases, so ignore unigrams
        if len(word) > 1 and ' ' in word:
            clean_phrases.append([word, 1])

    return clean_phrases


def remove_special_characters(text):
    """remove characters that are not indicators of phrase boundaries"""
    return re.sub("([{}@\"$%&\\\/*'\"]|\d)", "", text)


In [44]:
# generate candidate phrase
def generate_candidate_phrases(text, stopwords):
    """ generate phrases using phrase boundary markers """

    # generate approximate phrases with punctation
    coarse_candidates = text.lower().split()

    candidate_phrases = []

    for coarse_phrase in coarse_candidates:
        #"\\s+": match sequence of one or more whitespace characters
        words = re.split("\\s+", coarse_phrase)
        previous_stop = False

        # examine each word to determine if it is a phrase boundary marker or
        # part of a phrase or lone ranger
        for w in words:

            if w in stopwords and not previous_stop:
                # phrase boundary encountered, so put a hard indicator
                candidate_phrases.append(";")
                previous_stop = True
            elif w not in stopwords and len(w) > 3:
                # keep adding words to list until a phrase boundary is detected
                candidate_phrases.append(w.strip())
                previous_stop = False

    # get a list of candidate phrases without boundary demarcation
    phrases = re.split(";+", ' '.join(candidate_phrases))

    return phrases

In [45]:
def generate_and_tag_phrases(text, stopwords,min_phrase_count=1):
    """Find top phrases, tag corpora with those top phrases"""
    cleaned_txt = remove_special_characters(text)
    normalized_txt = generate_candidate_phrases(cleaned_txt, stopwords)
    cleaned_phrases = phrase_to_counts(normalized_txt)
    cleaned_phrases.sort(key = lambda x: x[1])
    cleaned_phrases = list(filter(lambda x:x[1] >= min_phrase_count, cleaned_phrases))
    cleaned_phrases.sort(key = lambda x: x[0])
    final_phrases = []
    for phrase in cleaned_phrases: 
      res = phrase[0].replace(" ", "_")  
      #print(res) 
      final_phrases.append(res)
      #print(final_phrases)
    return final_phrases

In [46]:
# create input as list of strings 
df_abs = df[['Abstract']]

In [47]:
df_titles = df[['Title']]

In [48]:
df_titles.head()

Unnamed: 0,Title
0,Surveylance: Automatically Detecting Online Su...
1,EyeTell: Video-Assisted Touchscreen Keystroke ...
2,Understanding Linux Malware
3,SoK: Keylogging Side Channels
4,"FuturesMEX: Secure, Distributed Futures Market..."


In [49]:
def clean_titles(titles):
    clean_titles = []
    for title in titles: 
        if ":" in title:
            title_lst = title.split(":")
            clean_titles.append(title_lst[0])
        else: 
            clean_titles.append(title)
    return clean_titles

In [50]:
titles = df_titles['Title'].tolist()
short_titles = clean_titles(titles)

In [79]:
def generate_uni_titles(short_titles):
    uni_titles = []
    for title in short_titles:
        lst = title.split(" ")
        if len(lst) > 1: 
            uni_title = generate_and_tag_phrases(title, total_stop_words,min_phrase_count=1)
            uni_titles.append(uni_title)
        else: 
            uni_titles.append([title])
    return uni_titles

In [80]:
uni_titles = generate_uni_titles(short_titles)

In [53]:
df_abs.shape

(52302, 1)

In [54]:
df_labels = pd.read_csv('GartnerHypeCycle_EmergingTech.csv', header=0, error_bad_lines=False)

In [55]:
# concatenate labels and abstract
abstract_text = df_abs.rename(columns = {'Abstract':'text'})
label_text = df_labels.rename(columns = {'Technology': 'text'})
df_text = abstract_text.append(label_text)
print(df_text.shape)

(52332, 3)


In [56]:
train_text = df_text['text'].tolist()
print(len(train_text))

52332


In [57]:
# create input train data with duplicate ngrams phrase
abs_lab = []
for doc in train_text: 
    doc_phrases = generate_and_tag_phrases(doc, total_stop_words,min_phrase_count=1)
    abs_lab.append(doc_phrases)

In [100]:
abs_phrases = abs_lab[:52302]
# label_phrases = abs_lab[52302:]

In [103]:
label_phrases = []
for lab in label_text['text']:
    lab_lst = lab.split(" ")
    lab_string = "_".join(lab_lst)
    label_phrases.append(lab_string)

In [68]:
# remove duplicate phrases in each document for abstract
# set the phrases in the first doc as a reference to remove duplicate values
abs_no_dup = []
for doc in abs_phrases[:]:
    unique_phrases = set(doc)
    abs_no_dup.append(list(unique_phrases))

In [73]:
df_ab_no_dup = pd.DataFrame(abs_no_dup)

In [104]:
df_lab = pd.DataFrame(label_phrases)
df_lab.to_csv("label_phrases.csv")

In [74]:
df_ab_no_dup.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,100,101,102
0,survey_scams.,identifying_unique_websites_involved,expose_users,surveylance_works,access_codes,"identity_fraud,_deceptive_advertisements,_pote...",survey_scam_detection,"services,_mapping",large_number,survey_scam_ecosystem,...,,,,,,,,,,
1,soft_keyboard,victims_inputting_process,visually_observe,touchscreen_device,ubiquitous_mobile_devices.,android_devices_confirm,continuous_movements.,"prior_work,_eyetell_requires",human_eyes_naturally_focus,high_efficacy,...,,,,,,,,,,
2,aspect_causes,time_frame,large-scale_measurement_study_conducted,personal_computers.,embedded_devices,mirai_botnet)_mainly,"windows-based_operating_systems._however,",challenges_involved,"network-level_behavior,",fighting_malicious_programs,...,,,,,,,,,,
3,electromagnetic_spike_emanating,"spike,_emitted",idealized_spatial,"style._finally,",substantial_measurement_error.,"then,_keylogging_attacks",channel_attacks,current_state-of-the-art_keylogging,channel_reveals_physical_locations,channel_attack,...,,,,,,,,,,
4,"positions,_absence",abort_absence,honest_majority),exchanges_essentially_non-monotonic_security_b...,desired_currency.,sell_contractual_promises_(futures),concept_implementation,"chicago_mercantile_exchange,_traders",low-frequency_markets,centralized_functionality,...,,,,,,,,,,


In [75]:
df_ab_no_dup.shape

(52302, 103)

In [76]:
df_ab_no_dup_title = df_ab_no_dup.copy()

In [81]:
df_ab_no_dup_title.insert(0, "Title", uni_titles)

In [87]:
df_ab_no_dup_title['Title'] = df_ab_no_dup_title['Title'].astype(str).str.replace('\[|\]|\'', '')

  """Entry point for launching an IPython kernel.


In [88]:
df_ab_no_dup_title.head()

Unnamed: 0,Title,0,1,2,3,4,5,6,7,8,...,93,94,95,96,97,98,99,100,101,102
0,Surveylance,survey_scams.,identifying_unique_websites_involved,expose_users,surveylance_works,access_codes,"identity_fraud,_deceptive_advertisements,_pote...",survey_scam_detection,"services,_mapping",large_number,...,,,,,,,,,,
1,EyeTell,soft_keyboard,victims_inputting_process,visually_observe,touchscreen_device,ubiquitous_mobile_devices.,android_devices_confirm,continuous_movements.,"prior_work,_eyetell_requires",human_eyes_naturally_focus,...,,,,,,,,,,
2,understanding_linux_malware,aspect_causes,time_frame,large-scale_measurement_study_conducted,personal_computers.,embedded_devices,mirai_botnet)_mainly,"windows-based_operating_systems._however,",challenges_involved,"network-level_behavior,",...,,,,,,,,,,
3,SoK,electromagnetic_spike_emanating,"spike,_emitted",idealized_spatial,"style._finally,",substantial_measurement_error.,"then,_keylogging_attacks",channel_attacks,current_state-of-the-art_keylogging,channel_reveals_physical_locations,...,,,,,,,,,,
4,FuturesMEX,"positions,_absence",abort_absence,honest_majority),exchanges_essentially_non-monotonic_security_b...,desired_currency.,sell_contractual_promises_(futures),concept_implementation,"chicago_mercantile_exchange,_traders",low-frequency_markets,...,,,,,,,,,,


### Split Longer Phrases with length more than 3

In [89]:
def remove_underline(df):
    corpus_phrases = []
    for i in range(df.shape[0]):
        doc_phrases = []
        for phrase in df.iloc[i]:
            if str(phrase) != 'nan':
                phrase = str(phrase)
                # print(phrase)
                res = phrase.replace("_", " ")
                regex = re.compile('[^a-zA-Z]')
                res2 = regex.sub(' ', res)
                doc_phrases.append(res2)
            # else:
                # doc_phrases.append(None)
        corpus_phrases.append(doc_phrases)
    return corpus_phrases

In [90]:
# remove NaN
def remove_empty(abstract):
    # remove NaN
    clean_abstract = []
    for doc in abstract:
        clean_doc = []
        for phrase in doc:
            if phrase == " ":
                continue
            clean_doc.append(phrase)
        clean_abstract.append(clean_doc)
    return clean_abstract

In [91]:
# count number of phrases > 3
def detect_longer_phrase(corpus_phrases):
    num_length3 = 0
    num_length_more = 0
    for lst in corpus_phrases: 
        for phrase in lst:
            if len(phrase.split(" "))> 3 or len(phrase.split(","))> 3:
                num_length_more += 1
            elif len(phrase.split(" "))> 3 or len(phrase.split(","))> 3: 
                num_length3 +=1
    print("There are {} phrases with length three.".format(num_length3))
    print("There are {} phrases with lenth more than three.".format(num_length_more))


In [92]:
def get_longer_phrase(corpus_phrases):
    longers = []
    for lst in corpus_phrases: 
        for phrase in lst:
            if len(phrase.split(" "))> 3 or len(phrase.split(","))> 3:
                longers.append(phrase)
            else:
                continue
    return longers

In [93]:
# pos tagger to idenitfy structure of phrase
def identify_structure(phrases_longer, n):
    for phrase in phrases_longer[:n]: 
        tokens = nltk.word_tokenize(phrase)
        print(nltk.pos_tag(tokens))

In [94]:
def clean_longer_phrase(phrases_longer):
    # total_clean_phrases = []
    candidates = []
    tokens = nltk.word_tokenize(phrases_longer)
    tag_tokens = nltk.pos_tag(tokens)
    for tag_token in tag_tokens:
        # at the beginning and it's a verb 
        if tag_token == tag_tokens[0] and re.match(r'VB', tag_token[1]):
            continue
        # at the end and it's a verb or an adverb
        elif tag_token == tag_tokens[len(tag_tokens)-1] and (re.match(r'VB', tag_token[1]) or re.match(r'RB', tag_token[1])):
            continue
        candidates.append(tag_token[0])
    # for those phrases to keep all words, split 4 word phrase into 2 word phrase 
    if "," not in phrases_longer: 
        lst = phrases_longer.split(" ")
    else: 
        lst = phrases_longer.split(",")
    if len(lst) >3:
        tmp1 = " ".join(lst[:3]).replace('e g', "").strip()
        tmp2 = " ".join(lst[3:]).replace('e g', "").strip()
        return tmp1, tmp2
    shorten_tmp = " ".join(candidates[:]).replace('e g', "").strip()
    return None, shorten_tmp

In [95]:
def clean_all_phrase(corpus):
    new_corpus = []
    for doc in corpus:
        new_doc = []
        for phrase in doc:
            # clean all phrases with length more than three
            if len(phrase.split(" "))> 3 or len(phrase.split(","))> 3:
                first_phrase, sec_phrase = clean_longer_phrase(phrase)
                if (not first_phrase) and sec_phrase:
                    new_doc.append(sec_phrase)
                elif first_phrase and sec_phrase:
                    new_doc.append(first_phrase)
                    new_doc.append(sec_phrase)
            # keep all phrases with length less than three
            else:
                new_doc.append(phrase)
        new_corpus.append(new_doc)
    return new_corpus

In [96]:
# transform  phrases back to unigrams
def transform_unigrams(corpus):
    uni_corpus = []
    for doc in corpus: 
        uni_doc = []
        for phrase in doc:
            phrase = phrase.strip()
            res = phrase.replace(" ", "_")
            uni_doc.append(res)
        uni_corpus.append(uni_doc)
    return uni_corpus

In [97]:
def split_longer_phrases(df):
    no_underline_abs_phrases = remove_underline(df)
    abs_phrases_ = remove_empty(no_underline_abs_phrases)
    detect_longer_phrase(abs_phrases_)
    clean_abs_phrases = clean_all_phrase(abs_phrases_)
    uni_phrase_corpus = transform_unigrams(clean_abs_phrases)
    df_abs_phrases = pd.DataFrame(uni_phrase_corpus)
    return df_abs_phrases

In [98]:
df_abs_phrases_notitles = split_longer_phrases(df_ab_no_dup)
df_abs_phrases_notitles.to_csv("bi_tri_phrases_notitles.csv")

There are 0 phrases with length three.
There are 377694 phrases with lenth more than three.


In [99]:
df_abs_phrases_titles = split_longer_phrases(df_ab_no_dup_title)
df_abs_phrases_titles.to_csv("bi_tri_phrases_titles.csv")

There are 0 phrases with length three.
There are 421148 phrases with lenth more than three.
