In [2]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from collections import Counter

import pandas as pd
import re, string, itertools
import spacy

In [3]:
# Load both news exports and keep only their article text.
# NOTE(review): the first file is read from row position 447 onward; the
# reason for skipping the earlier rows is not recorded in this notebook.
news_1 = pd.read_excel('healthcaremarket.xlsx').iloc[447:, :]['text']
news_2 = pd.read_excel('healthcareitnews_add.xlsx')['text']
news = pd.concat([news_1, news_2])

print(len(news))

2816


In [4]:
# Preprocessing
def pre_process_wordlist(txt):
    """Normalize one raw article string before tokenization.

    Steps, in order: trim and lower-case; turn tabs into spaces and delete
    newlines; put a space after every period; delete ``$`` and en dashes;
    blank out ASCII digits; turn non-breaking spaces into spaces; collapse
    runs of spaces; re-attach periods to the preceding word; delete the
    remaining punctuation/symbol characters; trim again.

    Parameters
    ----------
    txt : str
        Raw text. Any non-string value (for example the NaN that pandas
        produces for an empty Excel cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(txt, str):
        return ''

    # Characters deleted outright in the final clean-up step.
    punctuation = '-=+,#/?:“”^"—$€£@*※~&%ㆍ!』’‘|()[]<>`\'…》'

    txt = txt.strip().lower()
    txt = txt.replace('\t', ' ')
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words separated only by a line break are glued together -- confirm
    # this is intended.
    txt = txt.replace('\n', '')
    txt = txt.replace('.', '. ')
    txt = txt.replace('$', '').replace('–', '')
    txt = re.sub(r'[0-9]', ' ', txt)
    txt = txt.replace('\xa0', ' ')
    # Collapse every run of spaces, whatever its length; a fixed number of
    # replace() passes leaves double spaces behind for long runs such as a
    # blanked-out multi-digit number.
    txt = re.sub(r' {2,}', ' ', txt)
    txt = txt.replace(' .', '.')
    txt = txt.translate(str.maketrans('', '', punctuation))
    return txt.strip()

In [5]:
# extract two words after a specific word
def get_index(doc, cword):
    """Return the tokens that directly follow each occurrence of ``cword``.

    ``doc`` is tokenized with ``word_tokenize``; for every token equal to
    ``cword`` the next two tokens are collected.

    Parameters
    ----------
    doc : str
        Text to search.
    cword : str
        Marker token; compared to each token with ``==``.

    Returns
    -------
    list of list of str
        One inner list per occurrence, in document order. An inner list
        holds two tokens, or fewer (one or none) when the marker sits at the
        very end of the token sequence.
    """
    tokens = word_tokenize(doc)
    # A slice picks the following two tokens directly and is clipped at the
    # end of the list, so no second pass over the tokens is needed.
    return [
        tokens[position + 1:position + 3]
        for position, token in enumerate(tokens)
        if token == cword
    ]

In [45]:
# de-duplicate words (first-seen order); the counts themselves are discarded
def counter_df(x):
    """Return the distinct items of ``x`` in order of first appearance.

    The items are tallied with ``Counter``, but only the keys are returned;
    the counts are not part of the result.
    """
    return list(Counter(x))

In [7]:
# Clean every article with pre_process_wordlist.
preresults = news.apply(pre_process_wordlist)

# # System words

In [8]:
# for system
def get_system_words(x):
    """Collect the token groups that follow 'for' and 'to' in every document.

    For each document in ``x`` (in order) ``get_index`` is called first with
    'for' and then with 'to'; all returned token groups are concatenated
    into one flat list.
    """
    markers = ['for', 'to']
    return [
        following
        for document in x
        for marker in markers
        for following in get_index(document, marker)
    ]

In [9]:
# Token groups that follow "for"/"to" in every cleaned article.
system_words = get_system_words(preresults)

In [10]:
# extract only verb with stopwords
def stopwords_verb(x):
    """Pick one candidate word per token group and keep the lemmatized verbs.

    For each group the first token is used unless it is a stop word, in
    which case the second token is used instead (without a further
    stop-word check). Each candidate is POS-tagged on its own, and only
    words whose tag starts with 'VB' are kept and lemmatized as verbs.

    Parameters
    ----------
    x : iterable of sequence of str
        Token groups as produced by ``get_index``. Groups may hold fewer
        than two tokens; a group that offers no usable candidate is skipped.

    Returns
    -------
    list of str
        Verb lemmas, in input order, duplicates included.
    """
    # A set makes each stop-word membership test constant time.
    stop = set(stopwords.words('english'))
    stop.update(['healthcare', 'say', 'be'])

    candidates = []
    for words in x:
        if not words:
            # The marker word was the last token: nothing follows it.
            continue
        if words[0] not in stop:
            candidates.append(words[0])
        elif len(words) > 1:
            candidates.append(words[1])
        # else: the only following token is a stop word -> no candidate.

    verbs = []
    for word in candidates:
        tagged = pos_tag([word])
        if tagged[0][1][:2] == 'VB':
            verbs.append(tagged[0][0])

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(verb, pos='v') for verb in verbs]

In [11]:
# One candidate per token group, reduced to lemmatized verbs.
stopped_verb = stopwords_verb(system_words)

In [46]:
# Distinct verbs in first-seen order, as a Series (a column of the final table).
system_seri = pd.Series(counter_df(stopped_verb))

In [32]:
# Number of distinct "system" verbs.
len(system_seri)

1011

# # Technology

In [14]:
# for technology
def get_tech_words(x):
    """Collect the token groups that follow 'by', 'with' and 'using'.

    For each document in ``x`` (in order) ``get_index`` is called with each
    marker word in turn; all returned token groups are concatenated into one
    flat list.
    """
    markers = ['by', 'with', 'using']
    return [
        following
        for document in x
        for marker in markers
        for following in get_index(document, marker)
    ]

In [15]:
# Token groups that follow "by"/"with"/"using" in every cleaned article.
tech_words = get_tech_words(preresults)

In [16]:
# extract only nouns, skipping stopwords
def stopwords_noun(x):
    """Pick one candidate word per token group and keep the noun lemmas.

    For each group the first token is used unless it is a stop word, in
    which case the second token is used instead (without a further
    stop-word check). Candidates are lemmatized as nouns first; each lemma
    is then POS-tagged on its own and kept only if its tag starts with 'NN'.

    Parameters
    ----------
    x : iterable of sequence of str
        Token groups as produced by ``get_index``. Groups may hold fewer
        than two tokens; a group that offers no usable candidate is skipped.

    Returns
    -------
    list of str
        Noun lemmas, in input order, duplicates included.
    """
    # A set makes each stop-word membership test constant time.
    stop = set(stopwords.words('english'))
    stop.update(['healthcare', 'say', 'be', 'health', 'company', 'patient', 'others', 'help'])

    candidates = []
    for words in x:
        if not words:
            # The marker word was the last token: nothing follows it.
            continue
        if words[0] not in stop:
            candidates.append(words[0])
        elif len(words) > 1:
            candidates.append(words[1])
        # else: the only following token is a stop word -> no candidate.

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, pos='n') for word in candidates]

    nouns = []
    for lemma in lemmas:
        tagged = pos_tag([lemma])
        if tagged[0][1][:2] == 'NN':
            nouns.append(tagged[0][0])

    return nouns

In [17]:
# One candidate per token group, reduced to noun lemmas.
stopped_noun = stopwords_noun(tech_words)

In [47]:
# Distinct nouns in first-seen order, as a Series (a column of the final table).
tech_seri = pd.Series(counter_df(stopped_noun))

In [30]:
# Number of distinct "technology" nouns.
len(tech_seri)

3444

# # People

In [20]:
# extract only noun with stopwords
def people_noun(x):
    """Extract noun lemmas from every text in ``x``, dropping stop words.

    Each text is tokenized and POS-tagged as a whole; tokens whose tag
    starts with 'NN' are lemmatized as nouns, and lemmas found in NLTK's
    English stop-word list are discarded.

    Parameters
    ----------
    x : iterable of str
        Texts to process.

    Returns
    -------
    list of str
        Noun lemmas in order of appearance across all texts, duplicates
        included.
    """
    # A set keeps the per-lemma stop-word check constant time; this filter
    # runs once for every noun in the corpus.
    stop = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    result = []
    for sentence in x:
        for word, pos in pos_tag(word_tokenize(sentence)):
            if pos[:2] != 'NN':
                continue
            lemma = lemmatizer.lemmatize(word, pos='n')
            if lemma not in stop:
                result.append(lemma)

    return result

In [21]:
# Stop-word-free noun lemmas from every cleaned article, as one flat list.
people_tokens = people_noun(preresults)

In [22]:
# Re-join the nouns into strings of 8,000 tokens each, because nlp has an
# input limit of 1,000,000 (presumably spaCy's max_length, in characters).
all_corpus = [
    ' '.join(people_tokens[start:start + 8000])
    for start in range(0, len(people_tokens), 8000)
]

In [23]:
# Run the spaCy pipeline over each chunk and record the entity label of
# every token that carries one and whose len() exceeds 2.
# The dictionary keys are the spaCy token objects themselves, not strings.
nlp = spacy.load('en_core_web_lg')

ner_dict = {}

for chunk in all_corpus:
    ner_dict.update(
        (token, token.ent_type_)
        for token in nlp(chunk)
        if token.ent_type_ != "" and len(token) > 2
    )

In [24]:
# Sort the tagged tokens into one list per entity label of interest;
# tokens with any other label are ignored.
person, norp, org, product = [], [], [], []
label_to_words = {'PERSON': person, 'NORP': norp, 'ORG': org, 'PRODUCT': product}

for token, label in ner_dict.items():
    if label in label_to_words:
        label_to_words[label].append(str(token))

In [48]:
# Distinct entity words per label, in first-seen order.
product_seri, norp_seri, org_seri = (
    pd.Series(counter_df(words)) for words in (product, norp, org)
)

In [49]:
# Number of distinct words per entity label (product, NORP, org).
tuple(len(series) for series in (product_seri, norp_seri, org_seri))

(831, 213, 7899)

# # Make final dataframe

In [50]:
# Column lengths differ, so the merged table is padded with NaN.
print(*(len(series) for series in (system_seri, tech_seri, product_seri, norp_seri, org_seri)))

1011 3444 831 213 7899


In [51]:
# Place the five word Series side by side, one named column each; shorter
# Series are padded with NaN up to the length of the longest one.
morpho_df = pd.concat(
    [product_seri, norp_seri, org_seri, system_seri, tech_seri],
    axis=1,
    keys=['ppl_product', 'ppl_norp', 'ppl_org', 'system', 'technology'],
)
morpho_df

Unnamed: 0,ppl_product,ppl_norp,ppl_org,system,technology
0,winwin,iam,intelligence,keep,fda
1,apache,stratifi,business,see,ai
2,solr,saudi,pay,disrupt,safety
3,salesforce,babylonian,university,improve,digital
4,cytoflex,thingsiot,exeter,develop,aim
...,...,...,...,...,...
7894,,,leverage,,
7895,,,marr,,
7896,,,keypress,,
7897,,,insideout,,


In [53]:
# Write the merged table to an Excel workbook in the working directory.
morpho_df.to_excel('morpho_df.xlsx')