# Morphological Analysis 2 <br>
This 2nd analysis is to minimize number of words in senario table. Because if there are too many words in it, possible senarios are huge. So it's hard to find meaningful senarios for NSD(New Service Developement). <br>
So I have 3 options to minimize number of words. <br>
1. Counter <br>
- Small amounts of counted words can be deleted because it is considered as not that important. <br><br>
2. TF-IDF <br>
- TF-IDF helps to find words that not important in each documents. <br><br>
3. Co-occurrence <br>
- By calculating co-occurrence between each words, centralities for words would be found and can get rid of words which has small centrality. 

In [104]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer


import pandas as pd
import re, string, itertools, pickle
import spacy

## TF-IDF for 'People'

### 1. Preprocessing

In [106]:
# read
news_1 = pd.read_excel(r'C:\Users\AMD3600\git\Lab\healthcaremarket.xlsx')
news_2 = pd.read_excel(r'C:\Users\AMD3600\git\Lab\healthcareitnews_add.xlsx')
news_1 = news_1.iloc[447:,:]
news_1 = news_1['text']
news_2 = news_2['text']
news = pd.concat([news_1, news_2])

In [109]:
# Preprocessing
def pre_process_wordlist(txt):
    txt = txt.strip()
    txt = txt.lower()
    txt = txt.replace('\t', ' ')
    txt = txt.replace('\n', '')
    txt = txt.replace('.', ' ')
    #txt = re.sub('[^a-zA-Z0-9]',' ',txt)
    txt = re.sub('[$]','',txt)
    txt = re.sub('[–]','',txt)
    txt = re.sub('[0-9]',' ',txt)
    txt = txt.replace(u'\xa0', u' ')
    txt = txt.replace('  ', ' ')
    txt = txt.replace('   ', ' ')
    txt = txt.replace('  ', ' ')
    # txt = txt.replace(' .', '.')
    txt = re.sub('[-=+,#/\?:“”^"—$€£@*\"※~&%ㆍ!』’\\‘|\(\)\[\]\<\>`\'…》]', '', txt)
    txt = txt.strip()
    # Delete 1-2 length words
    #shortword = re.compile(r'\W*\b\w{1,2}\b')
    #txt = shortword.sub('', txt)
    return txt

In [110]:
preresults = news.apply(lambda x:pre_process_wordlist(x))

### 2. Noun Lemmatization

In [111]:
# lemmatize noun to make clear to count noun words
def lemm_noun(x):
    lemmatizer = WordNetLemmatizer()
    
    lemm_list = []
    
    tokens = word_tokenize(x)
    pos_words = pos_tag(tokens)
    
    for word, pos in pos_words:
        if pos[:2] == 'NN':
            lemm_list.append(lemmatizer.lemmatize(word, pos='n'))
        else:
            lemm_list.append(word)
    
    result = ' '.join(lemm_list)
    
    return result

In [112]:
noun_results = preresults.apply(lambda x:lemm_noun(x)).tolist()

### 3. Get TF-IDF

In [113]:
def tf_idf_dict(x):
    cv = CountVectorizer()
    noun_bow = cv.fit_transform(x)
    transformer = TfidfTransformer()
    tfidf_matrix = transformer.fit_transform(noun_bow)
    
    # create dictionary to find a tfidf word each word
    word2tfidf = dict(zip(cv.get_feature_names(), transformer.idf_))
        
    return word2tfidf

In [114]:
tfidf_dict = tf_idf_dict(noun_results)

### 4. Find words that indicate 'People'

In [98]:
# extract only noun with stopwords and get people with NER
def people_noun(x):
    stop = stopwords.words('english')
    is_noun = lambda pos: pos[:2] == 'NN'
    
    noun_list = []
    for doc in x:
        tokens = word_tokenize(doc)
        noun = [word for (word, pos) in pos_tag(tokens) if is_noun(pos)]
        noun_list.extend(noun)
    
    result = [i for i in noun_list if not i in stop]

    # split corpus due to the fact that nlp's limit is 1,000,000
    all_corpus = []
    for i in range(0, len(result), 8000):
        corpus = ' '.join(result[i:i+8000])
        all_corpus.append(corpus)
        
    # NER dictionary in spacy
    nlp = spacy.load('en_core_web_lg')

    ner_dict = {}
    for i in all_corpus:
        doc = nlp(i)
        for e in doc:
            if e.ent_type_ != "":
                if len(e) > 2:
                    ner_dict[e] = e.ent_type_
    
    # get words by each NER
    norp = []
    org = []
    product = []
    for i, j in ner_dict.items():
        if j == 'NORP':
            norp.append(str(i))
        elif j == 'ORG':
            org.append(str(i))
        elif j == 'PRODUCT':
            product.append(str(i))
            
    return norp, org, product

In [99]:
norp, org, product = people_noun(noun_results)

In [100]:
len(org)

59112

### 5. Make Dataframe words and tf-idf scores

In [80]:
def make_tfidf_dataframe(dic, filt):
    words = []
    scores = []
    for word, score in dic.items():
        if len(word) > 
        if word in filt:
            words.append(word)
            scores.append(score)
    words = pd.Series(words)
    scores = pd.Series(scores)
    tfidf_df = pd.DataFrame({'words' : words,
                             'tfidf' : scores}).sort_values(by=['tfidf'])
    
    return tfidf_df

In [81]:
tfidf_df = make_tfidf_dataframe(tfidf_dict, org)
with open('people_tfidf_df.pkl', 'wb') as f:
    pickle.dump(tfidf_df, f)

## TF-IDF for 'Tech'

### 1. get 'Tech' words

In [115]:
# extract two words after a specific word
def get_index(doc, cword):
    tokens = word_tokenize(doc)
    words_index = []
    for i, j in enumerate(tokens):
        if j == cword:
            word_index = [i+1, i+2]
            words_index.append(word_index)
        else:
            pass
        
    words = []
    for two_index in words_index:
        word = []
        for i, j in enumerate(tokens):
            if two_index[0] == i:
                word.append(j)
            elif two_index[1] == i:
                word.append(j)
                words.append(word)
            else:
                pass
            
    return words

In [116]:
# for technology
def get_tech_words(x):
    get_all = []
    for document in x:
        sys_words = ['by', 'with', 'using']
        get_all_words = []
        for i in sys_words:
            get_words = get_index(document, i)
            get_all_words.extend(get_words)
        get_all.extend(get_all_words)
    return get_all

In [117]:
noun_results_2 = preresults.apply(lambda x:lemm_noun(x))
tech_words = get_tech_words(noun_results_2)

In [118]:
# extract only verb with stopwords
def stopwords_noun(x):
    stopped_tech = []
    stop = stopwords.words('english')
    stop.extend(['healthcare', 'say', 'be', 'health', 'company', 'patient', 'others', 'help'])
    for words in x:
        if not words[0] in stop:
            stopped_tech.append(words[0])
        else:
            stopped_tech.append(words[1])
            
    pos_list = []
    for word in stopped_tech:
        pos_tagging = pos_tag([word])
        if pos_tagging[0][1][:2] == 'NN':
            pos_list.append(pos_tagging[0][0])
    
    return pos_list

In [119]:
tech_stopped = stopwords_noun(tech_words)

In [120]:
tfidf_df_2 = make_tfidf_dataframe(tfidf_dict, tech_stopped)
with open('tech_tfidf_df.pkl', 'wb') as f:
    pickle.dump(tfidf_df_2, f)