In [101]:
import re, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
from nltk.tag import pos_tag
from collections import Counter
from matplotlib import cm

import gensim, pickle, math
%matplotlib inline

In [45]:
# Load df
# NOTE(review): pickle.load can run arbitrary code while unpickling, so
# 'morpho_df.pkl' must come from a trusted source.
# The path is relative, so it is resolved against the kernel's working directory.
with open('morpho_df.pkl', 'rb') as f:
    df = pickle.load(f)

# # w2v using pretrained google news model

In [3]:
# Load pretrained google news w2v model
# NOTE(review): the path below is an absolute, machine-specific location; this
# cell fails on any machine where that exact file does not exist.
# `binary=True` tells gensim that the file is in the binary word2vec format.
model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/user/Desktop/Lab/박민규/text_mining/GoogleNews-vectors-negative300.bin', binary=True)  

In [46]:
# remove duplicates and make list for each column
# Each column is stripped of missing values, deduplicated through a set and
# turned back into a list; the printed number is the count of unique values.
# NOTE(review): list(set(...)) has no guaranteed order. If the values are
# strings (presumably - TODO confirm), the order can differ between kernel
# sessions, which changes the order of the rows handed to the clustering cells.
product = list(set(df['ppl_product'].dropna()))
print(len(product))
norp = list(set(df['ppl_norp'].dropna()))
print(len(norp))
org = list(set(df['ppl_org'].dropna()))
print(len(org))
tech = list(set(df['technology'].dropna()))
print(len(tech))
# NOTE(review): `sys` is also the name of a standard-library module; this
# variable would shadow it if `import sys` were ever added to the notebook.
sys = list(set(df['system'].dropna()))
print(len(sys))

814
209
7848
2873
578


In [29]:
# fetch the 300-dimension w2v vector of every word in a list
def get_w2v(x):
    """Look up each item of ``x`` in the module-level ``model``.

    Items for which ``model[item]`` raises ``KeyError`` are skipped.

    Parameters
    ----------
    x : iterable
        Words to look up.

    Returns
    -------
    tuple of (list, list)
        The vectors that were found, and the words they belong to, in the
        same order (both lists have the same length).
    """
    vectors, kept_words = [], []
    for word in x:
        try:
            vector = model[word]
        except KeyError:
            # word is not in the model's vocabulary: leave it out of both lists
            continue
        vectors.append(vector)
        kept_words.append(word)
    return vectors, kept_words

In [47]:
# Look up the vectors for every deduplicated word list. Each call returns the
# vectors plus the words that were actually found in `model` (the *_new lists),
# so the two results of one call are parallel lists.
product_w2v, product_new = get_w2v(product)
norp_w2v, norp_new = get_w2v(norp)
org_w2v, org_new = get_w2v(org)
tech_w2v, tech_new = get_w2v(tech)
sys_w2v, sys_new = get_w2v(sys)

# # K-means

In [102]:
# K means, word/Index dictionary
def k_means(x, num_culsters, y, random_state=None):
    """Cluster vectors with K-means and map every name to its cluster index.

    Parameters
    ----------
    x : sequence of vectors
        Data handed to ``KMeans.fit_predict``; one row per name.
    num_culsters : int
        Number of clusters (the parameter keeps its historical spelling so
        existing keyword callers continue to work).
    y : sequence
        Names parallel to the rows of ``x``.
    random_state : optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous,
        unseeded behaviour; pass an int to get repeatable assignments.

    Returns
    -------
    dict
        ``{name: cluster index}`` in the order of ``y``.

    Raises
    ------
    ValueError
        If the number of names differs from the number of predicted labels.
    """
    kmeans_clustering = KMeans(n_clusters=num_culsters, random_state=random_state)
    idx = list(kmeans_clustering.fit_predict(x))
    # A length mismatch would pair names with the wrong labels (or silently
    # drop labels), so fail loudly instead.
    if len(idx) != len(y):
        raise ValueError(
            'got {} names for {} clustered vectors'.format(len(y), len(idx))
        )
    return dict(zip(y, idx))

In [89]:
# GMM
def get_gmm(x, y, num_clusters):
    """Fit a Gaussian mixture on ``x`` and map every word to its component.

    Parameters
    ----------
    x : sequence of vectors
        Data used both to fit the mixture and to predict the labels.
    y : sequence
        Words parallel to the rows of ``x``.
    num_clusters : int
        Number of mixture components.

    Returns
    -------
    dict
        ``{word: predicted component label}``.
    """
    mixture = GaussianMixture(n_components=num_clusters, random_state=0)
    fitted = mixture.fit(x)
    labels = list(fitted.predict(x))

    word_centroid_map = {}
    for position in range(len(y)):
        word_centroid_map[y[position]] = labels[position]
    return word_centroid_map

In [154]:
# K-means
# Cluster every word list; the second argument is the number of clusters.
# NOTE(review): the cluster counts are hard-coded and are passed again,
# unchanged, to the cluster_df(...) calls that regroup these results.
# NOTE(review): no seed is passed, so the assignments can change from run to run.
product_cluster = k_means(product_w2v, 70, product_new)
norp_cluster = k_means(norp_w2v, 24, norp_new)
org_cluster = k_means(org_w2v, 215, org_new)
tech_cluster = k_means(tech_w2v, 240, tech_new)
sys_cluster = k_means(sys_w2v, 245, sys_new)

In [145]:
# gmm
# Gaussian-mixture clustering of the `sys` words with 134 components.
# Note the argument order (vectors, words, count) differs from
# k_means (vectors, count, words).
sys_gmm = get_gmm(sys_w2v, sys_new, 134)

In [68]:
# Number of words per list for which get_w2v found a vector in the model.
print(len(product_new), len(norp_new), len(org_new), len(tech_new), len(sys_new))

505 81 4031 2172 476


In [155]:
def cluster_df(x, y):
    """Group the words of a word -> cluster-label mapping by label.

    Parameters
    ----------
    x : dict
        Maps each word to its cluster label (e.g. the dict returned by
        ``k_means``).
    y : int
        Number of clusters; the labels ``0 .. y-1`` are collected.

    Returns
    -------
    list of list
        ``result[c]`` holds the words labelled ``c``, in the iteration order
        of ``x``. A label without words yields an empty list; words whose
        label lies outside ``0 .. y-1`` are left out.
    """
    # One bucket per label, filled in a single pass over the mapping. The dict
    # lookup matches labels by equality, so plain and NumPy integers both work.
    buckets = {cluster: [] for cluster in range(0, y)}
    for word, label in x.items():
        bucket = buckets.get(label)
        if bucket is not None:
            bucket.append(word)
    return [buckets[cluster] for cluster in range(0, y)]

In [156]:
# Turn every word -> cluster mapping into a list of word lists (one per cluster).
# The second argument is the number of clusters to collect; it repeats the
# count that was used when the corresponding *_cluster mapping was built.
final_product = cluster_df(product_cluster, 70)
final_norp = cluster_df(norp_cluster, 24)
final_org = cluster_df(org_cluster, 215)
final_tech = cluster_df(tech_cluster, 240)
final_sys = cluster_df(sys_cluster, 245)

In [158]:
# Sort the product clusters by size: clusters of 2-15 words are printed,
# larger ones are collected into size buckets for further processing.
again15_30, again31_40, again40_50 = [], [], []
for i in final_product:
    if 1 < len(i) <= 15:
        print(i, '\n')
    elif 15 < len(i) <= 30:
        again15_30.append(i)
    elif 30 < len(i) <= 40:
        again31_40.append(i)
    elif 40 < len(i) <= 50:
        again40_50.append(i)
    # clusters with fewer than 2 or more than 50 words are ignored

['anderson', 'watson', 'mario', 'johnson', 'fda'] 

['analysis', 'study', 'investigation', 'data', 'report', 'review', 'audit'] 

['vulnerability', 'flaw'] 

['partnership', 'collaboration'] 

['trojan', 'spyware', 'virus'] 

['hack', 'hacker', 'hacking'] 

['view', 'viewpoint', 'perspective'] 

['market', 'subsectors', 'sector'] 

['apple', 'orchard'] 

['doctor', 'surgeon', 'radiologist'] 

['analytics', 'optimization'] 

['imaging', 'visualization', 'workflow'] 

['ceo', 'cio', 'svp', 'cfo'] 

['mathematics', 'science'] 

['cardiology', 'radiology'] 

['medicare', 'health', 'heath', 'healthcare'] 

['million', 'rate', 'percent'] 

['enigma', 'task', 'puzzle', 'challenge', 'conundrum'] 

['xml', 'sql', 'ibm', 'api'] 

['terabyte', 'gigabyte'] 

['acuity', 'nurse', 'clinician', 'care', 'hospital', 'patient', 'medicine'] 



In [161]:
# How many product clusters fell into each size bucket (16-30, 31-40, 41-50 words).
print(len(again15_30), len(again31_40), len(again40_50))

3 0 0


In [159]:
def print_others(x, number):
    """Re-cluster every word group in ``x`` and print the resulting sub-clusters.

    For each group the vectors are looked up with ``get_w2v``, clustered into
    ``number`` clusters with ``k_means`` and regrouped with ``cluster_df``.
    The number of sub-clusters is printed first, then every sub-cluster that
    holds more than one word.

    Parameters
    ----------
    x : iterable of list
        Word groups to split further.
    number : int
        Number of clusters to split each group into.
    """
    for group in x:
        vectors, words = get_w2v(group)
        assignment = k_means(vectors, number, words)
        sub_clusters = cluster_df(assignment, number)
        print('len of clusters:', len(sub_clusters))
        for members in sub_clusters:
            if len(members) <= 1:
                # single-word (or empty) sub-clusters are not shown
                continue
            print(members, '\n')

In [160]:
# Split the larger product clusters again; bigger buckets get more sub-clusters
# (3, 4 and 5 respectively). A call with an empty bucket prints nothing.
print_others(again15_30, 3)
print_others(again31_40, 4)
print_others(again40_50, 5)

len of clusters: 3
['ipad', 'samsung'] 

['opensource', 'nvidia', 'iso', 'encryption', 'server', 'java', 'apache', 'google', 'mozilla', 'ftp', 'amazon', 'http', 'vpn'] 

['wifi', 'voip'] 

len of clusters: 3
['lea', 'cole', 'burton', 'sullivan', 'cro', 'anne', 'jane', 'tracy', 'europe', 'boston', 'philip', 'harvard', 'robert', 'john', 'bradley', 'murray'] 

len of clusters: 3
['centricity', 'innovation', 'reshaping', 'initiative', 'growth', 'engagement'] 

['build', 'procurement', 'transformation', 'project', 'integration', 'mobilization', 'deployment', 'development', 'infrastructure', 'discovery'] 



In [153]:
# Sweep the number of K-means clusters for the `sys` words and report, for
# each setting, how many clusters hold between 2 and 5 words.
for i in range(180, 250, 5):
    test_cluster = k_means(sys_w2v, i, sys_new)
    # cluster_df returns the list of word lists, not a count, so the count is
    # derived here; comparing that list with 10 directly raises TypeError.
    # NOTE(review): the 'N of 1:' label is kept unchanged - confirm that the
    # 2-5 word window is the metric this sweep is meant to report.
    number = sum(1 for words in cluster_df(test_cluster, i) if 2 <= len(words) <= 5)
    if number >= 10:
        print('N of 1:', number)
        print('N of clusters :', i)

N of 1: 43
N of clusters : 180
N of 1: 49
N of clusters : 185
N of 1: 47
N of clusters : 190
N of 1: 50
N of clusters : 195
N of 1: 53
N of clusters : 200
N of 1: 51
N of clusters : 205
N of 1: 50
N of clusters : 210
N of 1: 46
N of clusters : 215
N of 1: 49
N of clusters : 220
N of 1: 50
N of clusters : 225
N of 1: 53
N of clusters : 230
N of 1: 49
N of clusters : 235
N of 1: 52
N of clusters : 240
N of 1: 61
N of clusters : 245


In [11]:
# show words by each cluster
def show_words(x, num_clusters=10):
    """Print the words of the first ``num_clusters`` clusters.

    Parameters
    ----------
    x : dict
        Maps each word to its cluster label.
    num_clusters : int, optional
        How many clusters (labels ``0 .. num_clusters-1``) to print.
        Defaults to 10, the number that used to be hard-coded.

    For every cluster a ``Cluster <n>`` header is printed, followed by the
    list of its words in the iteration order of ``x`` (an empty list when no
    word carries that label).
    """
    # Group the words once instead of rescanning the whole mapping per cluster.
    grouped = {cluster: [] for cluster in range(0, num_clusters)}
    for word, label in x.items():
        if label in grouped:
            grouped[label].append(word)

    for cluster in range(0, num_clusters):
        # cluster number
        print("\nCluster {}".format(cluster))

        # words
        words = grouped[cluster]
        print(words)