In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# CONFIG
# Pickle files used by this notebook: the vectorizer is loaded from disk,
# the clustering model is written to disk after fitting.
VECTORIZER_PICKLE = 'vectorizer.pkl'
CLUSTERING_PICKLE = 'clustering.pkl'
# Text file with one industry name per line.
INDUSTRIES_LIST = '../industries.txt'

# CSV holding the labeled samples.
labeled_data = '../labeled.csv'

# Column name -> dtype mapping passed to pd.read_csv.
data_types = dict(text=str, industry=str)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lugassysnir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/lugassysnir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
print('Loading industries')
# Read one industry name per line, stripping surrounding whitespace.
# The context manager closes the file handle as soon as the list is built,
# instead of leaving it open until garbage collection.
with open(INDUSTRIES_LIST, 'r') as industries_file:
    industries = [x.strip() for x in industries_file]
industries

Loading industries


['accounting',
 'airlines/aviation',
 'alternative dispute resolution',
 'alternative medicine',
 'animation',
 'apparel & fashion',
 'architecture & planning',
 'arts and crafts',
 'automotive',
 'aviation & aerospace',
 'banking',
 'biotechnology',
 'broadcast media',
 'building materials',
 'business supplies and equipment',
 'capital markets',
 'chemicals',
 'civic & social organization',
 'civil engineering',
 'commercial real estate',
 'computer & network security',
 'computer games',
 'computer hardware',
 'computer networking',
 'computer software',
 'construction',
 'consumer electronics',
 'consumer goods',
 'consumer services',
 'cosmetics',
 'dairy',
 'defense & space',
 'design',
 'e-learning',
 'education management',
 'electrical/electronic manufacturing',
 'entertainment',
 'environmental services',
 'events services',
 'executive office',
 'facilities services',
 'farming',
 'financial services',
 'fine art',
 'fishery',
 'food & beverages',
 'food production',
 'fund-

In [3]:
%%time
print('Reading data')
data = pd.read_csv(labeled_data, usecols=['text', 'industry'], dtype=data_types, engine='c')
print('Number of samples texts: ', len(data))

Reading data
Number of samples texts:  1500000
CPU times: user 1min 8s, sys: 15.4 s, total: 1min 23s
Wall time: 6min 47s


In [5]:
# Replace missing texts with the empty string so every row holds a str.
# The result is assigned back to the column explicitly: calling
# `replace(..., inplace=True)` on `data.text` is a chained in-place update,
# which does not modify `data` when pandas Copy-on-Write is active.
data['text'] = data['text'].fillna("")

In [6]:
# Load the pickled vectorizer. The context manager closes the file handle
# right after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(VECTORIZER_PICKLE, 'rb') as f:
    vectorizer = pickle.load(f)

In [None]:
%%time
means = []
industry_mean = {}
for industry, df in data.groupby('industry'):
    print(industry, df.shape)
    %time X = vectorizer.transform(df.text)
    _mean = X.mean(axis=0)
    means.append(_mean)
    industry_mean[industry] = _mean

means = np.stack([industry_mean[i] for i in industries])

accounting (20252, 2)
CPU times: user 9.35 s, sys: 86.1 ms, total: 9.44 s
Wall time: 9.45 s
airlines/aviation (4315, 2)
CPU times: user 1.9 s, sys: 10.1 ms, total: 1.91 s
Wall time: 1.91 s
alternative dispute resolution (460, 2)
CPU times: user 213 ms, sys: 0 ns, total: 213 ms
Wall time: 212 ms
alternative medicine (2135, 2)
CPU times: user 1.14 s, sys: 0 ns, total: 1.14 s
Wall time: 1.14 s
animation (916, 2)
CPU times: user 328 ms, sys: 3.11 ms, total: 331 ms
Wall time: 330 ms
apparel & fashion (12053, 2)
CPU times: user 5.37 s, sys: 31.7 ms, total: 5.41 s
Wall time: 5.4 s
architecture & planning (14643, 2)
CPU times: user 4.93 s, sys: 32.1 ms, total: 4.96 s
Wall time: 4.96 s
arts and crafts (5203, 2)
CPU times: user 2.37 s, sys: 13.5 ms, total: 2.38 s
Wall time: 2.38 s
automotive (25894, 2)
CPU times: user 15.4 s, sys: 134 ms, total: 15.6 s
Wall time: 15.5 s
aviation & aerospace (4671, 2)
CPU times: user 1.85 s, sys: 0 ns, total: 1.85 s
Wall time: 1.85 s
banking (5874, 2)
CPU times: 

CPU times: user 5.64 s, sys: 19.7 ms, total: 5.66 s
Wall time: 5.66 s
media production (11819, 2)
CPU times: user 5.03 s, sys: 23.8 ms, total: 5.06 s
Wall time: 5.05 s
medical devices (9144, 2)
CPU times: user 4.11 s, sys: 12 ms, total: 4.12 s
Wall time: 4.12 s
medical practice (24628, 2)
CPU times: user 13.7 s, sys: 63.9 ms, total: 13.7 s
Wall time: 13.7 s
mental health care (9822, 2)
CPU times: user 4.65 s, sys: 23.9 ms, total: 4.67 s
Wall time: 4.67 s
military (768, 2)
CPU times: user 366 ms, sys: 17 µs, total: 366 ms
Wall time: 365 ms
mining & metals (5225, 2)
CPU times: user 2.14 s, sys: 20.1 ms, total: 2.16 s
Wall time: 2.16 s
motion pictures and film (3495, 2)
CPU times: user 1.29 s, sys: 0 ns, total: 1.29 s
Wall time: 1.29 s
museums and institutions (3469, 2)
CPU times: user 1.81 s, sys: 8.07 ms, total: 1.81 s
Wall time: 1.81 s
music (10188, 2)
CPU times: user 4.61 s, sys: 11.8 ms, total: 4.62 s
Wall time: 4.62 s
nanotechnology (595, 2)
CPU times: user 271 ms, sys: 117 µs, tota

In [4]:
print('Loading industry mean vector from ind_mean_vec.pkl')
# Load the cached {industry: mean vector} dict. The context manager closes
# the file handle right after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('ind_mean_vec.pkl', "rb") as f:
    industry_mean = pickle.load(f)
# The cached dict must cover exactly the industries in the list; on mismatch,
# show which names appear on only one side.
assert set(industries) == set(industry_mean.keys()), \
    'industry list and cached means disagree on: {}'.format(
        sorted(set(industries) ^ set(industry_mean.keys())))

Loading industry mean vector from ind_mean_vec.pkl


In [14]:
# Collect the per-industry vectors in the order given by `industries`, stack
# them, and convert the result to a plain ndarray.
ordered_vectors = [industry_mean[name] for name in industries]
means = np.array(np.stack(ordered_vectors))
means.shape

(147, 67092)

In [12]:
%%time
print('Clustering industry means with k=20')
clustering = KMeans(n_clusters=20).fit(means)

Clustering industry means with k=20
CPU times: user 1min 6s, sys: 124 ms, total: 1min 6s
Wall time: 19.3 s


In [13]:
# Report the configured path rather than a hard-coded copy of it, so the
# message cannot drift from where the model is actually written.
print('Saving clustering to', CLUSTERING_PICKLE)
with open(CLUSTERING_PICKLE, "wb") as f:
    pickle.dump(clustering, f)

Saving clustering to clustering.pkl


In [22]:
# Map each industry name to the cluster its mean vector falls into.
industry2cluster = {}
for i in industries:
    # predict() expects a 2-D (n_samples, n_features) input; coerce the stored
    # vector to a single row.
    x = np.asarray(industry_mean[i]).reshape(1, -1)
    # predict() returns a 1-element array; index it explicitly, because
    # int() on a non-0-d array is deprecated in NumPy.
    # The +1 shifts KMeans' 0-based labels to 1-based cluster IDs.
    industry2cluster[i] = int(clustering.predict(x)[0]) + 1
industry2cluster

{'accounting': 6,
 'airlines/aviation': 18,
 'alternative dispute resolution': 12,
 'alternative medicine': 1,
 'animation': 14,
 'apparel & fashion': 4,
 'architecture & planning': 14,
 'arts and crafts': 4,
 'automotive': 8,
 'aviation & aerospace': 18,
 'banking': 16,
 'biotechnology': 3,
 'broadcast media': 14,
 'building materials': 8,
 'business supplies and equipment': 8,
 'capital markets': 6,
 'chemicals': 8,
 'civic & social organization': 5,
 'civil engineering': 18,
 'commercial real estate': 12,
 'computer & network security': 12,
 'computer games': 4,
 'computer hardware': 12,
 'computer networking': 12,
 'computer software': 12,
 'construction': 8,
 'consumer electronics': 8,
 'consumer goods': 8,
 'consumer services': 8,
 'cosmetics': 8,
 'dairy': 4,
 'defense & space': 18,
 'design': 14,
 'e-learning': 0,
 'education management': 0,
 'electrical/electronic manufacturing': 18,
 'entertainment': 4,
 'environmental services': 8,
 'events services': 4,
 'executive office':

In [37]:
# Two-column table of the industry -> cluster mapping, one row per industry,
# in the dict's iteration order.
ind2clustre_df = pd.DataFrame({
    'industry': list(industry2cluster.keys()),
    'clusterID': list(industry2cluster.values()),
})
ind2clustre_df.head(10)

Unnamed: 0,industry,clusterID
0,accounting,6
1,airlines/aviation,18
2,alternative dispute resolution,12
3,alternative medicine,1
4,animation,14
5,apparel & fashion,4
6,architecture & planning,14
7,arts and crafts,4
8,automotive,8
9,aviation & aerospace,18


In [35]:
# Write the mapping to CSV without the DataFrame index column.
ind2clustre_df.to_csv(
    path_or_buf='industry2cluster_206312506.csv',
    index=False,
)