In [1]:

import pickle
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

CHUNK_SIZE = 50000
STOPWORDS = nltk.corpus.stopwords.words('english')
VECTORIZER_PICKLE = 'vectorizer.pkl'

labeled_data = '../labeled.csv'

data_types = {
    # 'id': np.int64,
    'text': str,
    # 'country': str,
    # 'region': str,
    # 'locality': str,
    # 'founded': np.float,
    'industry': str,
    # 'size': str
}


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/snirlugassy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# for chunk in pd.read_csv(labeled_data, usecols=['text', 'industry'] ,dtype=data_types, chunksize=CHUNK_SIZE):
#     chunk.text.replace(np.nan, "", inplace=True)
#     print('Normalizing text...')
#     chunk['normalized'] = normalize_text_series(chunk.text)

#     print('Calculating word distribution over industries')
#     word_industry = chunk.explode('normalized')[['normalized', 'industry']]
#     word_industry['lower'] = word_industry['normalized'].apply(lambda x: str(x).lower())

print(f'Reading CSV file {labeled_data}')
data = pd.read_csv(labeled_data, usecols=['text', 'industry'] ,dtype=data_types, keep_default_na=False).sample(frac=0.5)

print('Replacing NaN text')
data.text.replace(np.nan, "", inplace=True)
data.info(memory_usage='deep')

Reading CSV file ../labeled.csv
Replacing NaN text
<class 'pandas.core.frame.DataFrame'>
Int64Index: 750000 entries, 1152299 to 554424
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      750000 non-null  object
 1   industry  750000 non-null  object
dtypes: object(2)
memory usage: 6.2 GB


In [3]:
industries = data.industry.unique()
num_of_industries = len(industries)
print(f'Processing data for {num_of_industries} industries')

Processing data for 147 industries


In [4]:

from sklearn.feature_extraction.text import TfidfVectorizer
corpus = data.text

# vectorizer = TfidfVectorizer(
#     stop_words=STOPWORDS, 
#     strip_accents='ascii',
#     sublinear_tf=True,
#     smooth_idf=False
# )

vectorizer = TfidfVectorizer(
    max_df=0.7,
    strip_accents='ascii',
)

print('Training TF-IDF Vectorizer')
X = vectorizer.fit_transform(corpus)

print('Saving vectorizer to vectorizer.pkl')
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

Training TF-IDF Vectorizer


In [23]:
from sklearn.cluster import KMeans, Birch, MiniBatchKMeans

clustering = MiniBatchKMeans(
    n_clusters=20,
    batch_size=int(data.shape[0] / 10)    
)

y = clustering.fit_predict(X)
data['cluster'] = y

In [24]:
print('Calculating cluster majority per industry')
cluster_ind = data[['cluster', 'industry']].groupby(['industry']).agg(lambda x:x.value_counts().index[0])
cluster_ind = cluster_ind.cluster.to_dict()
print(cluster_ind)

_cluster_dist = np.zeros(20)
for i,c in cluster_ind.items():
    _cluster_dist[c] += 1
_cluster_dist /= sum(_cluster_dist)
print('cluster size in %: \n' , _cluster_dist*100)

Calculating cluster majority per industry
{'accounting': 2, 'airlines/aviation': 4, 'alternative dispute resolution': 4, 'alternative medicine': 8, 'animation': 4, 'apparel & fashion': 15, 'architecture & planning': 11, 'arts and crafts': 15, 'automotive': 13, 'aviation & aerospace': 4, 'banking': 14, 'biotechnology': 4, 'broadcast media': 4, 'building materials': 4, 'business supplies and equipment': 4, 'capital markets': 4, 'chemicals': 4, 'civic & social organization': 1, 'civil engineering': 11, 'commercial real estate': 0, 'computer & network security': 3, 'computer games': 4, 'computer hardware': 4, 'computer networking': 3, 'computer software': 3, 'construction': 11, 'consumer electronics': 4, 'consumer goods': 15, 'consumer services': 4, 'cosmetics': 15, 'dairy': 4, 'defense & space': 4, 'design': 4, 'e-learning': 14, 'education management': 5, 'electrical/electronic manufacturing': 4, 'entertainment': 4, 'environmental services': 4, 'events services': 1, 'executive office': 4,