In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# CONFIG
# Path the fitted TF-IDF vectorizer is pickled to.
VECTORIZER_PICKLE = 'vectorizer.pkl'
# Text file of industry names, read line by line further down.
INDUSTRIES_LIST = '../industries.txt'

# CSV holding the labeled samples (columns used: text, industry).
labeled_data = '../labeled.csv'

# Column -> dtype mapping passed to pd.read_csv.
data_types = dict(text=str, industry=str)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lugassysnir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/lugassysnir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
print('Loading industries')
# Read the industry names, one per line. The context manager guarantees the
# file handle is closed (the bare open(...).readlines() leaked it).
with open(INDUSTRIES_LIST, 'r') as industries_file:
    # Blank lines are skipped: an empty name can never match a label and
    # would only surface later as a KeyError when indexing by industry.
    industries = [line.strip() for line in industries_file if line.strip()]
industries

Loading industries


['accounting',
 'airlines/aviation',
 'alternative dispute resolution',
 'alternative medicine',
 'animation',
 'apparel & fashion',
 'architecture & planning',
 'arts and crafts',
 'automotive',
 'aviation & aerospace',
 'banking',
 'biotechnology',
 'broadcast media',
 'building materials',
 'business supplies and equipment',
 'capital markets',
 'chemicals',
 'civic & social organization',
 'civil engineering',
 'commercial real estate',
 'computer & network security',
 'computer games',
 'computer hardware',
 'computer networking',
 'computer software',
 'construction',
 'consumer electronics',
 'consumer goods',
 'consumer services',
 'cosmetics',
 'dairy',
 'defense & space',
 'design',
 'e-learning',
 'education management',
 'electrical/electronic manufacturing',
 'entertainment',
 'environmental services',
 'events services',
 'executive office',
 'facilities services',
 'farming',
 'financial services',
 'fine art',
 'fishery',
 'food & beverages',
 'food production',
 'fund-

In [3]:
print('Reading data')
# Load only the two columns used below, with the dtypes declared in the
# config cell, using the C parser engine.
data = pd.read_csv(
    labeled_data,
    usecols=['text', 'industry'],
    dtype=data_types,
    engine='c',
)
n_samples = len(data)
print('Number of samples texts: ', n_samples)

Reading data
Number of samples texts:  1500000


In [4]:
print('Normalizing data')
# Replace missing texts with the empty string so the vectorizer only sees str.
# Assign the result back to the column: an in-place call on `data.text`
# operates on an intermediate object and does not update `data` when pandas
# Copy-on-Write is active (pandas 2.x emits a warning for that pattern).
data['text'] = data['text'].fillna("")

Normalizing data


In [5]:
print('Training TF-IDF Vectorizer')
corpus = data.text
# Tokens are runs of at least three word characters; accents are stripped to
# ASCII; terms are dropped by document frequency via max_df=0.7 / min_df=1e-4.
tfidf_options = dict(
    token_pattern=r'(?u)\b\w\w\w+\b',
    strip_accents='ascii',
    max_df=0.7,
    min_df=1e-4,
)
vectorizer = TfidfVectorizer(**tfidf_options)
# Kept as the last expression so the fitted estimator is displayed.
vectorizer.fit(corpus)

Training TF-IDF Vectorizer


TfidfVectorizer(max_df=0.7, min_df=0.0001, strip_accents='ascii',
                token_pattern='(?u)\\b\\w\\w\\w+\\b')

In [6]:
# Persist the fitted vectorizer to the path set in the config cell.
print(f'Saving vectorizer to {VECTORIZER_PICKLE}')
with open(VECTORIZER_PICKLE, "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

Saving vectorizer to vectorizer.pkl


In [7]:
# Number of features (vocabulary terms) kept by the fitted vectorizer.
feature_names = vectorizer.get_feature_names_out()
len(feature_names)

67092

In [None]:
%%time
means = []
industry_mean = {}
for industry, df in data.groupby('industry'):
    print(industry, df.shape)
    %time X = vectorizer.transform(df.text)
    _mean = X.mean(axis=0)
    means.append(_mean)
    industry_mean[industry] = _mean
means = np.stack([industry_mean[i] for i in industries])

In [None]:
# Persist the {industry: mean vector} mapping. The path is named once so the
# log message and the file actually written cannot drift apart.
IND_MEAN_VEC_PICKLE = 'ind_mean_vec.pkl'
print(f'Saving industry mean vector to {IND_MEAN_VEC_PICKLE}')
with open(IND_MEAN_VEC_PICKLE, "wb") as f:
    pickle.dump(industry_mean, f)