In [1]:
import pickle
import sys
from datetime import datetime
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
import gensim.downloader as api

from processing import normalize_text_series

import nltk
# Fetch the NLTK stop-word corpus (reported as already up to date when it is on disk).
nltk.download('stopwords')

# English stop words from NLTK; later cells pass this to TfidfVectorizer and use it to filter tokens.
STOPWORDS = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/snirlugassy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/snirlugassy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/snirlugassy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Labelled training sample; the path is relative to the notebook's working directory.
labeled_data = '../labeled_100000.csv'

# Explicit column dtypes so pandas does not have to infer them.
data_types = {
    'id': np.int64,
    'text': str,
    'country': str,
    'region': str,
    'locality': str,
    # `np.float` was only an alias of the builtin `float` and has been removed
    # from recent NumPy releases; the builtin gives the same float64 column.
    'founded': float,
    'industry': str,
    'size': str
}

print(f'Reading CSV file {labeled_data}')
data = pd.read_csv(labeled_data, dtype=data_types, index_col='id')
# Missing page text becomes an empty string. The result is assigned back
# rather than mutated in place through attribute access, which operates on an
# intermediate object and is not guaranteed to update `data`.
data['text'] = data['text'].fillna('')
data

Reading CSV file ../labeled_100000.csv


Unnamed: 0_level_0,text,country,region,locality,founded,industry,size
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Home | The Bridge DONATE Artboard 1 Student Po...,united states,illinois,orland park,2010.0,civic & social organization,11-50
2,minimax labs | more from less minimax labs Wha...,united kingdom,greater london,london,,computer software,1-10
3,Driscoll Creative – Wendy Driscoll | Graphic D...,united states,kentucky,prospect,,graphic design,1-10
4,The Original Smoothie Bombs™ – The Smoothie Bo...,australia,victoria,port melbourne,2011.0,food & beverages,1-10
5,Becky Beauchine Kulka | Diamonds & Fine Jewelr...,united states,michigan,okemos,1988.0,retail,11-50
...,...,...,...,...,...,...,...
99995,Paralegal Courses Online | Centre for Paralega...,australia,victoria,melbourne,1996.0,legal services,1-10
99996,eSmartKeep – Keep your numbers green About Us ...,united states,florida,miami,2018.0,financial services,11-50
99997,Welcome - Home From Home Kent Comfortable Acco...,united kingdom,medway,rochester,2018.0,hospitality,1-10
99998,Pearl Technology | Home \r Services\r Cybersec...,united states,illinois,peoria heights,1998.0,computer & network security,51-200


In [3]:
# Distinct values of the `industry` column and how many there are.
industries = data['industry'].unique()
num_of_industries = len(industries)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the text column of `data`, passing the NLTK English stop-word list.
corpus = data.text
vectorizer = TfidfVectorizer(stop_words=STOPWORDS)
# X is the result of fit_transform over the whole corpus; the clustering cell below consumes it.
X = vectorizer.fit_transform(corpus)

In [28]:
from sklearn.cluster import KMeans, Birch, SpectralClustering

# Single source of truth for the cluster count: it sizes both the Birch model
# and the histogram below, which previously each hard-coded 20.
N_CLUSTERS = 20

clustering = Birch(n_clusters=N_CLUSTERS)
X_cluster_dist = clustering.fit_transform(X)

# Persist the fitted model so it can be reloaded without refitting.
with open("clustering.pkl", "wb") as f:
    pickle.dump(clustering, f)

pred = clustering.predict(X)
data['cluster'] = pred

# For every industry keep its most frequent cluster label
# (value_counts() sorts by descending count, so index[0] is the mode).
cluster_ind = data[['cluster', 'industry']].groupby(['industry']).agg(lambda x:x.value_counts().index[0])
cluster_ind = cluster_ind.cluster.to_dict()

# Fraction of industries whose dominant cluster is c, for c in 0..N_CLUSTERS-1.
_dominant_labels = np.asarray(list(cluster_ind.values()), dtype=np.int64)
_cluster_dist = np.bincount(_dominant_labels, minlength=N_CLUSTERS).astype(float)
_cluster_dist /= _cluster_dist.sum()
print('cluster size in %: \n' , _cluster_dist*100)

In [None]:
_cluster_dist

Error: Session cannot generate requests

In [None]:
# Tokenise every document, then drop English stop words. Membership is tested
# against a set: STOPWORDS is a list, so `in` on it is a linear scan per token.
_stopword_set = set(STOPWORDS)
data['tokens'] = normalize_text_series(data.text)
data['tokens'] = data['tokens'].apply(lambda tokens: [t for t in tokens if t.lower() not in _stopword_set])
data


In [None]:
data.industry.value_counts()

In [None]:
# One row per (token, industry) occurrence: explode the token lists, keep the
# two relevant columns, lower-case the token and renumber the rows from 0.
word_industry = (
    data.explode('tokens')[['tokens', 'industry']]
    .rename(columns={'tokens': 'token'})
    .assign(token=lambda df: df['token'].str.lower())
    .reset_index(drop=True)
)
word_industry

In [None]:
# word_dist[token][industry] -> occurrences of the token within that industry
# word_count[token]          -> occurrences of the token over all industries
word_dist = defaultdict(dict)
word_count = defaultdict(int)
# value_counts() keys follow the column order, so the columns are selected
# explicitly as (industry, token) to match the tuple unpacking below; the
# frame's own (token, industry) order would swap the two. `.items()` replaces
# `.iteritems()`, which newer pandas no longer provides.
for (ind, w), count in word_industry[['industry', 'token']].value_counts().items():
    word_dist[w][ind] = count
    word_count[w] += count

In [None]:
word_dist

In [None]:
word_count

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus with stray punctuation, used to inspect what TfidfVectorizer extracts.
# NOTE(review): this rebinds `corpus`, `vectorizer` and `X`, which an earlier
# cell assigned from the full dataset — confirm that is intended.
corpus = [
    'This is the first document???.',
    'This document &is the !!!second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer(stop_words=STOPWORDS)
X = vectorizer.fit_transform(corpus)
# Feature names produced by the fitted vectorizer.
vectorizer.get_feature_names_out()

In [None]:
X.toarray()

In [None]:
import re
import string  # string.punctuation is used below and no earlier cell imports `string`

# Translation table that deletes ASCII punctuation, '©', '™', '–' and all digits.
CHAR_FILTER_TABLE = str.maketrans('', '', string.punctuation + '©™–' + '0123456789')
STOPWORDS = nltk.corpus.stopwords.words('english')

# A token is a maximal run of word characters.
WORD_PATTERN = re.compile(r'\w+')
def tokenize(text):
    """Split `text` into word tokens, dropping short tokens and stop words.

    A token is kept only when it is longer than two characters AND its
    lower-cased form is not in STOPWORDS. Joining the two tests with `or`
    would let every stop word longer than two characters through.

    Returns a list of the surviving tokens in their original order.
    """
    tokens = WORD_PATTERN.findall(text)
    return [t for t in tokens if len(t) > 2 and t.lower() not in STOPWORDS]

def normalize_text_series(text:pd.Series):
    """Turn a Series of raw strings into a Series of token lists.

    Characters listed in CHAR_FILTER_TABLE are deleted, the remainder is
    lower-cased, and each string is then split with `tokenize`.
    """
    cleaned = text.str.translate(CHAR_FILTER_TABLE)
    lowered = cleaned.str.lower()
    return lowered.apply(tokenize, convert_dtype=False)

In [None]:
raw['normalized'] = normalize_text_series(raw.text)

In [None]:

# End-to-end training run: normalise text, score words by industry
# concentration, embed them with word2vec and fit a regressor on (vector, score).
# NOTE(review): `raw`, `normalize_text` and `w2v` are defined in later cells of
# this notebook, and `DomainWordScore` is not defined or imported in any
# visible cell — confirm where it comes from before running this cell.
print('Normalizing text')
raw['normalized'] = raw.text.apply(lambda x:normalize_text(str(x)), convert_dtype=False)

print('Calculating word distribution over industries')
word_industry = raw.explode('normalized')[['normalized', 'industry']]
# `.str.lower()` leaves the NaN that explode() emits for an empty token list
# as NaN; calling x.lower() on it would raise AttributeError. assign() avoids
# writing a new column into a slice.
word_industry = word_industry.assign(lower=word_industry['normalized'].str.lower())
pair_count = word_industry[['industry', 'lower']].value_counts()
word_dist = defaultdict(dict)
word_count = defaultdict(int)
# `.items()` replaces `.iteritems()`, which newer pandas no longer provides.
for (ind, w), count in pair_count.items():
    word_dist[w][ind] = count
    word_count[w] += count

word_score = DomainWordScore(word_dist, 90)
# Series.explode() has no column argument; its only parameter is `ignore_index`,
# so the string formerly passed here acted as a truthy flag. Spell that out.
_data = pd.DataFrame(raw['normalized'].explode(ignore_index=True))
_data.rename(columns={'normalized': 'token'}, inplace=True)
_data['token_lower'] = _data['token'].apply(lambda x:str(x).lower())
_data.drop_duplicates(subset=['token_lower'], inplace=True)

print('Scoring training data words')
_data['score'] = _data['token_lower'].apply(word_score.score)

print('Embedding word vectors using pre-trained word2vec')
_data['word_vec'] = _data['token_lower'].apply(w2v)

print('Ignoring OOV words')
_data.dropna(subset=['word_vec'], inplace=True)

X = np.array(_data.word_vec.tolist())
y = _data.score

print('X shape = ', X.shape)
print('y shape = ', y.shape)

regr = MLPRegressor(
    verbose=True,
    hidden_layer_sizes=92,
    max_iter=300,
    tol=1e-5,
    learning_rate='adaptive'
)

print('Training...')
regr.fit(X,y)


# Build the file name first so the message and the file that is written agree
# (`model_file_name` was previously referenced without ever being defined).
timestamp = str(int(datetime.now().timestamp()))
model_file_name = f'model_{timestamp}.sklearn'
print(f'Saving model locally to file: {model_file_name}')
with open(model_file_name, 'wb') as model_file:
    pickle.dump(regr, model_file)

print('Finished')

In [None]:
# To install only the requirements of this notebook, uncomment the lines below and run this cell

# ===========================

!pip install scikit-learn==1.0.1
!pip install gensim==3.8.3
!pip install nltk==3.6.5
!pip install matplotlib==3.5.0
!pip install beautifulsoup4==4.9.3
!pip install numpy==1.19.5
!pip install pandas==1.3.4

# ===========================

In [None]:
!ls

In [None]:
import string
import pickle
import nltk
from nltk import pos_tag, word_tokenize
from bs4 import BeautifulSoup as bsoup
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor

labeled_data = 'labeled_350000.csv'

In [None]:
import gensim.downloader as api

word2vec = api.load('word2vec-google-news-300')

In [None]:
nltk.download('popular')

## Text Processing Functions

In [None]:
s = '<h1>Our company is focused on making the world a better place for Dogs</h1>'
tokens = word_tokenize(s)
tagged = pos_tag(tokens)
tagged

In [None]:
def clean_html(text):
    """Parse `text` with BeautifulSoup's 'html.parser' and return its text content."""
    soup = bsoup(text, 'html.parser')
    return soup.get_text()

clean_html('<div> hello <a>aasdasd</a> <img src="#"/><h1>TEXT</h1></div>')

In [None]:
# Translation table mapping the code point of each digit 0-9 to None (delete it).
numeric_table = {ord(digit): None for digit in '0123456789'}

def remove_numeric(text:str):
    """Return `text` with every digit 0-9 removed."""
    stripped = text.translate(numeric_table)
    return stripped
    
remove_numeric(' asdas 123123 kjkl123jl4k23j!@#!@#!K23j1kl23j1k23')

In [None]:
# Translation table that deletes ASCII punctuation and the copyright sign.
punc_table = dict.fromkeys(map(ord, string.punctuation + '©'))

def remove_punc(text:str):
    """Return `text` with ASCII punctuation and '©' removed."""
    stripped = text.translate(punc_table)
    return stripped
    
remove_punc('kjkl123jl4k23j!@#!@#!K23j1kl23j1k23')

In [None]:
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return the result of WordNet's morphy for `word`, or `word` itself when morphy returns None."""
    lemma = wn.morphy(word)
    return word if lemma is None else lemma

print('denied -> ', get_lemma('denied'))
print('talked -> ', get_lemma('talked'))
print('understood -> ', get_lemma('understood'))
print('<<k1lj23 -> ', get_lemma('<<k1lj23'))

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for constant-time membership tests; `stopwords` itself stays a list.
_stopword_set = frozenset(stopwords)

def normalize_text(text):
    """Convert one raw (possibly HTML) document into a list of tokens.

    Steps, in order: strip HTML markup, delete punctuation, delete digits,
    tokenise with NLTK's word_tokenize, then drop stop words
    (case-insensitively) and tokens of a single character.

    Returns the surviving tokens in document order, with their original case.
    """
    text = clean_html(text)
    text = remove_punc(text)
    text = remove_numeric(text)
    # Lemmatisation is currently disabled:
    #    tokens = [get_lemma(t) for t in tokens]
    return [
        t for t in word_tokenize(text)
        if len(t) > 1 and t.lower() not in _stopword_set
    ]

## Data Processing

In [None]:
raw = pd.read_csv(labeled_data, usecols=['text', 'industry'])

In [None]:
industries = set(raw.industry)
len(industries)

In [None]:
raw['text_length'] = raw.text.apply(lambda x:len(str(x).split(' ')))

In [None]:
raw.text_length.describe()

In [None]:
import matplotlib.pyplot as plt

# Histogram of raw.text_length: the measured length goes on the x axis and the
# number of documents falling into each bin on the y axis (the two axis labels
# were previously swapped).
fig, ax = plt.subplots()
ax.hist(raw.text_length, bins=1000)
ax.set_xlim(0, 2000)
ax.set_title('Phrase Length Distribution (Before normalization)')
ax.set_xlabel('Phrase length')
ax.set_ylabel('Number of text')
plt.show()

In [None]:
raw['normalized'] = raw.text.apply(lambda x:normalize_text(str(x)))

In [None]:
raw['normalized'].head()

In [None]:
# One row per (token, industry) occurrence. `.str.lower()` leaves the NaN that
# explode() produces for an empty token list as NaN, whereas calling x.lower()
# on it raises AttributeError; assign() also avoids writing into a slice.
word_industry = (
    raw.explode('normalized')[['normalized', 'industry']]
    .assign(lower=lambda df: df['normalized'].str.lower())
)
word_industry

In [None]:
pair_count = word_industry[['industry', 'lower']].value_counts()
pair_count[:20]

In [None]:
from collections import defaultdict

# word_dist[word][industry] -> co-occurrence count; word_count[word] -> total count.
word_dist = defaultdict(dict)
word_count = defaultdict(int)

# `pair_count` is keyed by (industry, word). `.items()` replaces `.iteritems()`,
# which newer pandas no longer provides.
for (ind, w), count in pair_count.items():
    word_dist[w][ind] = count
    word_count[w] += count

w = 'football'
print(f'The frequency of the word "{w}" in each industry')
word_dist[w]

In [None]:
def _score(word:str, percentile:float=90):
    """Score how concentrated `word` is in a single industry.

    The per-industry counts of the lower-cased word are read from `word_dist`,
    counts below the given percentile are discarded, and the score is the
    largest remaining count divided by the sum of the remaining counts.

    Parameters
    ----------
    word : the token to score (looked up case-insensitively).
    percentile : percentile (0-100) below which counts are trimmed; the
        default of 90 is the value this function always used.

    Returns 1 when the word occurs in at most one industry (or is unknown).
    """
    # .get() instead of indexing: word_dist is a defaultdict, so indexing it
    # with an unseen word would insert an empty entry as a side effect.
    counts = np.array(list(word_dist.get(word.lower(), {}).values()))
    if len(counts) > 1:
        top = counts[counts >= np.percentile(counts, percentile)]
        return top.max() / top.sum()
    return 1

def text_scores(text:list):
    """
    Calculate the score of each word in a text.

    The score is the amount of information the word gives about the industry:
    the lower frequencies are trimmed and max(y) / sum(y) is computed
    (see `_score`).

    Returns a dict mapping each distinct word of `text` to its score.
    """
    return {word: _score(word) for word in text}

text_scores(raw.normalized[9])

In [None]:
# Series.explode() has no column argument; its only parameter is `ignore_index`,
# so the string formerly passed here acted as a truthy flag. Spell that out:
# the exploded tokens get a fresh 0..n-1 index.
_data = pd.DataFrame(raw['normalized'].explode(ignore_index=True))
_data.rename(columns={'normalized': 'token'}, inplace=True)
# str() so that the NaN produced by empty token lists does not break .lower().
_data['token_lower'] = _data['token'].apply(lambda x:str(x).lower())
# Keep one row per distinct lower-cased token, then score it.
_data.drop_duplicates(subset=['token_lower'], inplace=True)
_data['score'] = _data['token_lower'].apply(_score)
_data

In [None]:
def w2v(w):
    """Look up `w` in `word2vec`; return NaN when the lookup raises KeyError."""
    try:
        return word2vec[w]
    except KeyError:
        return np.nan

_data['word_vec'] = _data['token_lower'].apply(w2v)
_data.dropna(subset=['word_vec'], inplace=True)
_data

### Multi-Layer Perceptron Regression

Using scikit-learn MLPRegressor

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
X = np.array(_data.word_vec.tolist())
X.shape

In [None]:
y = _data.score
y.shape

In [None]:
# NOTE(review): no random_state is passed, so training is presumably not reproducible
# from run to run; learning_rate='adaptive' presumably only matters for the 'sgd'
# solver, which is not selected here — confirm both against the scikit-learn docs.
regr = MLPRegressor(hidden_layer_sizes=92, max_iter=300, verbose=True, tol=1e-5, learning_rate='adaptive')

In [None]:
regr.fit(X,y)

In [None]:
with open('model.sklearn', 'wb') as model_file:
    pickle.dump(regr, model_file)

In [None]:
print(y[:5])
print(regr.predict(X[:5]))

In [None]:
# Inspect the labelled document at row label 2000: token count, industry, raw text.
sample_row = raw.loc[2000]
sample_text = sample_row['text']
_x = normalize_text(sample_text)
print("Number of tokens in the text:" , len(_x))
print("Industry = ", sample_row['industry'])
print("Text")
print(sample_text)

In [None]:
# Score every distinct in-vocabulary token of the sample document.
_X = pd.DataFrame({'token': _x})
_X['token_lower'] = _X.token.apply(lambda x:x.lower())
_X.drop_duplicates(subset=['token_lower'], inplace=True)
_X['word_vec'] = _X['token_lower'].apply(w2v)
_X.dropna(subset=['word_vec'], inplace=True)
# One batched predict() over the stacked vectors replaces a predict() call per
# row and the float() conversion of a size-1 array, which newer NumPy deprecates.
if len(_X):
    _X['score'] = regr.predict(np.stack(_X['word_vec'].tolist()))
else:
    # Nothing left to score: keep the column so later cells can still select it.
    _X['score'] = pd.Series(dtype=float)
_X

In [None]:
_X[['token', 'score']].sort_values(by=['score'], ascending=False).head(10)

In [None]:
word_dist['starlight']

In [None]:
def w2v(w):
    """Return `word2vec[w]`, or NaN when `w` is not a key of `word2vec`."""
    try:
        vector = word2vec[w]
    except KeyError:
        return np.nan
    return vector

# Tokenise the first 10 unlabeled documents into one row per distinct
# (document id, lower-cased token) that has a word2vec vector.
data = pd.read_csv('unlabeled.csv', usecols=['id', 'text'], nrows=10)
data['normalized'] = data.text.apply(lambda x: normalize_text(str(x)))
data = data.explode('normalized')
data.rename(columns={'normalized': 'token'}, inplace=True)
data['token_lower'] = data['token'].apply(lambda x:str(x).lower())
data.drop_duplicates(subset=['id', 'token_lower'], inplace=True)
data['word_vec'] = data['token_lower'].apply(w2v)
data.dropna(subset=['word_vec'], inplace=True)
data.reset_index(drop=True, inplace=True)

# SECURITY: pickle.load can execute arbitrary code contained in the file —
# only load model files that were produced by a trusted run of this notebook.
with open('model_temp.sklearn', 'rb') as model_file:
    regr = pickle.load(model_file)

# One batched predict() over the stacked vectors replaces a predict() call per
# row and the float() conversion of a size-1 array, which newer NumPy deprecates.
if len(data):
    data['score'] = regr.predict(np.stack(data['word_vec'].tolist()))
else:
    # Nothing left to score: keep the column so later cells can still select it.
    data['score'] = pd.Series(dtype=float)

In [None]:
data.reset_index(drop=True, inplace=True)
data

In [None]:
results = data.groupby(['id'])['score'].nlargest(10)
results

In [None]:
data.iloc[3].token

In [None]:
from collections import defaultdict

# doc id -> tokens of that document's top-scoring rows, in the order `results` yields them.
output = defaultdict(list)

# Each key of `results` is a (doc_id, row_id) pair where row_id is an index
# *label* of `data`. Resolve it with label-based `.at` rather than positional
# `.iloc`: the two only coincide while the index is exactly 0..n-1, and `.at`
# avoids materialising a whole row just to read one cell.
for (doc_id, row_id), score in results.items():
    output[doc_id].append(data.at[row_id, 'token'])

output

In [None]:
[{'id':key, 'snippet':tokens} for key, tokens in output.items()]

In [None]:
!head unlabeled.csv