In [None]:
# To install only the requirements of this notebook, uncomment the lines below and run this cell

# ===========================

!pip install scikit-learn==1.0.1
!pip install gensim==3.8.3
!pip install nltk==3.6.5
!pip install matplotlib==3.5.0
!pip install beautifulsoup4==4.9.3
!pip install numpy==1.19.5
!pip install pandas==1.3.4

# ===========================

In [1]:
import string
import pickle
import nltk
from nltk import pos_tag, word_tokenize
from bs4 import BeautifulSoup as bsoup
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor

# Path of the labelled CSV file that is read with pandas in the "Data Processing" section.
labeled_data = 'labeled_10000.csv'

In [2]:
import gensim.downloader as api

# Load the pre-trained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably this downloads the model on first use and caches it — confirm.
word2vec = api.load('word2vec-google-news-300')

In [None]:
# Fetch NLTK's 'popular' data collection.
# NOTE(review): presumably this provides the tokenizer, tagger, stop-word and WordNet
# data used by the cells below — confirm.
nltk.download('popular')

## Text Processing Functions

In [None]:
# Small demonstration: tokenise a raw HTML snippet and tag each token with its part of speech.
s = '<h1>Our company is focused on making the world a better place for Dogs</h1>'
tokens = word_tokenize(s)
tagged = pos_tag(tokens)
# Display the tagged tokens.
tagged

In [3]:
def clean_html(text, separator=''):
    """Strip HTML markup from ``text`` and return only its text content.

    Parameters
    ----------
    text : str
        Raw HTML (or plain text) to parse with BeautifulSoup's 'html.parser'.
    separator : str, optional
        String placed between the text of neighbouring elements. The default
        ``''`` keeps the historical behaviour, where the text of directly
        adjacent tags is concatenated without a gap (``<p>a</p><p>b</p>``
        becomes ``'ab'``). Pass ``' '`` to keep such words apart.

    Returns
    -------
    str
        The text of the document with every tag removed.
    """
    return bsoup(text, 'html.parser').get_text(separator)

clean_html('<div> hello <a>aasdasd</a> <img src="#"/><h1>TEXT</h1></div>')

' hello aasdasd TEXT'

In [4]:
# Translation table mapping every ASCII digit to None, i.e. deleting it.
numeric_table = str.maketrans(dict.fromkeys(string.digits))

def remove_numeric(text: str) -> str:
    """Return ``text`` with every ASCII digit (0-9) deleted.

    All other characters, including whitespace, are left untouched.
    """
    digit_free = text.translate(numeric_table)
    return digit_free

remove_numeric(' asdas 123123 kjkl123jl4k23j!@#!@#!K23j1kl23j1k23')

' asdas  kjkljlkj!@#!@#!Kjkljk'

In [5]:
# Translation table deleting all ASCII punctuation plus the copyright sign.
punc_table = str.maketrans(dict.fromkeys(string.punctuation + '©'))

def remove_punc(text: str) -> str:
    """Return ``text`` with punctuation characters deleted.

    The characters are removed, not replaced by a space, so the pieces on
    either side of a punctuation mark are joined together.
    """
    stripped = text.translate(punc_table)
    return stripped

remove_punc('kjkl123jl4k23j!@#!@#!K23j1kl23j1k23')

'kjkl123jl4k23jK23j1kl23j1k23'

In [6]:
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return the WordNet base form of ``word``.

    ``wn.morphy`` yields ``None`` when it has no base form for the input;
    in that case the word is handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

# Demonstrate get_lemma on three inflected verbs and on a string that is not a word.
print('denied -> ', get_lemma('denied'))
print('talked -> ', get_lemma('talked'))
print('understood -> ', get_lemma('understood'))
print('<<k1lj23 -> ', get_lemma('<<k1lj23'))

denied ->  deny
talked ->  talk
understood ->  understand
<<k1lj23 ->  <<k1lj23


In [7]:
stopwords = nltk.corpus.stopwords.words('english')
# Frozen set copy of the stop-word list. Membership is tested once per token,
# and a list lookup would scan the whole list each time. The set is built once
# here, so later changes to `stopwords` are not picked up by normalize_text.
_stopword_set = frozenset(stopwords)

def normalize_text(text, lemmatize=False):
    """Turn raw (possibly HTML) text into a list of cleaned tokens.

    Steps, in order: strip HTML tags, delete punctuation, delete digits,
    tokenise with NLTK, optionally lemmatise, then drop English stop words
    (case-insensitively) and tokens of a single character.

    NOTE(review): remove_punc deletes punctuation without inserting a space,
    so hyphenated words are fused into one token ('well-rounded' becomes
    'wellrounded').

    Parameters
    ----------
    text : str
        The text to normalise.
    lemmatize : bool, optional
        When True every token is passed through ``get_lemma`` before
        filtering. Defaults to False, matching the previous behaviour in
        which lemmatisation was commented out.

    Returns
    -------
    list of str
        The remaining tokens in their original order and original case.
    """
    text = clean_html(text)
    text = remove_punc(text)
    text = remove_numeric(text)
    tokens = word_tokenize(text)
    if lemmatize:
        tokens = [get_lemma(t) for t in tokens]
    return [t for t in tokens if len(t) > 1 and t.lower() not in _stopword_set]

## Data Processing

In [None]:
# Load only the two columns this notebook uses: the raw text and its industry label.
raw = pd.read_csv(labeled_data, usecols=['text', 'industry'])

In [None]:
# Distinct industry labels present in the data; the cell displays how many there are.
industries = set(raw.industry)
len(industries)

In [None]:
# Rough word count per text: the number of pieces obtained by splitting on single spaces.
# str() makes the split work for cells that are not strings.
raw['text_length'] = raw.text.apply(lambda x:len(str(x).split(' ')))

In [None]:
# Summary statistics of the per-text word counts.
raw.text_length.describe()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-text word counts stored in `raw.text_length`.
plt.hist(raw.text_length, bins=1000)
# Only show lengths up to 2000 words.
plt.xlim(0,2000)
plt.title('Phrase Length Distribution (Before normalization)')
# The x-axis carries the measured value (words per text) and the y-axis the
# number of texts in each bin; the two labels were previously swapped.
plt.xlabel('Phrase length (number of words)')
plt.ylabel('Number of texts')
plt.show()

In [None]:
# Clean and tokenise every text; each cell of 'normalized' holds the token list
# returned by normalize_text.
raw['normalized'] = raw.text.apply(lambda x:normalize_text(str(x)))

In [None]:
# Peek at the first few token lists.
raw['normalized'].head()

In [None]:
# One row per (token, industry) occurrence.
# `explode` emits NaN for texts whose token list is empty; those rows are
# dropped first, because calling `.lower()` on the float NaN raises
# AttributeError. `assign` adds the lower-cased token on a fresh frame instead
# of writing into a slice of `raw`.
word_industry = (
    raw.explode('normalized')[['normalized', 'industry']]
    .dropna(subset=['normalized'])
    .assign(lower=lambda frame: frame['normalized'].str.lower())
)
word_industry

In [None]:
# Count how often each (industry, lower-cased token) pair occurs.
pair_count = word_industry[['industry', 'lower']].value_counts()
# Show the first 20 entries.
pair_count[:20]

In [None]:
from collections import defaultdict

# word_dist[word][industry] -> number of occurrences of `word` in that industry.
word_dist = defaultdict(dict)
# word_count[word] -> total occurrences of `word` across all industries.
word_count = defaultdict(int)

# `Series.items()` replaces `Series.iteritems()`, which is deprecated since
# pandas 1.5 and removed in 2.0. The index of `pair_count` is (industry, token).
for (ind, word), count in pair_count.items():
    word_dist[word][ind] = count
    word_count[word] += count

w = 'football'
print(f'The frequency of the word "{w}" in each industry')
word_dist[w]

In [None]:
def _score(word: str, percentile: float = 90.0) -> float:
    """Score how concentrated ``word`` is in a single industry.

    The per-industry counts of the lower-cased word are read from
    ``word_dist``. Counts below the given percentile are discarded and the
    score is ``max(kept) / sum(kept)``, so a word that occurs mostly in one
    industry scores close to 1.

    Parameters
    ----------
    word : str
        The word to score; it is lower-cased before the lookup.
    percentile : float, optional
        Percentile (0-100) below which counts are trimmed. Defaults to 90.

    Returns
    -------
    float
        The score. Words seen in at most one industry, including words that
        were never seen, score 1.0.
    """
    # `.get` avoids the defaultdict side effect of inserting an empty entry
    # for every unknown word that is looked up.
    counts = np.array(list(word_dist.get(word.lower(), {}).values()))
    if counts.size <= 1:
        return 1.0
    kept = counts[counts >= np.percentile(counts, percentile)]
    return float(kept.max() / kept.sum())

def text_scores(text:list):
    """Compute the score of every token in a tokenised text.

    The score estimates how much information a word carries about the
    industry (see ``_score``).

    Parameters
    ----------
    text : list of str
        Tokens of one text.

    Returns
    -------
    dict
        Maps each distinct token, in its original case, to its score.
    """
    return {token: _score(token) for token in text}

# Example: score every token of one normalised text (row label 9).
text_scores(raw.normalized[9])

In [None]:
# Training table: one row per distinct lower-cased token with its score.
# - `ignore_index=True` is now explicit. The earlier call passed the string
#   'normalized' positionally, which Series.explode takes as a truthy
#   `ignore_index`, so the index was already being renumbered.
# - Texts with an empty token list explode to NaN; they are dropped so that
#   NaN is not turned into the bogus token 'nan'.
_data = (
    raw['normalized']
    .explode(ignore_index=True)
    .dropna()
    .rename('token')
    .to_frame()
    .assign(token_lower=lambda frame: frame['token'].str.lower())
    # keep the first occurrence of each lower-cased token
    .drop_duplicates(subset=['token_lower'])
    .assign(score=lambda frame: frame['token_lower'].apply(_score))
)
_data

In [None]:
def w2v(w):
    """Look up the embedding of ``w`` in ``word2vec``.

    Returns the stored vector, or ``np.nan`` when the lookup raises
    ``KeyError``, so that missing words can be removed with ``dropna``.
    """
    try:
        vector = word2vec[w]
    except KeyError:
        vector = np.nan
    return vector

# Attach the embedding of every lower-cased token and discard the tokens that have none
# (w2v returns NaN when the word2vec lookup raises KeyError).
_data['word_vec'] = _data['token_lower'].apply(w2v)
_data.dropna(subset=['word_vec'], inplace=True)
_data

### Multi-Layer Perceptron Regression

Using scikit-learn MLPRegressor

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Feature matrix built from the tokens' word vectors, one row per token.
X = np.array(_data.word_vec.tolist())
X.shape

In [None]:
# Regression target: the per-token score column of `_data`.
y = _data.score
y.shape

In [None]:
# Multi-layer perceptron with a single hidden layer of 92 units, trained with
# the default 'adam' solver.
# - `learning_rate='adaptive'` was dropped: scikit-learn only uses that
#   argument when solver='sgd', so it had no effect here.
# - `random_state` fixes the weight initialisation and batch shuffling, so
#   re-running the notebook reproduces the same model.
regr = MLPRegressor(hidden_layer_sizes=(92,), max_iter=300, verbose=True, tol=1e-5, random_state=42)

In [None]:
# Train the regressor to predict a token's score from its word vector.
regr.fit(X,y)

In [None]:
# Persist the fitted regressor with pickle.
# NOTE(review): the inference cell further down loads 'model_temp.sklearn', not this
# file — confirm which name is intended.
with open('model.sklearn', 'wb') as model_file:
    pickle.dump(regr, model_file)

In [None]:
# Sanity check: print the first five targets next to the model's predictions for the same rows.
print(y[:5])
print(regr.predict(X[:5]))

In [None]:
# Pick one labelled text (row label 2000), tokenise it, and print its token count,
# industry label and raw text.
sample_text = raw.text[2000]
_x = normalize_text(sample_text)
print("Number of tokens in the text:" , len(_x))
print("Industry = ", raw.industry[2000])
print("Text")
print(raw.text[2000])

In [None]:
# Score the distinct tokens of the sample text with the trained regressor.
_X = (
    pd.DataFrame({'token': _x})
    .assign(token_lower=lambda frame: frame['token'].map(str.lower))
    # keep the first occurrence of each lower-cased token
    .drop_duplicates(subset=['token_lower'])
    .assign(word_vec=lambda frame: frame['token_lower'].apply(w2v))
    # tokens without a word2vec embedding cannot be scored
    .dropna(subset=['word_vec'])
)
# Predict all scores in one batched call. The earlier per-row
# `float(regr.predict(...))` issued one predict call per token and relied on
# converting a 1-element array to a scalar, which newer NumPy deprecates.
# The guard keeps an empty token table working.
_sample_scores = regr.predict(np.array(_X['word_vec'].tolist())) if len(_X) else np.empty(0)
_X = _X.assign(score=_sample_scores)
_X

In [None]:
# The ten tokens of the sample text with the highest predicted score.
_X[['token', 'score']].sort_values(by=['score'], ascending=False).head(10)

In [None]:
# Inspect the per-industry counts recorded for one word.
# word_dist is a defaultdict(dict), so looking up an absent word inserts an empty dict for it.
word_dist['starlight']

In [8]:
# NOTE(review): `w2v` is also defined in the training section of this notebook with the
# same logic. Presumably it is repeated so this cell can run without the training
# cells — confirm, and consider keeping a single definition.
def w2v(w):
    """Return the word2vec vector of ``w``, or ``np.nan`` when the lookup raises KeyError."""
    try:
        v = word2vec[w]
        return v
    except KeyError:
        return np.nan

# Build one row per distinct (document id, lower-cased token) that has a word2vec embedding.
# NOTE(review): only the first 10 rows of the file are read (nrows=10).
data = pd.read_csv('unlabeled.csv', usecols=['id', 'text'], nrows=10)
data = (
    data.assign(token=data['text'].apply(lambda x: normalize_text(str(x))))
    .explode('token')
    # Texts with an empty token list explode to NaN; drop them so that NaN is
    # not turned into the bogus token 'nan' and scored.
    .dropna(subset=['token'])
    .assign(token_lower=lambda frame: frame['token'].map(str.lower))
    .drop_duplicates(subset=['id', 'token_lower'])
    .assign(word_vec=lambda frame: frame['token_lower'].apply(w2v))
    .dropna(subset=['word_vec'])
    .reset_index(drop=True)
)

# NOTE(review): the training section saves the model as 'model.sklearn', while this
# cell loads 'model_temp.sklearn' — confirm which file is intended.
# SECURITY: pickle.load can execute arbitrary code; only load model files you trust.
with open('model_temp.sklearn', 'rb') as model_file:
    regr = pickle.load(model_file)

# Predict all scores in one batched call instead of one predict call per row.
# The guard keeps an empty table working.
data['score'] = regr.predict(np.array(data['word_vec'].tolist())) if len(data) else np.empty(0)

In [35]:
# Renumber the rows 0..n-1 and display the scored tokens.
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,id,text,token,token_lower,word_vec,score
0,1500001,Home-Luxury Move Management 0 Skip to Content ...,Move,move,"[-0.061279297, 0.14355469, -0.2109375, 0.06884...",0.210412
1,1500001,Home-Luxury Move Management 0 Skip to Content ...,Management,management,"[-0.15332031, 0.103515625, -0.08984375, -0.094...",0.080397
2,1500001,Home-Luxury Move Management 0 Skip to Content ...,Skip,skip,"[0.20996094, 0.17578125, 0.13085938, 0.5, -0.0...",0.119760
3,1500001,Home-Luxury Move Management 0 Skip to Content ...,Content,content,"[0.013305664, -0.0009994507, 0.080078125, 0.00...",0.386163
4,1500001,Home-Luxury Move Management 0 Skip to Content ...,Services,services,"[-0.032714844, -0.010803223, 0.08691406, 0.033...",0.159546
...,...,...,...,...,...,...
2192,1500010,Energy Solutions Technology Loading... \r \r H...,Sorry,sorry,"[-0.001876831, 0.20214844, 0.15136719, -0.1650...",0.447735
2193,1500010,Energy Solutions Technology Loading... \r \r H...,enable,enable,"[-0.2578125, 0.026611328, 0.060791016, -0.0947...",0.151823
2194,1500010,Energy Solutions Technology Loading... \r \r H...,JavaScript,javascript,"[-0.067871094, -0.41992188, -0.203125, 0.375, ...",0.199442
2195,1500010,Energy Solutions Technology Loading... \r \r H...,visit,visit,"[-0.12597656, 0.12451172, 0.0035858154, 0.0156...",0.083575


In [39]:
# For every document id keep its ten highest scores.
# The result is indexed by (id, row label of `data`).
results = data.groupby(['id'])['score'].nlargest(10)
results

id           
1500001  29      0.990873
         78      0.986189
         79      0.922461
         34      0.918327
         33      0.900953
                   ...   
1500010  1910    1.000615
         2122    0.986283
         1904    0.972004
         1951    0.960539
         1984    0.930361
Name: score, Length: 100, dtype: float64

In [49]:
# Look up the token stored in the row at position 3.
data.iloc[3].token

'Content'

In [51]:
from collections import defaultdict

# output[document id] -> tokens of that document's highest scores, in the order of `results`.
output = defaultdict(list)

# The index of `results` holds (document id, row label of `data`) pairs.
# The second element is an index label, so the token is read with the
# label-based `.at` rather than positional `iloc`, which only matched because
# `data` carries a default 0..n-1 index.
for doc_id, row_label in results.index:
    output[doc_id].append(data.at[row_label, 'token'])

output

defaultdict(list,
            {1500001: ['downsizing',
              'Unpacking',
              'PreListing',
              'divorcing',
              'relocating',
              'renovation',
              'Agents',
              'furniture',
              'GTA',
              'Estate'],
             1500002: ['guaranty',
              'Dewar',
              'REITs',
              'Lessors',
              'insureds',
              'Truckers',
              'Corpus',
              'Surety',
              'Casualty',
              'insurer'],
             1500003: ['Valiant',
              'ZA',
              'Finders',
              'ICO',
              'Lawyers',
              'PR',
              'Elections',
              'lawyer',
              'Pierre',
              'Islamic'],
             1500004: ['recurring',
              'USD',
              'refund',
              'original',
              'returned',
              'Buyer',
              'works',
              'vary',
     

In [55]:
# Reshape the mapping into a list of {'id', 'snippet'} records, one per document.
[{'id':key, 'snippet':tokens} for key, tokens in output.items()]

[{'id': 1500001,
  'snippet': ['downsizing',
   'Unpacking',
   'PreListing',
   'divorcing',
   'relocating',
   'renovation',
   'Agents',
   'furniture',
   'GTA',
   'Estate']},
 {'id': 1500002,
  'snippet': ['guaranty',
   'Dewar',
   'REITs',
   'Lessors',
   'insureds',
   'Truckers',
   'Corpus',
   'Surety',
   'Casualty',
   'insurer']},
 {'id': 1500003,
  'snippet': ['Valiant',
   'ZA',
   'Finders',
   'ICO',
   'Lawyers',
   'PR',
   'Elections',
   'lawyer',
   'Pierre',
   'Islamic']},
 {'id': 1500004,
  'snippet': ['recurring',
   'USD',
   'refund',
   'original',
   'returned',
   'Buyer',
   'works',
   'vary',
   'monthsLong',
   'conversion']},
 {'id': 1500005,
  'snippet': ['enthusiastically',
   'contracting',
   'Plot',
   'wellrounded',
   'Donald',
   'leavers',
   'installs',
   'Contractors',
   'sq',
   'tutoring']},
 {'id': 1500006,
  'snippet': ['lastminute',
   'Facials',
   'AMB',
   'Brows',
   'Freelancer',
   'Lashes',
   'Weaves',
   'Tanning',
   '

In [53]:
# Show the first lines of the raw unlabeled CSV file.
!head unlabeled.csv

id,text,country,region,locality,founded,industry,size
1500001,"Home-Luxury Move Management 0 Skip to Content Our Services Company Our Team Testimonials FAQ Recent Projects Podcasts Real Estate Agents What's For Sale Contact Us 416-937-0499 Open Menu Close Menu Our Services Company Our Team Testimonials FAQ Recent Projects Podcasts Real Estate Agents What's For Sale Contact Us 416-937-0499 Open Menu Close Menu Our Services Folder: Company Back Our Team Testimonials FAQ Recent Projects Podcasts Real Estate Agents What's For Sale Contact Us 416-937-0499  MANAGEMENT SOLUTIONS FOR YOUR MOVELuxury Move Management provides solutions for your move from start to finish. Whether you are downsizing, undergoing a renovation, estate clearing, relocating, divorcing, moving into a condo, a new home or retirement residence, we have the experience to streamline your to-do list and make the entire experience more enjoyable. We currently service the GTA, Thornhill and Oakville.   Book An Appointment   