#####  Text Preprocessing

     Load merged job search results
     Load the pretrained word2vec model (downloaded from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit)
     Remove stop words from the description field
     For each word in a given description, fetch its 300-dimensional word vector
     Add a new column to the dataframe to hold the word vectors
     Save the result as pickles
     

In [1]:
import pandas as pd

In [2]:
# Data directory, relative to the notebook's working directory. The backslash
# is escaped explicitly so the literal no longer depends on an invalid escape
# sequence; the resulting string is unchanged.
path = '..\\data'
df = pd.read_parquet(path + '\\data_scientist_merged_01_09_2019.parquet')

In [29]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [6]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re


# Word tokenizer used by remove_stopwords.
tokenizer1 = ToktokTokenizer()

# Fetch the NLTK resources (no-op when already present), then load the
# English stop-word list through the corpus reader imported above.
nltk.download('stopwords')
nltk.download('punkt')
stopword_list = stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    """Replace non-alphanumeric characters with ", " and drop English stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    is_lower_case : bool, default False
        When True, tokens are compared against ``stopword_list`` as they are;
        otherwise each token is lower-cased for the comparison only.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # ``A-Z`` rather than ``A-z``: the latter range also covers the ASCII
    # characters between 'Z' and 'a' (brackets, backslash, caret, underscore,
    # backtick), which would therefore never be replaced.
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, ", ", text)
    tokens = [token.strip() for token in tokenizer1.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    return ' '.join(filtered_tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sismc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sismc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Clean every description. Applying over the single column avoids building a
# full row object per record just to read one field.
df['desc'] = df['description'].apply(remove_stopwords)

In [13]:
from gensim.models.keyedvectors import KeyedVectors
# Load the pretrained GoogleNews vectors stored in word2vec binary format.
model = KeyedVectors.load_word2vec_format(
    path + '\\GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [16]:
def getVector(str):
    """Return ``model[str]`` when ``str`` is in ``model``, otherwise ``None``."""
    # NOTE: the parameter name shadows the builtin ``str`` inside this function.
    return model[str] if str in model else None
        
def isInModel(str):
     """Return whether ``str`` is contained in ``model``."""
     return str in model

In [31]:
# Fit a Keras tokenizer on the cleaned descriptions and report how many
# documents and distinct words it saw.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['desc'].tolist())
print(tokenizer.document_count, len(tokenizer.word_counts))

# Convert each description into its sequence of integer word indices.
list_tokenized_train = tokenizer.texts_to_sequences(df['desc'])

2202 16835


In [37]:
# Length of the longest tokenized description; used as the padding target.
maxlen = np.max(list(map(len, list_tokenized_train)))

In [38]:
# +1 so that the largest word index is still a valid row of the embedding matrix.
vocab_size = len(tokenizer.word_index) + 1
# Pad every index sequence to the length of the longest description.
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

In [65]:
# One 300-dimensional row per tokenizer index; rows for words that have no
# pretrained vector are left as zeros.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = getVector(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [71]:
# Embedding dimensionality; matches the 300 columns of embedding_matrix.
embed_size = 300


###### Simple Model

In [90]:
# Shape of the padded index matrix fed to the model.
X_t.shape

(2202, 1049)

In [137]:
from keras.layers import Input, Dense,Conv1D,MaxPooling1D,UpSampling1D,Embedding,LSTM,Flatten
from keras.models import Model


# Dense autoencoder over the padded index sequences:
# embedding -> flatten -> 128 -> 64 -> 32 (bottleneck) -> 64 -> sequence length.
# The sequence length is read from X_t.shape rather than from the sample
# X_t[1], so the graph can also be built when X_t has a single row.
input_ = Input(shape=X_t.shape[1:])
x = Embedding(vocab_size, embed_size, weights=[embedding_matrix], trainable=True)(input_)
x = Flatten()(x)
encoded = Dense(units=128, activation='relu')(x)
encoded = Dense(units=64, activation='relu')(encoded)
encoded = Dense(units=32, activation='relu')(encoded)
decoded = Dense(units=64, activation='relu')(encoded)
# NOTE(review): the sigmoid output is bounded to [0, 1], while the fit call
# uses X_t (integer word indices) as the target — confirm this is intended.
decoded = Dense(units=X_t.shape[1], activation='sigmoid')(decoded)

In [138]:
# Sub-model that stops at `x`, the flattened embedding output.
embed = Model(input_,x)
embed.summary()

Model: "model_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 1049)              0         
_________________________________________________________________
embedding_15 (Embedding)     (None, 1049, 300)         5050800   
_________________________________________________________________
flatten_8 (Flatten)          (None, 314700)            0         
Total params: 5,050,800
Trainable params: 5,050,800
Non-trainable params: 0
_________________________________________________________________


In [139]:
# Full autoencoder (input -> decoded) and the encoder half (input -> encoded);
# both are built on the same layers.
autoencoder=Model(input_, decoded)
encoder = Model(input_, encoded)
autoencoder.summary()

Model: "model_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 1049)              0         
_________________________________________________________________
embedding_15 (Embedding)     (None, 1049, 300)         5050800   
_________________________________________________________________
flatten_8 (Flatten)          (None, 314700)            0         
_________________________________________________________________
dense_51 (Dense)             (None, 128)               40281728  
_________________________________________________________________
dense_52 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_53 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_54 (Dense)             (None, 64)                211

In [140]:
# Layers from the input up to the 32-unit `encoded` output.
encoder.summary()

Model: "model_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 1049)              0         
_________________________________________________________________
embedding_15 (Embedding)     (None, 1049, 300)         5050800   
_________________________________________________________________
flatten_8 (Flatten)          (None, 314700)            0         
_________________________________________________________________
dense_51 (Dense)             (None, 128)               40281728  
_________________________________________________________________
dense_52 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_53 (Dense)             (None, 32)                2080      
Total params: 45,342,864
Trainable params: 45,342,864
Non-trainable params: 0
______________________________________________

In [141]:
# NOTE(review): the model is later fit with X_t as both input and target.
# X_t holds integer word indices from pad_sequences, while the output layer is
# a sigmoid — confirm that binary cross-entropy and accuracy are meaningful
# for these targets.
autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [142]:
# Train the autoencoder to reproduce its own input; 20% of the samples are
# held out for validation.
autoencoder.fit(X_t, X_t, validation_split=0.2,
                epochs=50,
                batch_size=256)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1761 samples, validate on 441 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50


Epoch 50/50


<keras.callbacks.callbacks.History at 0x277938cd438>

#####  libraries

In [5]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
import numpy as np
import pandas as pd
# Word tokenizer used by remove_stopwords in this section.
tokenizer = ToktokTokenizer()

# Fetch the NLTK resources (no-op when already present), then load the
# English stop-word list through the corpus reader imported above.
nltk.download('stopwords')
nltk.download('punkt')
stopword_list = stopwords.words('english')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sismc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sismc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### load data set

In [None]:
# Data directory, relative to the notebook's working directory. The backslash
# is escaped explicitly so the literal no longer depends on an invalid escape
# sequence; the resulting string is unchanged.
path = '..\\data'
df = pd.read_parquet(path + '\\data_scientist_merged_01_09_2019.parquet')

In [None]:
# Peek at the first record to see the available columns.
df.head(1)

> Note: Only Job Description field will be processed in this notebook

In [None]:
# Keep only the job id and its description text.
df_sub = df[['id','description']]

In [None]:
# Row/column count before de-duplication.
df_sub.shape

In [None]:
# Keep one row per job id, then show the resulting shape.
df_sub = df_sub.drop_duplicates('id')
df_sub.shape

In [None]:
# The full frame is no longer needed once df_sub exists.
del df

######  word2vec model

In [None]:
# Load the pretrained GoogleNews vectors stored in word2vec binary format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    path + '\\GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [None]:
def remove_stopwords(text, is_lower_case=False):
    """Replace non-alphanumeric characters with ", " and drop English stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    is_lower_case : bool, default False
        When True, tokens are compared against ``stopword_list`` as they are;
        otherwise each token is lower-cased for the comparison only.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # ``A-Z`` rather than ``A-z``: the latter range also covers the ASCII
    # characters between 'Z' and 'a' (brackets, backslash, caret, underscore,
    # backtick), which would therefore never be replaced.
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, ", ", text)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    return ' '.join(filtered_tokens)
# Function to get the embedding vector for n dimension, we have used "300"
def get_embedding(word):
    """Return the pretrained vector for ``word``, or zeros when it is unknown.

    Parameters
    ----------
    word : str
        Token to look up in ``model``.

    Returns
    -------
    numpy.ndarray
        ``model[word]`` when the word is present, otherwise ``np.zeros(300)``.
    """
    # Membership is tested on the vectors object itself, as getVector and
    # isInModel do; ``model.wv.vocab`` is deprecated in gensim 3.x and not
    # available in gensim 4.
    if word in model:
        return model[word]
    return np.zeros(300)

def remove_embed(text):
    """Strip stop words from ``text`` and embed each remaining token.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    numpy.ndarray
        Array with one 300-dimensional row per remaining token; shape
        ``(0, 300)`` when no token is left.
    """
    filt_text = remove_stopwords(text)
    # remove_stopwords returns a space-joined string, so split it back into
    # tokens; iterating the string itself would look up single characters.
    vectors = [get_embedding(word) for word in filt_text.split()]
    if not vectors:
        # Keep a 2-D result even when nothing survives the filtering.
        return np.zeros((0, 300))
    return np.array(vectors)
    

In [None]:
# Store, for every description, the array of word vectors built by remove_embed.
df_sub['word_vec'] = df_sub.apply(lambda x: remove_embed(x['description']),axis=1)

In [None]:
# Shape of the vector array stored for the first description.
df_sub['word_vec'].iloc[0].shape

In [None]:
# Frame shape after adding the word_vec column.
df_sub.shape

In [None]:

# Write the frame to disk in consecutive chunks. `m` and `l` are the first and
# last (inclusive) row positions of the current chunk; `l` also names the file.
m=0
l=50
while m<len(df_sub):
    print(m,l)
    # iloc's stop bound is exclusive, so l+1 is required to include row `l`;
    # with iloc[m:l] the rows at positions 50, 101, 152, ... were never saved.
    df_sub.iloc[m:l+1].to_pickle(path+'\\word_encoding\\'+str(l)+'encoded_description.pkl',protocol=2)
    m=m+51
    l=l+51

###### test a pickle

In [None]:
# Read one chunk back from the directory the chunks were written to, instead
# of a machine-specific absolute path.
# NOTE: only unpickle files you produced yourself; unpickling can run code.
pd.read_pickle(path+'\\word_encoding\\1937encoded_description.pkl')