# Word2Vec Embeddings - Using PySpark and PyArrow (distributed ML)

In [1]:
import numpy as np
import pandas as pd
import csv
import pickle
import time
import math
import collections
from collections import Counter
from collections import defaultdict
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

# !pip install pyspark
# !pip install -U -q PyDrive
# !sudo apt install openjdk-8-jdk-headless -qq
# !pip install pyarrow
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

Initialise the Spark context

In [2]:
# Build the Spark configuration: local mode on every available core, with
# 35G limits for executor memory, driver memory and collected result size.
conf = (SparkConf()
        .set("spark.ui.port", "4050")
        .setMaster('local[*]')
        .set('spark.executor.memory', '35G')
        .set('spark.driver.memory', '35G')
        .set('spark.driver.maxResultSize', '35G'))

# Create the context, or reuse the one that is already running.
# getOrCreate keeps this cell re-runnable: calling the SparkContext constructor
# a second time in the same kernel raises because only one context may be active.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Arrow is what makes the pandas <-> Spark DataFrame conversions really fast.
# NOTE(review): newer Spark releases appear to prefer the key
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm for the installed version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Report the driver memory actually in effect (public accessor instead of sc._conf);
# printed explicitly because only the last expression of a cell is displayed.
print("spark.driver.memory =", sc.getConf().get('spark.driver.memory'))
spark

In [5]:
# Root folder for the project data; every other path in the notebook is built from it.
dirPath = '/home/ubuntu/BioMedProject/Data/'
# Pickle files that cache the train / test corpora.
# NOTE(review): the two file names differ in casing ("ICD" vs "iCD"); left untouched
# because renaming would orphan files that may already exist on disk.
trainCorpusFilename, testCorpusFilename = (
    dirPath + corpus_file
    for corpus_file in ("w2v_training_Corpus_ICD.pkl", "w2v_testing_Corpus_iCD.pkl")
)

# Step 1: Convert raw data to `CORPUS` (skip if already saved)

In [4]:
# Positional range of the ICD-code columns in the raw CSVs (columns 7..31 inclusive).
ICD_COL_START, ICD_COL_END = 7, 32
# Every code is truncated to its first 3 characters.
ICD_PREFIX_LEN = 3
# A visit needs at least this many codes to be kept as a 'sentence'.
MIN_CODES_PER_VISIT = 2


def _icd_frame_to_corpus(icd_frame):
    """Turn a pandas frame of ICD-code columns into a list of sentences.

    Each row (visit) becomes one string of space-separated code prefixes.

    Parameters
    ----------
    icd_frame : pandas.DataFrame
        One row per visit, one column per ICD-code slot. Cells that are not
        strings (e.g. missing values) are skipped.

    Returns
    -------
    list of str
        One sentence per visit that has at least ``MIN_CODES_PER_VISIT`` codes;
        visits with fewer codes are dropped.
    """
    corpus = []
    for visit in icd_frame.values.tolist():
        codes = [code[:ICD_PREFIX_LEN] for code in visit if isinstance(code, str)]
        if len(codes) >= MIN_CODES_PER_VISIT:
            corpus.append(" ".join(codes))
    return corpus


# 1. Read in train/test datasets
train = spark.read.load(dirPath + "train",
                        format="csv", sep=",", inferSchema="true", header="true")
test = spark.read.load(dirPath + "test",
                       format="csv", sep=",", inferSchema="true", header="true")

# 2. Filter for columns with ICD Codes and convert to Pandas DF
#    (toPandas collects the selected columns onto the driver)
trainICDCodes = train.select(train.columns[ICD_COL_START:ICD_COL_END]).toPandas()
testICDCodes = test.select(test.columns[ICD_COL_START:ICD_COL_END]).toPandas()

# 3-5. Truncate the codes, drop visits with fewer than 2 codes and join each
#      remaining visit into a single space-separated string
trainCorpus = _icd_frame_to_corpus(trainICDCodes)
testCorpus = _icd_frame_to_corpus(testICDCodes)

In [6]:
# 6. Persist both corpora as pickle files so Step 1 can be skipped on later runs
for corpus_path, corpus in ((trainCorpusFilename, trainCorpus),
                            (testCorpusFilename, testCorpus)):
    with open(corpus_path, 'wb') as out_file:
        pickle.dump(corpus, out_file)

# Step 2: Load the train `CORPUS` (which excludes visits with fewer than 2 ICD codes)

In [7]:
# 1. Restore the pickled training corpus.
#    NOTE: pickle.load executes arbitrary code from the file -- only use it on
#    files written by this notebook.
with open(trainCorpusFilename, 'rb') as corpus_file:
    trainCorpus = pickle.load(corpus_file)

# 2. Wrap the sentences in a one-column pandas DataFrame, then release the list
trainDf = pd.DataFrame(data=trainCorpus, columns=['sentences'])
del trainCorpus

# 3. Hand the pandas frame over to Spark
trainSparkDF = spark.createDataFrame(trainDf)

# 4. Peek at the first sentences and report the corpus size
trainSparkDF.show(n=5)
sentence_count_millions = trainSparkDF.count() / 1e6
print("{} million train sentences".format(sentence_count_millions))

+--------------------+
|           sentences|
+--------------------+
| T83 R82 N40 R33 F17|
|     F10 E11 I10 I63|
|J69 E46 N17 I48 I...|
|             T24 T31|
|             M54 M79|
+--------------------+
only showing top 5 rows

18.283298 million train sentences


# Step 3: Train Word2Vec

In [8]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml import Pipeline

# Dimensionality of each learned embedding vector
EMBED_LENGTH = 16

# 1. Split every sentence into tokens.
#    Spark's Tokenizer lower-cases the text before splitting on whitespace,
#    so the vocabulary entries end up lower-case.
tokenizer = Tokenizer(outputCol="tokens", inputCol="sentences")
tokenized = tokenizer.transform(trainSparkDF).select("tokens")

# 2. Configure (but do not yet train) the Word2Vec estimator
w2v = Word2Vec(
    vectorSize=EMBED_LENGTH,
    minCount=1,      # keep every token, however rare
    windowSize=25,   # NOTE(review): presumably sized to the 25 ICD columns -- confirm
    seed=1234,       # fixed seed for reproducible training
    inputCol="tokens",
    outputCol="features",
)

In [9]:
# 3. Train Word2Vec Model
start = time.time()
word2vec_model = w2v.fit(tokenized)
end = time.time()
print("Time taken: {} mins".format((end - start)/60))

Time taken: 13.968408679962158 mins


In [10]:
# Display the first 20 (word, vector) pairs learned by the model
learned_vectors = word2vec_model.getVectors()
learned_vectors.show(n=20, truncate=True)

+----+--------------------+
|word|              vector|
+----+--------------------+
| k40|[-0.0542779192328...|
| f25|[-0.2230365127325...|
| k46|[-0.2039844840764...|
| j67|[-0.1282166838645...|
| g63|[0.00770874321460...|
| a28|[-0.0551535189151...|
| i82|[-0.4175004065036...|
| q25|[0.25801816582679...|
| g65|[-0.0121308276429...|
| c55|[-0.6497572064399...|
| k65|[-0.7179201841354...|
| d74|[-0.2180854082107...|
| c71|[-0.0597165934741...|
| p58|[0.15644863247871...|
| n07|[0.16744512319564...|
| h82|[0.44940406084060...|
| k03|[-0.5938100814819...|
| m96|[0.03103973902761...|
| b47|[-0.1402707397937...|
| o46|[0.08599423617124...|
+----+--------------------+
only showing top 20 rows



In [11]:
# Persist the fitted model under a name that records the embedding size.
path = dirPath + "W2V_Models/w2v_ICD_embed_{}.model".format(EMBED_LENGTH)
# Spark ML persists a model as a directory, so os.remove() cannot clear a
# previous save (it raises on directories). The writer's overwrite() mode
# replaces an existing model instead, which keeps this cell re-runnable.
word2vec_model.write().overwrite().save(path)

# Step 4: Testing — transform sentences of choice into vectors using the trained model

In [None]:
# Reload the persisted model from disk and preview three of its word vectors
loaded_model = Word2VecModel.read().load(path)
word2vec_df = loaded_model.getVectors()
word2vec_df.show(n=3)

In [None]:
# Two hand-written sentences to push through the trained model.
# NOTE(review): these tokens do not look like the 3-character codes the model was
# trained on, so they may be out-of-vocabulary -- confirm this is intended.
test_df = pd.DataFrame({'sentences': ["prg028 prv028 prv028", "prv028"]})
# Convert to Spark and tokenize with the same tokenizer used for training
testing_sparkDF = tokenizer.transform(spark.createDataFrame(test_df)).select("tokens")
testing_sparkDF.show(n=2)

In [None]:
# Time the embedding of the test sentences.
# DataFrame transformations are lazy: transform() only builds the query plan, and
# the work happens when an action runs. The show() action therefore sits inside
# the timed region so the reported time covers the actual computation.
tic = time.time()
res = loaded_model.transform(testing_sparkDF)
res.show(3)
print("time taken : {}s".format(time.time() - tic))

In [None]:
# res.toPandas().head().loc[:,"features"][0]