# Word2Vec Embeddings - Using PySpark and PyArrow (distributed ML)

In [2]:
import numpy as np
import pandas as pd
import csv
import pickle
import time
import math
import collections
from collections import Counter
from collections import defaultdict
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

import pyspark
from pyspark.sql import *
from pyspark.sql import functions as F
from pyspark import SparkContext, SparkConf

Initialise the Spark context and session

In [3]:
# Spark configuration: local master on all cores, UI on port 4050, and 35G
# each for executor memory, driver memory and the driver's max result size.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .set("spark.ui.port", "4050")
    .set('spark.executor.memory', '35G')
    .set('spark.driver.memory', '35G')
    .set('spark.driver.maxResultSize', '35G')
)

# Start the context from that configuration, then obtain the SQL session.
sc = SparkContext(conf=conf)
session_builder = SparkSession.builder.config('spark.sql.shuffle.partitions', 300)
spark = session_builder.getOrCreate()

# Arrow is enabled to speed up the pandas <-> Spark DataFrame conversions.
# NOTE(review): newer Spark releases name this key
# 'spark.sql.execution.arrow.pyspark.enabled' - confirm against the installed version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Read the driver memory setting back from the context's configuration.
sc._conf.get('spark.driver.memory')

# Display the session as the cell output.
spark

In [4]:
# Root data directory (trailing slash included) and the pickle file that
# stores the Word2Vec training corpus.
dirPath = '/home/ubuntu/BioMedProject/Data/'
trainCorpusFilename = f"{dirPath}w2v_training_Corpus_ICD_combined_visits.pkl"

# Step 1: Convert raw data to `CORPUS` (skip if already saved)

In [10]:
# 1. Read in train dataset
train_df = spark.read.load(dirPath + "train", format="csv", sep=",", inferSchema="true", header="true")
# Raw string so that `\d` reaches the regex engine untouched instead of being
# parsed as an (invalid) Python string escape.
train_df = train_df.select(train_df.colRegex(r"`[IDx_\dprin]+`")) # Selects ID + ICD Code columns

# 2. Filter for columns with ICD Codes and convert to Pandas DF
#    (the first selected column is treated as the ID, the rest as codes)
trainICDCodes = train_df.select(train_df.columns[1:]).toPandas()

# 3. Convert Pandas to a list of lists
#    Non-string cells (e.g. missing values) are dropped; every code is
#    truncated to its first three characters.
trainCorpus = [
    [elem[:3] for elem in row if isinstance(elem, str)]
    for row in trainICDCodes.values.tolist()
]

# 4. Create list of patient IDs (one entry per row, aligned with trainCorpus)
patient_id_df = train_df.select(train_df.columns[0]).toPandas()
patientIDs = [row[0] for row in patient_id_df.values.tolist()]

In [43]:
# 5. Generate dict mapping of patient_id -> all ICD codes across all visits
# Both inputs are per-row lists and must line up one-to-one.
assert len(patientIDs) == len(trainCorpus)

visits = {}
for idx, (patient_id, codes) in enumerate(zip(patientIDs, trainCorpus)):
    # Current row's codes first, then the codes accumulated from earlier rows.
    # A new list is built on purpose: extending `codes` in place would mutate
    # the rows of trainCorpus and make re-running this cell duplicate codes.
    visits[patient_id] = codes + visits.get(patient_id, [])

    # Progress report every 5 million rows.
    if idx % 5_000_000 == 0:
        print(f"Completed {idx / 1e6} million visits")

print(f"Completed {len(patientIDs) / 1e6} million visits")

# Sanity check: exactly one entry per distinct patient ID.
# (The previous version compared against an undefined name, `trainID`.)
assert len(visits) == len(set(patientIDs))

Completed 0.0 million visits
Completed 5.0 million visits
Completed 10.0 million visits
Completed 15.0 million visits
Completed 20.0 million visits
Completed 22.002111 million visits


In [50]:
# 6. Generate corpus: one list of codes per patient, in the dict's iteration order
corpus = list(visits.values())

# 7. Generate trainCorpus: one sentence per patient, each sentence being the
#    patient's ICD codes joined by a single space
sentence = " "
trainCorpus = [sentence.join(patient_codes) for patient_codes in corpus]

# 8. Save train corpus to disk as a pickle
with open(trainCorpusFilename, 'wb') as handle:
    pickle.dump(trainCorpus, handle)

# Step 2: Load the train `CORPUS`

In [56]:
# 1. Load the list of sentences back from the pickle file
#    (pickle can execute arbitrary code on load - only open files you produced yourself)
with open(trainCorpusFilename, 'rb') as handle:
    trainCorpus = pickle.load(handle)

# 2. Wrap the sentences in a single-column Pandas DataFrame, then drop the
#    list reference so it no longer keeps a second copy alive
trainDf = pd.DataFrame({'sentences': trainCorpus})
del trainCorpus

# 3. Convert Pandas to PySpark
trainSparkDF = spark.createDataFrame(trainDf)

# 4. Visualize the first sentences and report the total count in millions
trainSparkDF.show(5)
n_sentences_millions = trainSparkDF.count() / 1e6
print("{} million train sentences".format(n_sentences_millions))

+--------------------+
|           sentences|
+--------------------+
|     L30 R21 R50 Z00|
|Z46 R33 N39 F17 R...|
|I87 R60 I10 G40 Z...|
|I50 I12 N18 E78 Z...|
|R51 G89 R07 R51 R...|
+--------------------+
only showing top 5 rows

4.502821 million train sentences


# Step 3: Train Word2Vec

In [72]:
# Accumulator for per-model training durations.
times = list()

In [None]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml import Pipeline

EMBED_LENGTHS = [64, 128, 256] # 16, 32

# 1. Tokenize sentences
# Tokenization does not depend on the embedding size, so it is defined once
# outside the loop.
# NOTE(review): Tokenizer appears to lowercase its input (the saved vocabulary
# shows 'z95' while the sentences contain 'Z00') - keep in mind when looking up codes.
tokenizer = Tokenizer(inputCol="sentences", outputCol="tokens")
tokenized = tokenizer.transform(trainSparkDF).select("tokens")

for embed in EMBED_LENGTHS:
    # 2. Initialize Word2Vec Model (fixed seed, context window of 25 tokens)
    w2v = (
        Word2Vec(vectorSize=embed, minCount=1, inputCol="tokens", outputCol="features")
        .setSeed(1234)
        .setWindowSize(25)
    )

    # 3. Train Word2Vec Model and record the wall-clock duration in minutes
    start = time.time()
    word2vec_model = w2v.fit(tokenized)
    elapsed_mins = (time.time() - start) / 60

    print(f"Time taken: {elapsed_mins} mins for embed length {embed}")
    times.append(elapsed_mins)

    # 4. Persist the model. Spark writes a model as a directory, which
    #    os.remove() cannot delete; let the writer replace any previous save.
    path = dirPath + f"W2V_Models/w2v_ICD_Combined_Visits_embed_{embed}.model"
    word2vec_model.write().overwrite().save(path)

In [77]:
# Display the recorded training durations (minutes, one entry per trained model).
times

[47.76017210483551, 70.53435916105906, 125.14494510094325, 236.3829428911209]

# Save Embeddings in Tabular Form

In [22]:
from pyspark.ml.feature import Word2Vec, Word2VecModel

# Embedding size of the saved model to load, and the data directory it lives under.
EMBED_LENGTH = 256
dirPath = '/home/ubuntu/BioMedProject/Data/'

# Rebuild the path the model was saved to and load it back.
embedFilePath = f"{dirPath}W2V_Models/w2v_ICD_Combined_Visits_embed_{EMBED_LENGTH}.model"
loaded_model = Word2VecModel.load(embedFilePath)

In [23]:
# Pull the learned vectors into Pandas, ordered alphabetically by word.
word_vectors = loaded_model.getVectors()
df_embed = word_vectors.toPandas().sort_values("word")
df_embed

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,word,vector
89,041,"[-0.02264483831822872, 0.02762742154300213, -0..."
197,070,"[0.005771160125732422, 0.004969617817550898, 0..."
1432,129,"[0.018249496817588806, 0.048011571168899536, -..."
794,148,"[0.011404353193938732, 0.011679872870445251, -..."
1037,151,"[0.06326667219400406, -0.12140163034200668, 0...."
...,...,...
1528,z95,"[-0.06973341852426529, -0.006060560699552298, ..."
1172,z96,"[0.043614502996206284, 0.05406169965863228, 0...."
676,z97,"[-0.2829343378543854, -0.18949846923351288, -0..."
995,z98,"[-0.18805629014968872, -0.08867216110229492, 0..."


In [24]:
# Export the embeddings as headerless CSV: one row per word, the word first,
# followed by each component of its vector.
# The context manager guarantees the file is flushed and closed even if a row
# fails to serialise part-way through.
with open(f"/home/ubuntu/BioMedProject/embed_{EMBED_LENGTH}.csv", "w") as csvFile:
    for word, embed in df_embed.values:
        components = embed.toArray().tolist()
        row = ",".join([word, *(f"{val}" for val in components)])
        csvFile.write(row + "\n")