# Word2Vec Embeddings

In [1]:
import numpy as np
import pandas as pd
import csv
import pickle
import time
import math
import collections
from tqdm import tqdm

In [19]:
!pip install pyspark
!pip install -U -q PyDrive
!sudo apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

openjdk-8-jdk-headless is already the newest version (8u282-b08-0ubuntu1~18.04).
openjdk-8-jdk-headless set to manually installed.
The following packages were automatically installed and are no longer required:
  libaio1 librados2 librbd1
Use 'sudo apt autoremove' to remove them.
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [21]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

initialise spark context

In [22]:
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [23]:
spark

## Step 1: Read in Data

In [2]:
dirPath = '/home/ubuntu/Biomed-Data-Science-NLP-Project/Data/'
patientData_filepath = dirPath + 'B220_SAA_v1.csv'
CCSR_filepath = dirPath + 'ICD_to_CCSR_20201_1.csv'

In [3]:
def read_csv_to_dict(file_path: str, key: int, value: int):
    ret_dict = {}
    with open(file_path, newline='') as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        for row in data:
            ret_dict[row[key][1:-1]] = row[value][1:-1] # "'icd_code'"
    print("Reading {} complete!".format(file_path))
    return ret_dict

In [4]:
icd_to_ccsr_code = read_csv_to_dict(CCSR_filepath, key=0, value=6) # 'icd_code' -> 'CCSR_code'
ccsr_codes = list(np.unique(list(icd_to_ccsr_code.values())))
ccsr_codes.remove('CCSR CATEGORY 1')

ccsr_code_to_index = collections.defaultdict(int)
for i, ccsr_code in enumerate(ccsr_codes):
    ccsr_code_to_index[ccsr_code] = i
    
print("{} CCSR categories".format(len(ccsr_code_to_index.keys())))

Reading /home/ubuntu/Biomed-Data-Science-NLP-Project/Data/ICD_to_CCSR_20201_1.csv complete!
519 CCSR categories


In [5]:
s1 = time.time()
icd_codes_table = pd.read_csv(patientData_filepath, usecols=range(16,41))
s2 = time.time()
print("Read data in {} minutes".format((s2-s1)/60))
corpus = [[elem for elem in row if type(elem) == str] for row in icd_codes_table.values.tolist()]
# corpus = [[icd1, icd2, icd3], [icd1, icd2],...]
s3 = time.time()
print("Corpus complete in {} minutes".format((s3-s1)/60))

Read data in 2.4394046465555825 minutes
Corpus complete in 6.848367337385813 minutes


In [None]:
def sentenceToCCSR(sentence):
    ccsrSentence = []
    for ICD_code in sentence:
        try:
            ccsr_code = self.icd_to_ccsr[ICD_code]
            ccsrSentence.append(ccsr_code)
        except:
            pass
    return ccsrSentence

In [10]:
df = pd.DataFrame(columns = ['sentences'])
testing = [[1,2],[3,4,5]]
for t in testing:
    df.append(t)

In [26]:
import pandas as pd    
data = [['Scott', 50], ['Jeff', 45], ['Thomas', 54],['Ann',34]] 
# Create the pandas DataFrame 
pandasDF = pd.DataFrame(data, columns = ['Name', 'Age']) 
#Create PySpark DataFrame from Pandas
sparkDF=spark.createDataFrame(pandasDF) 
sparkDF.printSchema()
sparkDF.show()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)

+------+---+
|  Name|Age|
+------+---+
| Scott| 50|
|  Jeff| 45|
|Thomas| 54|
|   Ann| 34|
+------+---+



In [16]:
df = pd.DataFrame([p for p in testing], columns = ['sentences'])

ValueError: 1 columns passed, passed data had 3 columns

In [None]:
# Cycle through each sentence in corpus
for s, inSentence in enumerate(corpus):
    sentence = self.sentenceToCCSR(inSentence) # ICD -> CCSR
    df.

## Train Word2Vec model

In [11]:
# v_count = 502 --> number ofunique words
embed_size=100
model = Word2Vec().setVectorSize(embed_size).setSeed(42).setMinCount(1).fit(doc)