# M14-Linear Algebra and Text Processing


In [1]:
from nose.tools import assert_almost_equal, assert_true, assert_equal, assert_raises
from numbers import Number

In [2]:

import numpy as np
import locale
import os
import re
import pandas as pd
from textblob import TextBlob

## **Problem 1 (30 points).** Compute the Euclidean norm for each of the following vectors
$$v_1= \begin{bmatrix}5\\-1\\1\\0\\-5\end{bmatrix}$$

In [4]:
norm_v1 = np.linalg.norm(np.array([[5,-1,1,0,-5]]).T)

$$v_2= \begin{bmatrix}2\\ 3\\ 9\\ 4\end{bmatrix}$$

In [5]:
norm_v2 = np.linalg.norm(np.array([[2,3,9,4]]).T)

$$v_3=\begin{bmatrix}-2, -3, -7, -5\end{bmatrix}^T$$

In [6]:
norm_v3 = np.linalg.norm(np.array([[-2,-3,-7,-5]]).T)

$$v_4=\begin{bmatrix}-10,   6,   8,   0,  -8,   7\end{bmatrix}^T$$

In [7]:
norm_v4 = np.linalg.norm(np.array([[-10,6,8,0,-8,7]]).T)

$$v_5=\begin{bmatrix}6,  -5,  -7, -10,  -6,  -2,  -2\end{bmatrix}^T$$

In [8]:
norm_v5 = np.linalg.norm(np.array([[6,-5,-7,-10,-6,-2,-2]]).T)

$$v_6= \begin{bmatrix}4\\  2\\ -8\end{bmatrix}$$

In [9]:
norm_v6 = np.linalg.norm(np.array([[4,2,-8]]).T)

## Problem #2 (10 points)

One of the limitations of word vectors as we have pictured them is [sparsity](https://en.wikipedia.org/wiki/Sparse_array): while our vocabulary is large (tens or hundreds of thousands of words), a typical document (e.g. radiology report) only has tens or hundreds of unique words. Write a class (`sparsev`) that inherits from a `defaultdict` to represent a "sparse" vector. The keys would be the indicies to a vector and the values would be the word counts (how many times that word occurred in the document). The class should have an attribute `self.__dim` that indicates the dimension of the vector space (e.g. the dimension of the vocabulary). The class should have a property `dim` that returns the value in `__dim` the instance of `sparsev` represents. You should define the following methods for the class:

1. `norm`: Accepts as an argument a number `p` (default value=2) and computes the [p-norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm) of the vector.
    1. If `p` is not a number, raise a `ValueError`.
    1. If $p \le 0$, raise a `ValueError`.
1. `cosine_sim`: Accepts as an argument an instance of a `sparsev`.
    1. If the two `sparsev` instances do not have the same `dim` raise a ValueError
    1. If you get a `ZeroDivisionError`, return `np.nan`
    1. Note that the range of `arccos` is -1 to 1. Because of floating point arithmetic issues, you may have arguments that should be equal to 1 but are slightly larger. This will result in `arccos` returning a `np.nan`. In these cases, return a 0 (zero).
1. a `__str__` method that shows the dimension of the vector as well as the elements (key/value pairs).

In [10]:
from collections import defaultdict
from math import *
class sparsev(defaultdict):
    
    def __init__(self, *args, dim=1, **kwargs):
        self.__dim = dim
        
        super(sparsev, self).__init__(*args,**kwargs)
    
    
    @property
    def dim(self):
        return self.__dim
    
    
    def norm(self,p=2):
        if not isinstance(p,int) or isinstance(p,float):
            raise ValueError('p not a number!')
        if p<=0:
            raise ValueError('p should be positive!')
        else:
            return np.linalg.norm(np.array(list(self.values())), ord = p)
        
    def cosine_sim(self, dict2):
        if self.dim != dict2.dim :
            raise ValueError('dictionaries do not have the same dimension')
        else:
            try:
                c_s = sum([self[k]*dict2[k] for k in set(self.keys()) & set(dict2.keys())]) / (np.linalg.norm(np.array(list(self.values())))*np.linalg.norm(np.array(list(dict2.values()))))
                
                if c_s > 1:
                    return 0
                else:
                    return math.acos(c_s)
            except ZeroDivisionError:
                return np.nan
            if c_s > 1:
                return 0
    
    def inner(self, dict2):
        if self.dim != dict2.dim :
            raise ValueError('dictionaries do not have the same dimension')
        
        else:
            INN = sum([self[k]*dict2[k] for k in set(self.keys()) & set(dict2.keys())])
            return INN
    
    def __str__(self):
        return "dim:%s: "%self.__dim + super(sparsev,self).__str__()

In [11]:
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_raises(ValueError, tmp1.inner, tmp2)

In [12]:
tmp1 = sparsev(int, dim=10)
tmp2 = sparsev(int, dim=10)
tmp1["Brian"] = 3
tmp1["Wendy"] = 4
tmp2["Argos"] = 9
tmp2["Helios"] = 2
tmp1.inner(tmp2)
assert_almost_equal(tmp1.inner(tmp2), 0)

In [13]:
tmp1 = sparsev(int, dim=10)
tmp2 = sparsev(int, dim=10)
tmp1["Brian"] = 3
tmp1["Wendy"] = 4
tmp2["Argos"] = 9
tmp2["Brian"] = 2
tmp1.inner(tmp2)
assert_almost_equal(tmp1.inner(tmp2), 6)


In [14]:
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_true("5" in tmp1.__str__())

In [15]:
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_true("3" in tmp2.__str__())

## Test on MIMIC2 radiology reports

In [16]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob

In [17]:
conn = pymysql.connect(host="mysql",
                       port=3306,user="jovyan",
                       passwd="jovyan",db='mimic2')
cursor = conn.cursor()

### Get some documents. Limit the query to keep corpus small for debugging

In [18]:
rad_data = \
pd.read_sql("""SELECT DISTINCT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' 
               LIMIT 20000""",conn)
rad_data.head(5)

Unnamed: 0,subject_id,hadm_id,text
0,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:53 AM\n ...
1,56,28766.0,\n\n\n DATE: [**2644-1-17**] 10:43 AM\n ...
2,56,28766.0,\n\n\n DATE: [**2644-1-17**] 6:37 AM\n ...
3,56,28766.0,\n\n\n DATE: [**2644-1-19**] 12:09 PM\n ...
4,37,18052.0,\n\n\n DATE: [**3264-8-14**] 6:06 AM\n ...


## Create a vocabulary

We are first going to replace all digits in the reports with a "d" and convert all characters to lower case. This reduces our vocabulary size by approximately half. We will then use `TextBlob` and sets to get all the unique words in our document. This is our vocabulary. The vocabulary is represented as a dictionary which we create with the `zip` function.

In [19]:
reports = re.sub("\d", "d", " ".join([r.lower() for r in rad_data["text"]]))

words = set(TextBlob(reports).words)

vocabulary = dict(zip(words,range(len(words))))
print(len(vocabulary))

24727


In [20]:
list(vocabulary.items())[:10]

[('described', 0),
 ('ddiv', 12310),
 ('size/contour', 12311),
 ('pvl', 12312),
 ('us-guided', 1),
 ('parchemical', 12313),
 ('amino', 12314),
 ('breakage', 2034),
 ('d.dd-mm', 12315),
 ('glass', 12316)]

## Problem 3 (20 points):

Write a function `doc2vec` that takes two arguments: a `txt` (a text to convert to a vector) and `voc` (the vocabulary). It returns a `sparsev` instance that is the representation of `txt` in the `voc` vector space. Because `txt` may contain words that are not in the vocabulary, you will need to do exception handling.

In [29]:
from collections import defaultdict
import math

def doc2vec(txt, voc):
    txt_d = sparsev(int, dim = len(voc)) 
    words = TextBlob(txt.lower()).words
    for word in words:
        try:
            txt_d[voc[word]]+= 1
        except KeyError:
            pass
    return txt_d  
    

In [30]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v50.norm(), 66.59579566308972)

In [31]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v157.norm(), 95.40440241414439)

### Cosine similarity of a document with itself should be 1

In [32]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v50.cosine_sim(v50), 0)

In [33]:
v50 = doc2vec(re.sub("\d", "d", rad_data.loc[50,"text"]), vocabulary)
v157 = doc2vec(re.sub("\d", "d", rad_data.loc[157,"text"]), vocabulary)
assert_almost_equal(v50.cosine_sim(v157), 0.7430023871608521)

### Create a column in `rad_data` that has values equal to the similarity between the reports and `v50`.

In [34]:
rad_data["50sim"] = rad_data.apply(lambda r: v50.cosine_sim(doc2vec(re.sub("\d", "d", r["text"]), vocabulary)), axis=1)



## Problem 4 (10 points):

Create a DataFrame `rad_data2` that only contains rows where rad_data["50sim"] is greater than zero. Sort the `rad_data2` by ascending values of `50sim`.

In [35]:
rad_data2 = rad_data.loc[rad_data["50sim"] > 0 ]
rad_data2 = rad_data2.sort_values(["50sim"],ascending=True)
rad_data2

Unnamed: 0,subject_id,hadm_id,text,50sim
852,463,,\n\n\n DATE: [**3334-11-8**] 2:43 PM\n ...,0.505251
11410,6659,,\n\n\n DATE: [**3126-7-25**] 7:53 AM\n ...,0.511020
2056,1120,826.0,\n\n\n DATE: [**3022-10-5**] 2:09 AM\n ...,0.518197
11120,6638,,\n\n\n DATE: [**3326-8-19**] 2:24 PM\n ...,0.519177
10868,6464,23593.0,\n\n\n DATE: [**3072-4-20**] 8:50 PM\n ...,0.519620
13919,8192,,\n\n\n DATE: [**3166-9-23**] 10:38 AM\n ...,0.521305
16320,9633,13553.0,\n\n\n DATE: [**3312-3-4**] 7:40 PM\n ...,0.532298
6789,3664,24555.0,\n\n\n DATE: [**2540-3-6**] 1:06 AM\n ...,0.533466
9033,5150,,\n\n\n DATE: [**2781-8-11**] 6:30 PM\n ...,0.537909
19269,11280,5231.0,\n\n\n DATE: [**2694-4-2**] 2:11 PM\n ...,0.539583


In [36]:
assert_equal(rad_data2.iloc[0]["subject_id"], 463)

In [37]:
assert_equal(rad_data2.iloc[-1]["subject_id"], 10315)

## What do the most similar and most dissimilar (relative to 50) reports look like?

In [38]:
print(rad_data.iloc[50]["text"])




     DATE: [**3353-1-26**] 5:37 PM
     CT CHEST W/CONTRAST; CT 150CC NONIONIC CONTRAST                 Clip # [**Clip Number (Radiology) 1672**]
     Reason: please eval new RLL inflitrate seen on CXR and r/o empyema  
     Admitting Diagnosis: NON-HODGKINS LYMPHOMA;FEBRILE;NEUTROPENIA
     Field of view: 34 Contrast: OPTIRAY Amt: 100
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
        55 year old man with enteropathy associated t cell lymphoma and neutropenic.  
       Tachycardic, tachypneic, resp. alkalosis on abg.
     REASON FOR THIS EXAMINATION:
      please eval new RLL inflitrate seen on CXR and r/o empyema                      
     No contraindications for IV contrast
     ______________________________________________________________________________
                                     FINAL REPORT
     INDICATION:  Enteropathy associated T-cell lymphoma and neutropenia,
     tachycardic and tachy

In [39]:
print(rad_data2.iloc[0]["text"])




     DATE: [**3334-11-8**] 2:43 PM
     CT ABD W&W/O C; CT CHEST W/CONTRAST                             Clip # [**Clip Number (Radiology) 3604**]
     CT PELVIS W/CONTRAST; CT 150CC NONIONIC CONTRAST
     Reason: evaluate disease progression
     Field of view: 32 Contrast: OPTIRAY Amt: 150
     ______________________________________________________________________________
     UNDERLYING MEDICAL CONDITION:
      62 year old F with lung cancer. CT chest and abd prior to treatment
     REASON FOR THIS EXAMINATION:
      evaluate disease progression
     ______________________________________________________________________________
                                     FINAL REPORT
     INDICATION:  Lung CA.
     
     TECHNIQUE:  Non-contrast images of the abdomen were obtained.  Contrast
     enhanced images of the abdomen, as well as delayed images of the chest,
     abdomen and pelvis following administration of 150 cc of Optiray were
     performed.  Nonionic IV contrast was used 

In [40]:

print(rad_data2.iloc[-1]["text"])




     DATE: [**2558-2-12**] 1:09 AM
     IVC GRAM/FILTER                                                 Clip # [**Clip Number (Radiology) 3326**]
     Reason: POST SUCTION EMBO
      Contrast: OPTIRAY Amt: 30
     ********************************* CPT Codes ********************************
     * 37620 INTERUP IVC                    36010 INTRO CATH SVC/IVC            *
     * -51 MULTI-PROCEDURE SAME DAY         75940 PERC PLCMT IVC FILTER         *
     * 75825 IVC GRAM                       C1880 VENA CAVA FILTER              *
     ****************************************************************************
     ______________________________________________________________________________
                                     FINAL REPORT
     please see clip [**Clip Number (Radiology) 3321**].

             DR. [**First Name4 (NamePattern1) **] [**Last Name (NamePattern1) **]
             DR. [**First Name11 (Name Pattern1) **] [**Initial (NamePattern1) **]. [**Last Name (NameP