# M14-Linear Algebra and Text Processing


In [None]:
from nose.tools import assert_almost_equal, assert_true, assert_equal, assert_raises
from numbers import Number

In [None]:

import numpy as np
import locale
import os
import re
import pandas as pd
from textblob import TextBlob

## **Problem 1 (30 points).** Compute the Euclidean norm for each of the following vectors
$$v_1= \begin{bmatrix}5\\-1\\1\\0\\-5\end{bmatrix}$$

In [None]:
# Exercise placeholder: norm_v1 should end up holding the Euclidean norm of
# v_1 = [5, -1, 1, 0, -5]. Until the stub below is replaced the cell raises.
norm_v1 = None
# YOUR CODE HERE
raise NotImplementedError()

$$v_2= \begin{bmatrix}2\\ 3\\ 9\\ 4\end{bmatrix}$$

In [None]:
# Exercise placeholder: norm_v2 should end up holding the Euclidean norm of
# v_2 = [2, 3, 9, 4]. Until the stub below is replaced the cell raises.
norm_v2 = None
# YOUR CODE HERE
raise NotImplementedError()

$$v_3=\begin{bmatrix}-2, -3, -7, -5\end{bmatrix}^T$$

In [None]:
# Exercise placeholder: norm_v3 should end up holding the Euclidean norm of
# v_3 = [-2, -3, -7, -5]. Until the stub below is replaced the cell raises.
norm_v3 = None
# YOUR CODE HERE
raise NotImplementedError()

$$v_4=\begin{bmatrix}-10,   6,   8,   0,  -8,   7\end{bmatrix}^T$$

In [None]:
# Exercise placeholder: norm_v4 should end up holding the Euclidean norm of
# v_4 = [-10, 6, 8, 0, -8, 7]. Until the stub below is replaced the cell raises.
norm_v4 = None
# YOUR CODE HERE
raise NotImplementedError()

$$v_5=\begin{bmatrix}6,  -5,  -7, -10,  -6,  -2,  -2\end{bmatrix}^T$$

In [None]:
# Exercise placeholder: norm_v5 should end up holding the Euclidean norm of
# v_5 = [6, -5, -7, -10, -6, -2, -2]. Until the stub below is replaced the cell raises.
norm_v5 = None
# YOUR CODE HERE
raise NotImplementedError()

$$v_6= \begin{bmatrix}4\\  2\\ -8\end{bmatrix}$$

In [None]:
# Exercise placeholder: norm_v6 should end up holding the Euclidean norm of
# v_6 = [4, 2, -8]. Until the stub below is replaced the cell raises.
norm_v6 = None
# YOUR CODE HERE
raise NotImplementedError()

## Problem #2 (10 points)

One of the limitations of word vectors as we have pictured them is [sparsity](https://en.wikipedia.org/wiki/Sparse_array): while our vocabulary is large (tens or hundreds of thousands of words), a typical document (e.g. a radiology report) only has tens or hundreds of unique words. Write a class (`sparsev`) that inherits from `defaultdict` to represent a "sparse" vector. The keys are the indices into the vector and the values are the word counts (how many times that word occurred in the document). The class should have an attribute `self.__dim` that holds the dimension of the vector space (e.g. the size of the vocabulary) and a property `dim` that returns the value stored in `__dim`. You should define the following methods for the class:

1. `norm`: Accepts as an argument a number `p` (default value=2) and computes the [p-norm](https://en.wikipedia.org/wiki/Norm_(mathematics)#p-norm) of the vector.
    1. If `p` is not a number, raise a `ValueError`.
    1. If $p \le 0$, raise a `ValueError`.
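1. `inner`: Accepts as an argument an instance of a `sparsev` and returns the inner (dot) product of the two vectors.
    1. If the two `sparsev` instances do not have the same `dim`, raise a `ValueError`.
1. `cosine_sim`: Accepts as an argument an instance of a `sparsev` and returns the angle between the two vectors, i.e. the `arccos` of their cosine similarity (so a vector compared with itself gives 0).
    1. If the two `sparsev` instances do not have the same `dim`, raise a `ValueError`.
    1. If you get a `ZeroDivisionError`, return `np.nan`.
    1. Note that the domain of `arccos` is -1 to 1. Because of floating point arithmetic issues, you may have arguments that should be equal to 1 but are slightly larger. This will result in `arccos` returning a `np.nan`. In these cases, return a 0 (zero).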
1. a `__str__` method that shows the dimension of the vector as well as the elements (key/value pairs).

In [None]:
from collections import defaultdict
class sparsev(defaultdict):
    """Sparse vector of word counts built on ``defaultdict`` (exercise stub).

    Keys identify vector components and values are the counts stored there.
    The cells that use this class rely on the following interface:

    * construction as ``sparsev(int, dim=N)``, where ``dim`` is the dimension
      of the vector space, kept in ``self.__dim`` and exposed through the
      read-only property ``dim``;
    * ``inner(other)``: inner product with another ``sparsev``; raises
      ``ValueError`` when the two ``dim`` values differ;
    * ``norm(p=2)``: the p-norm; raises ``ValueError`` when ``p`` is not a
      number or ``p <= 0``;
    * ``cosine_sim(other)``: raises ``ValueError`` on a ``dim`` mismatch,
      returns ``np.nan`` on ``ZeroDivisionError`` and 0 when ``arccos``
      yields ``nan``;
    * ``__str__``: text that includes the dimension and the key/value pairs.

    As written, the class body raises ``NotImplementedError`` when the class
    statement is executed, so the class cannot be used until the stub is
    replaced.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# inner() must raise ValueError when the operands have different dimensions
# (5 versus 3 here).
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_raises(ValueError, tmp1.inner, tmp2)

In [None]:
# The two vectors share no keys, so their inner product is expected to be 0.
tmp1 = sparsev(int, dim=10)
tmp2 = sparsev(int, dim=10)
tmp1["Brian"] = 3
tmp1["Wendy"] = 4
tmp2["Argos"] = 9
tmp2["Helios"] = 2
# Bare call: the result is discarded; the assertion below repeats the call.
tmp1.inner(tmp2)
assert_almost_equal(tmp1.inner(tmp2), 0)

In [None]:
# "Brian" is the only key present in both vectors (3 in tmp1, 2 in tmp2),
# so the inner product is expected to be 3 * 2 = 6.
tmp1 = sparsev(int, dim=10)
tmp2 = sparsev(int, dim=10)
tmp1["Brian"] = 3
tmp1["Wendy"] = 4
tmp2["Argos"] = 9
tmp2["Brian"] = 2
# Bare call: the result is discarded; the assertion below repeats the call.
tmp1.inner(tmp2)
assert_almost_equal(tmp1.inner(tmp2), 6)


In [None]:
# The string form of a dim=5 vector must mention its dimension ("5").
# tmp2 is created here but only tmp1 is checked.
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_true("5" in tmp1.__str__())

In [None]:
# The string form of a dim=3 vector must mention its dimension ("3").
# tmp1 is created here but only tmp2 is checked.
tmp1 = sparsev(int, dim=5)
tmp2 = sparsev(int, dim=3)
assert_true("3" in tmp2.__str__())

## Test on MIMIC2 radiology reports

In [None]:
import pymysql
import pandas as pd
import getpass
from textblob import TextBlob

In [None]:
# Open the connection to the course MySQL server that hosts the mimic2 schema.
# `password` / `database` are the current PyMySQL keyword names; `passwd` and
# `db` are deprecated aliases kept only for MySQLdb compatibility.
# NOTE(review): the credentials are hard-coded; presumably these are the
# defaults of the course container — confirm before reusing elsewhere.
conn = pymysql.connect(host="mysql",
                       port=3306, user="jovyan",
                       password="jovyan", database="mimic2")
# A cursor for ad-hoc queries on the same connection.
cursor = conn.cursor()

### Get some documents. Limit the query to keep corpus small for debugging

In [None]:
# Load the distinct (subject_id, hadm_id, text) rows whose category is
# 'RADIOLOGY_REPORT' into a DataFrame, capped at 20000 rows.
# NOTE(review): the query has LIMIT but no ORDER BY, so which rows are returned
# (and in what order) is left to the server; later cells address rows 50 and
# 157 by position — confirm the result order is stable in the target database.
rad_data = \
pd.read_sql("""SELECT DISTINCT noteevents.subject_id, 
                      noteevents.hadm_id,
                      noteevents.text 
               FROM noteevents
               WHERE noteevents.category = 'RADIOLOGY_REPORT' 
               LIMIT 20000""",conn)
# Show the first five rows as the cell output.
rad_data.head(5)

## Create a vocabulary

We are first going to replace all digits in the reports with a "d" and convert all characters to lower case. This reduces our vocabulary size by approximately half. We will then use `TextBlob` and sets to get all the unique words in our document. This is our vocabulary. The vocabulary is represented as a dictionary which we create with the `zip` function.

In [None]:
# Join every lower-cased report into one string and mask each digit with "d".
# The pattern is a raw string: "\d" in a normal string literal is an invalid
# escape sequence that Python only tolerates with a warning.
reports = re.sub(r"\d", "d", " ".join(r.lower() for r in rad_data["text"]))

# The distinct tokens TextBlob finds in the corpus.
words = set(TextBlob(reports).words)

# Assign each distinct word an integer index (its coordinate in the vector space).
vocabulary = dict(zip(words, range(len(words))))
print(len(vocabulary))

In [None]:
# Peek at the first ten (word, index) pairs of the vocabulary.
list(vocabulary.items())[:10]

## Problem 3 (20 points):

Write a function `doc2vec` that takes two arguments: a `txt` (a text to convert to a vector) and `voc` (the vocabulary). It returns a `sparsev` instance that is the representation of `txt` in the `voc` vector space. Because `txt` may contain words that are not in the vocabulary, you will need to do exception handling.

In [None]:
def doc2vec(txt, voc):
    """Represent a text as a ``sparsev`` in the vocabulary's vector space (exercise stub).

    Args:
        txt: The text to convert to a vector.
        voc: The vocabulary; the notebook builds it as a dict mapping each
            word to an integer index.

    Returns:
        A ``sparsev`` instance representing ``txt`` in the ``voc`` vector
        space. Words of ``txt`` that are missing from ``voc`` have to be
        handled with exception handling.

    Raises:
        NotImplementedError: Always, until the stub below is replaced.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    

In [None]:
# Vectorise reports 50 and 157 (digits masked with "d") and check the 2-norm
# of report 50. The regex pattern is a raw string: "\d" in a normal literal is
# an invalid escape sequence.
v50 = doc2vec(re.sub(r"\d", "d", rad_data.loc[50, "text"]), vocabulary)
v157 = doc2vec(re.sub(r"\d", "d", rad_data.loc[157, "text"]), vocabulary)
assert_almost_equal(v50.norm(), 66.59579566308972)

In [None]:
# Vectorise reports 50 and 157 (digits masked with "d") and check the 2-norm
# of report 157. The regex pattern is a raw string: "\d" in a normal literal
# is an invalid escape sequence.
v50 = doc2vec(re.sub(r"\d", "d", rad_data.loc[50, "text"]), vocabulary)
v157 = doc2vec(re.sub(r"\d", "d", rad_data.loc[157, "text"]), vocabulary)
assert_almost_equal(v157.norm(), 95.40440241414439)

### The cosine similarity of a document with itself is 1, so `cosine_sim` (which returns the `arccos` angle) should be 0

In [None]:
# Comparing report 50 with itself: cosine_sim is expected to return 0.
# The regex pattern is a raw string: "\d" in a normal literal is an invalid
# escape sequence.
v50 = doc2vec(re.sub(r"\d", "d", rad_data.loc[50, "text"]), vocabulary)
v157 = doc2vec(re.sub(r"\d", "d", rad_data.loc[157, "text"]), vocabulary)
assert_almost_equal(v50.cosine_sim(v50), 0)

In [None]:
# Comparing report 50 with report 157 against the recorded reference value.
# The regex pattern is a raw string: "\d" in a normal literal is an invalid
# escape sequence.
v50 = doc2vec(re.sub(r"\d", "d", rad_data.loc[50, "text"]), vocabulary)
v157 = doc2vec(re.sub(r"\d", "d", rad_data.loc[157, "text"]), vocabulary)
assert_almost_equal(v50.cosine_sim(v157), 0.7430023871608521)

### Create a column in `rad_data` that has values equal to the similarity between the reports and `v50`.

In [None]:
# For every report: mask digits with "d", vectorise it, and store
# v50.cosine_sim(...) against that vector in the "50sim" column.
# The regex pattern is a raw string: "\d" in a normal literal is an invalid
# escape sequence.
rad_data["50sim"] = rad_data.apply(
    lambda r: v50.cosine_sim(doc2vec(re.sub(r"\d", "d", r["text"]), vocabulary)),
    axis=1,
)

## Problem 4 (10 points):

Create a DataFrame `rad_data2` that contains only the rows of `rad_data` where `rad_data["50sim"]` is greater than zero. Sort `rad_data2` by ascending values of `50sim`.

In [None]:
# Exercise placeholder for Problem 4: rad_data2 should become the rows of
# rad_data whose "50sim" is greater than zero, sorted by ascending "50sim".
# Until the stub below is replaced the cell raises.
rad_data2 = None
# YOUR CODE HERE
raise NotImplementedError()
# Only reached once the stub above has been replaced: shows the last rows.
rad_data2.tail()

In [None]:
# The first row after sorting (smallest positive "50sim") must be subject 463.
assert_equal(rad_data2["subject_id"].iloc[0], 463)

In [None]:
# The last row after sorting (largest "50sim") must be subject 10315.
assert_equal(rad_data2["subject_id"].iloc[-1], 10315)

## What do the most similar and most dissimilar (relative to 50) reports look like?

In [None]:
# The reference report that every other report was compared against.
print(rad_data["text"].iloc[50])

In [None]:
# Text of the first row of rad_data2 (smallest positive "50sim").
print(rad_data2["text"].iloc[0])

In [None]:

# Text of the last row of rad_data2 (largest "50sim").
print(rad_data2["text"].iloc[-1])