# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** R
**Names:**

* Raphael Strebel
* Raphaël Barman
* Thierry Bossy

---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markodown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of codes in the notebook, you can also embed long python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [27]:
import pickle
import numpy as np
from scipy.sparse import csr_matrix
from utils import load_json, load_pkl
import string
import re
from operator import itemgetter
import nltk
import math

from bokeh.plotting import figure, output_notebook,show, ColumnDataSource
from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
from bokeh.layouts import widgetbox
from IPython.core.display import HTML
HTML("""
<style>
.bk-root .bk-slick-header-column.bk-ui-state-default {
height: 25px!important;
}
</style>
""")


from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

output_notebook()

courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

## Exercise 4.1: Pre-processing

In [41]:
id2name = dict(map(itemgetter('courseId', 'name'),courses))
name2id = {v: k for k,v in id2name.items()}
np.save('id2name', id2name)
np.save('name2id', name2id)

In [3]:
lmtzr = WordNetLemmatizer()
stemmer = PorterStemmer()

# Add a word to the bag of word given as parameter
def add2bow(word, bow):
    #newbow = bow.copy()
    if word not in bow:
        bow[word] = 0
    bow[word] += 1
    return bow

# Merges to bag of words
def mergeBow(bow1, bow2):
    #newbow = bow1.copy()
    for word, occ in bow2.items():
        if word not in bow1:
            bow1[word] = 0
        bow1[word] += occ
    return bow1

# Returns the bag of words of a text as a dictionary, so the different words as keys and their number of occurence as value
def getBagOfWords(text):
    bow = {}
    text = text.replace('\xa0', ' ')
    # Remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
    text = text.split(' ')
    for idx in range(len(text)):
        word = text[idx]
        # separate words such that "MyNameIsChristian" becomes "My" "Name" "Is" "Christian"
        res = re.findall('[a-zA-Z][^A-Z]*',word)
        if res:
            if len(min(res,key=len)) != 1:
                if len(res) > 0:
                    text[idx] = ''
                for match in res:
                    text.append(match)
    text = [x for x in text if x != '']
    for idx in range(len(text)):
        word = text[idx]
        # Keep words that are only upper case as such (we don't want IT to become it) and put all others as lower case
        if not word.isupper():
            word = word.lower()
        # Lemmatize all non-digit words 
        if not word.isdigit() and not word in stopwords:
            bow = add2bow(lmtzr.lemmatize(word),bow)
            #add2bow(stemmer.stem(word),bow)
#     for word in textCopy:
#         if not word.isupper():
#             word = word.lower()
#         if not word.isdigit():
#             text.add(lmtzr.lemmatize(word))
#     for word in stopwords:
#         try:
#             text.remove(word)
#         except KeyError:
#             continue
    return bow


# Compute bag of words for the description of every course, then merge them and return the global bag of words
def getGlobalBagOfWords():
    globalBagOfWords = {}
    bagOfWords = {}
    for course in courses:
        #localBow = {}
        localBow = getBagOfWords(course['description'])
        bagOfWords[course['courseId']] = localBow
        localBow = mergeBow(globalBagOfWords,localBow)
    occurences = sorted(globalBagOfWords.items(), key=itemgetter(1))
    # We remove all words with occurences < minBound and > maxBound
    minBound = occurences[9][1]
    maxBound = occurences[-9][1]
    globalBagOfWords = {k: v for k,v in globalBagOfWords.items() if v > minBound and v < maxBound}
    for course in bagOfWords.keys():
        bagOfWords[course] = {k: v for k,v in bagOfWords[course].items() if k in globalBagOfWords}
    return globalBagOfWords, bagOfWords

In [4]:
globalBagOfWords = {}
globalBagOfWords, bagOfWords = getGlobalBagOfWords()
print(sum(globalBagOfWords.values()))
print(len(globalBagOfWords.keys()))
#getBagOfWords(courses[1]['description'])
#for course in courses:
#    bow = getBagOfWords(course['description'])
#    bagOfWords[course['courseId']] = bow
#    mergeBow(globalBagOfWord,bow)
#test_course = course.copy()

375351
6947


In [5]:
sortedWords = sorted(bagOfWords[name2id['Internet analytics']].items(), key=itemgetter(0))
print(sortedWords)

[('COM300', 1), ('acquired', 1), ('activity', 1), ('ad', 2), ('advertisement', 1), ('algebra', 2), ('algorithm', 2), ('analytics', 2), ('analyze', 1), ('application', 2), ('assessment', 1), ('auction', 2), ('balance', 1), ('based', 2), ('basic', 3), ('cathedra', 1), ('chain', 1), ('class', 3), ('cloud', 1), ('clustering', 2), ('collection', 1), ('combination', 1), ('communication', 1), ('community', 2), ('computing', 2), ('concept', 2), ('concrete', 1), ('coverage', 1), ('current', 1), ('data', 6), ('datasets', 2), ('decade', 1), ('dedicated', 1), ('designed', 1), ('detection', 2), ('develop', 1), ('dimensionality', 1), ('draw', 1), ('ecommerce', 2), ('effectiveness', 1), ('efficiency', 1), ('end', 1), ('exam', 1), ('expected', 1), ('explore', 4), ('explores', 1), ('field', 1), ('final', 1), ('foundational', 1), ('framework', 1), ('function', 1), ('fundamental', 1), ('good', 1), ('graph', 2), ('hadoop', 2), ('handson', 1), ('homework', 2), ('important', 1), ('information', 2), ('infras

We chose to remove all punctuation and all stopwords since there really is no interest in keeping them.
We also lemmatize the words using the nltk library, to keep track of similar words and have a more accurate word occurence count. 

In [6]:
test_course = courses[1]
bagOfWords = {}
#globalBagOfWord = {}
#getBagOfWords(courses[1]['description'])
print(test_course)
bow = getBagOfWords(test_course['description'])
bagOfWords[test_course['courseId']] = bow
#mergeBow(globalBagOfWord,bow)
print(bow)
dict(sorted(bagOfWords.items(), key=itemgetter(1), reverse=True)[:5])


{'name': 'Image Processing for Life Science', 'courseId': 'BIO-695', 'description': 'This course intends to teach image processing with a strong emphasis of applications in life sciences. The idea is to enable the participants to solve image processing questions via workflows independently. Content Over the last decades, the images arising from microscopes in Life Sciences went from being a qualitative support of scientific evidence to a quantitative resource. To obtain good quality data from digital images, be it from a photograph of a Western blot, a TEM slice or a multi-channel confocal time-lapse stack, scientists must understand the underlying processes leading to the extracted information. Of similar importance is the software used to obtain the data. This course makes use of the ImageJ (FIJI package) as well as other open-source tools to ensure maximum reproducibility and protocol transfer of the analysis pipelines. The course will span 14 weeks with 1h30 of lecture per week, as

{'BIO-695': {'FIJI': 2,
  'TEM': 1,
  'aim': 1,
  'analysis': 3,
  'application': 1,
  'arising': 1,
  'assessment': 1,
  'autonomous': 1,
  'autonomously': 1,
  'biology': 1,
  'blot': 1,
  'complete': 1,
  'completed': 1,
  'confocal': 1,
  'content': 1,
  'context': 1,
  'continuous': 1,
  'cover': 1,
  'creation': 2,
  'data': 5,
  'decade': 1,
  'deconvolution': 1,
  'defined': 1,
  'denoising': 1,
  'digital': 3,
  'emphasis': 2,
  'enable': 2,
  'ensure': 1,
  'establish': 1,
  'evidence': 1,
  'exercise': 4,
  'extracted': 1,
  'extraction': 1,
  'filtering': 2,
  'format': 1,
  'goal': 1,
  'good': 2,
  'h30': 1,
  'handed': 1,
  'homework': 1,
  'hour': 1,
  'idea': 1,
  'image': 12,
  'imagej': 2,
  'importance': 1,
  'independently': 1,
  'information': 1,
  'intends': 1,
  'interest': 1,
  'introduce': 1,
  'involve': 1,
  'keywords': 1,
  'leading': 1,
  'learning': 1,
  'lecture': 2,
  'life': 3,
  'linear': 1,
  'machine': 1,
  'macro': 3,
  'make': 1,
  'manipulation':

In [9]:
min(globalBagOfWords,key=globalBagOfWords.get)
dict(sorted(globalBagOfWords.items(), key=itemgetter(1), reverse=False)[:5000])
# Maybe discard only the 3 most used words? since system and design are more specific than learning, student and system

{'interconnect': 3,
 'postprimary': 3,
 'inaction': 3,
 'others6': 3,
 'mechanosensory': 3,
 'hyperelliptic': 3,
 'FDD': 3,
 'WINGS': 3,
 'FRP': 3,
 'valentine': 3,
 'curable': 3,
 'epifourier': 3,
 'applications10': 3,
 'emblematic': 3,
 'microdispersed': 3,
 'propagator': 3,
 'photopolymers': 3,
 'equivallent': 3,
 'adaption': 3,
 'aidistributed': 3,
 'techniquesformulate': 3,
 'gyromagnetic': 3,
 'perl': 3,
 'brief': 3,
 'refreshment': 3,
 'poroelasticity': 3,
 'metaloxide': 3,
 'ah20identify': 3,
 'kirchoffs': 3,
 'lawson': 3,
 'EE332': 3,
 'BBC': 3,
 'skundin': 3,
 'oksana': 3,
 'decorrelation': 3,
 'FINAL': 3,
 'NEM': 3,
 'transcriptase': 3,
 '2B': 3,
 'processmicrostructure': 3,
 'pip3signaling': 3,
 'sectoral': 3,
 'nozzle': 3,
 'vestibular': 3,
 'EPMAWDX': 3,
 'light42': 3,
 'dijkstras': 3,
 'T3': 3,
 'spontenaous': 3,
 'URL': 3,
 'multitemporal': 3,
 'principles3': 3,
 'hamming': 3,
 'httpvectorcom': 3,
 'orthopaedic': 3,
 'subunit': 3,
 'materialsapplications': 3,
 'firouzeh

In [28]:
# Test sample of courses (just the first 3)
sampleCourses = [{'courseId': 'MSC-101',
  'description': "Here comes the sun, dudududu, here comes the sun and I say...",
  'name': 'The Beatles'},
                 {'courseId': 'MSC-102',
  'description': "In an octupus's garden, in the sea. He'd let us in...",
  'name': 'The Beatles Too'},
                 {'courseId': 'MSC-103',
    'description': "Born, to be wiiiiild dudududu",
    'name': 'Steppenwolf'}]
                 

In [11]:
sampleGlobalBagOfWords = {}
sampleGlobalBagOfWords = getGlobalBagOfWords(sampleCourses)
print(sampleGlobalBagOfWords)
print(sum(sampleGlobalBagOfWords.values()))

{'octupuss': 1, 'I': 1, 'sea': 1, 'hed': 1, 'sun': 2, 'dudududu': 1, 'garden': 1}
8


In [12]:
sampleBagOfWords = {}
sampleGlobalBagOfWords = {}
getBagOfWords(sampleCourses[1]['description'])
for course in sampleCourses:
    sampleBow = getBagOfWords(course['description'])
    sampleBagOfWords[course['courseId']] = sampleBow
    mergeBow(sampleGlobalBagOfWords,sampleBow)
#test_course = sampleCourses.copy()
#print(sampleBagOfWords)
print(sampleGlobalBagOfWords)
#print(sum(sampleBagOfWords.values()))
print(sum(sampleGlobalBagOfWords.values()))

{'octupuss': 1, 'I': 1, 'sea': 1, 'hed': 1, 'sun': 2, 'dudududu': 1, 'garden': 1}
8


In [13]:
#courses
#globalBagOfWord

In [14]:
# Term Document Matrix
# We want a matrix where each row i is a word (among global bag of words) 
# and each column j is a document (among all courses)
# tdm[i][j] = nb of occurences of term i in doc j

def getTermDocMatrix(courses):
    
    # get global bag of word (for all courses combined) 
    globalBagOfWord = getGlobalBagOfWords(courses)
    
    # total number of terms
    M = len(globalBagOfWord)

    # total number of documents
    N = len(courses)

    termDocMatrix = np.zeros((M,N), dtype=np.int)

    # Column index
    docIndx = 0

    for doc in courses: 
        bow = getBagOfWords(doc['description'])
        # Row index
        termIndx = 0
        for word in globalBagOfWord.keys():
            termDocMatrix[termIndx][docIndx] += bow.get(word, 0)
            termIndx += 1
        docIndx += 1
    return termDocMatrix

In [15]:
print(getTermDocMatrix(sampleCourses))
print(getTermFrequency(sampleCourses))
print(getImportance(sampleCourses))
print(getInverseDocFrequency(sampleCourses))

[[0 1]
 [1 0]
 [0 1]
 [0 1]
 [2 0]
 [1 0]
 [0 1]]


NameError: name 'getTermFrequency' is not defined

In [92]:
# Compute fij (frequency of term i in doc j), à transformer en fonction?

def getTermFrequency():
    # get global bag of word (for all courses combined) 
    globalBagOfWord, bagOfWords = getGlobalBagOfWords()

    totalWords = len(globalBagOfWord)
    totalCourses = len(courses)

    f = np.zeros((totalWords,totalCourses),dtype=np.double)

    docIndx = 0
    for bow in bagOfWords.values():
        wordIndx = 0
        for word in bow:
            f[wordIndx][docIndx] = bow.get(word,0) / len(bow) 
            wordIndx += 1
        docIndx += 1
    return f
getTermFrequency()

array([[ 7.        ,  4.        ,  0.52941176, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.        ,  1.66666667,  0.88235294, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.33333333,  0.94117647, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [17]:
# Compute TFij "importance of word i in doc j"
def getImportance(courses):
    # get global bag of word (for all courses combined) 
    globalBagOfWord = getGlobalBagOfWords(courses)

    totalWords = len(globalBagOfWord)
    totalCourses = len(courses)
    
    TF = np.zeros((totalWords,totalCourses),dtype=np.double)
    maxWordOfDoc = [""]*totalCourses
    
    # Compute term frequency
    f = np.zeros((totalWords,totalCourses),dtype=np.double)
    f = getTermFrequency(courses)
    
    # Find the number of occurences of the most used words in every document
    docIndx = 0
    for doc in courses:
        bow = getBagOfWords(doc['description'])
        maxWordOfDoc[docIndx] = list(bow.values())[np.argmax(f[docIndx])]
        docIndx += 1

    # Compute TF
    # Note: on a pas besoin d'itérer sur toutes les lignes, juste les colonnes mais j'arrive pas à utiliser 
    # la fonction np.apply_along_axis en passant une fonction qui doit connaitre l'indice de la ligne
    for i in range(totalWords):
        for j in range(totalCourses):
            TF[i][j] = f[i][j] / maxWordOfDoc[j]
    return TF

In [18]:
# Inverse Document Frequency : IDF
# Compute n[i] = nb of documents where word i occurs at least once
def getInverseDocFrequency(courses):
    # get global bag of word (for all courses combined) 
    globalBagOfWord = getGlobalBagOfWords(courses)
    totalWords = len(globalBagOfWord)
    totalCourses = len(courses)

    # Compute term frequency
    f = np.zeros((totalWords,totalCourses),dtype=np.double)
    f = getTermFrequency(courses)
    
    n = np.zeros((totalWords),dtype=np.int)
    IDF = np.zeros((totalWords),dtype=np.double)

    for i in range(totalWords):
        for j in range(totalCourses):
            if(f[i][j] != 0):
                n[i] += 1
        if(n[i] == 0):
            IDF[i] = 0
        else:
            IDF[i] = -math.log2(n[i]/totalCourses)
    return IDF

In [19]:
# Compute TF-IDF score

# get global bag of word (for all courses combined) 
globalBagOfWord = getGlobalBagOfWords(sampleCourses)

totalWords = len(globalBagOfWord)
totalCourses = len(sampleCourses)

# Get term frequency
TF = np.zeros((totalWords,totalCourses),dtype=np.double)
TF = getImportance(sampleCourses)

# Get inverse document frequency
IDF = np.zeros((totalWords),dtype=np.double)
IDF = getInverseDocFrequency(sampleCourses)
    
TFIDF = np.zeros((totalWords,totalCourses),dtype=np.double)

# Compute TF-IDF
for i in range(totalWords):
    for j in range(totalCourses):
        TFIDF[i][j] = TF[i][j] * IDF[i]
print(TFIDF)

[[-0.   -0.  ]
 [-0.   -0.  ]
 [-0.   -0.  ]
 [ 0.    0.25]
 [ 0.    0.  ]
 [ 0.    0.  ]
 [ 0.    0.  ]]


From the slides:

N = total number of documents

f[i][j] nb of occurrences of word 𝑖 in doc 𝑗, so bagOfWord(j)[i]

tf[i][j] = f[i][j] / max_k f[k][j]

idf[i] = -log_2(number of documents where word i occurs at least once / N)

tfidf[i][j] = tf[i][j] * idf[i]

In [26]:
def sim(doc1, doc2):
    return np.dot(np.transpose(doc1), doc2) / (np.linalg.norm(doc1) * np.linalg.norm(doc2))

In [31]:
print(sampleCourses)
tdc = getTermDocMatrix(sampleCourses)
print(tdc)
sim(tdc[:,0], tdc[:,2])

[{'name': 'The Beatles', 'description': 'Here comes the sun, dudududu, here comes the sun and I say...', 'courseId': 'MSC-101'}, {'name': 'The Beatles Too', 'description': "In an octupus's garden, in the sea. He'd let us in...", 'courseId': 'MSC-102'}, {'name': 'Steppenwolf', 'description': 'Born, to be wiiiiild dudududu', 'courseId': 'MSC-103'}]
[[0 1 0]
 [1 0 0]
 [0 1 0]
 [0 0 1]
 [0 1 0]
 [2 0 0]
 [1 0 1]
 [0 0 1]
 [0 1 0]]


0.23570226039551587

## Exercise 4.2: Term-document matrix

In [229]:
a = np.array(((1,2),(3,4),(5,6),(7,8),(9,10)))
print(a)
b = np.tile(np.array((10,100,1000,10000,100000)),(2,1)).T
print(b)
a*b

[[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]]
[[    10     10]
 [   100    100]
 [  1000   1000]
 [ 10000  10000]
 [100000 100000]]


array([[     10,      20],
       [    300,     400],
       [   5000,    6000],
       [  70000,   80000],
       [ 900000, 1000000]])

In [7]:
globalBagOfWords, bagOfWords = getGlobalBagOfWords()

In [42]:
numTerms = len(globalBagOfWords.keys())
numCourses = len(bagOfWords.keys())
termsOrder = list(enumerate(globalBagOfWords.keys()))
coursesOrder = list(enumerate(bagOfWords.keys()))
idx2Term = {i[0]: i[1] for i in termsOrder}
term2Idx = {v: k for k,v in idx2Term.items()}
idx2Course = {i[0]: i[1] for i in coursesOrder}
course2Idx = {v: k for k,v in idx2Course.items()}
np.save('idx2Term', idx2Term)
np.save('term2Idx', term2Idx)
np.save('idx2Course', idx2Course)
np.save('course2Idx', course2Idx)

In [38]:
tf = np.zeros((numTerms,numCourses))
overallFreq = np.zeros(numTerms)
for courseIdx, course in coursesOrder:
    if(len(bagOfWords[course]) == 0):
        continue
    docMax = max(bagOfWords[course].values())
    for termIdx, term in termsOrder:
        if(term not in bagOfWords[course]):
            continue
        # We use the double normalization 0.5 for the tf
        tf[termIdx][courseIdx] = 0.5+0.5*bagOfWords[course][term]/docMax
        overallFreq[termIdx] += 1
overallFreq = np.log(numCourses/overallFreq)

In [39]:
tf_idf = tf*np.tile(overallFreq,(numCourses,1)).T
np.save('X',tf_idf)
for term in map(lambda x: idx2Term[x],tf_idf[:,course2Idx[name2id['Internet analytics']]].argsort()[-15:][::-1]):
    print(term)

hadoop
recommender
ecommerce
auction
ad
realworld
mapreduce
advertisement
selfcontained
mining
service
seek
foundational
spark
networking


## Exercise 4.3: Document similarity search

In [11]:
def docSimilarity(di,dj):
    return np.dot(di,dj)/(np.sqrt(np.dot(di,di))*np.sqrt(np.dot(dj,dj)))

In [37]:
top = (tf_idf[term2Idx['facebook']]+tf_idf[term2Idx['markov']]+tf_idf[word2Idx['chain']]).argsort()[-5:][::-1]
topCol = list(map(lambda x: tf_idf[:,x],top))
topCourses = list(map(lambda x: id2name[idx2Course[x]],top))
cmp = np.zeros((5,5))
for i in range(5):
    for j in range(5):
        cmp[i][j] = docSimilarity(topCol[i],topCol[j])
print(topCourses)
print(cmp)

data = dict(
    courses=topCourses,
    course0=cmp[0],
    course1=cmp[1],
    course2=cmp[2],
    course3=cmp[3],
    course4=cmp[4]
)
source = ColumnDataSource(data)
columns = [TableColumn(field='courses', title='Similarity')]
columns = columns + list(map(lambda x: TableColumn(field='course'+str(x[0]), title=x[1]),enumerate(topCourses)))
data_table = DataTable(source=source,columns=columns)
show(widgetbox(data_table))

['Markov chains and algorithmic applications', 'Applied probability & stochastic processes', 'Applied stochastic processes', 'Internet analytics', 'Optimization and simulation']
[[ 1.          0.20735236  0.1435479   0.13775226  0.13900297]
 [ 0.20735236  1.          0.13376437  0.07090623  0.15045302]
 [ 0.1435479   0.13376437  1.          0.05314457  0.10490284]
 [ 0.13775226  0.07090623  0.05314457  1.          0.09714256]
 [ 0.13900297  0.15045302  0.10490284  0.09714256  1.        ]]
