# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** J

**Names:**

* Rafael Bischof
* Jeniffer Lima Graf
* Alexander Sanchez

---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of code in the notebook, you can also embed long Python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [1]:
import pickle
import numpy as np
from scipy.sparse import csr_matrix
from utils import load_json, load_pkl
from string import punctuation
from collections import Counter
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
# load the course records and the stopword collection from the data/ directory
courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

# text normalisers: a Porter stemmer and a WordNet lemmatizer
ps = PorterStemmer()
lz = WordNetLemmatizer()

In [2]:
# additional words that should be treated as stopwords in this corpus
stopwords |= {
    'cathedra', 'ex', 'course', 'exam',
    'project', 'homework', 'student', 'professor',
    'school', 'learn', 'learning', 'final',
    'midterm', 'assessment', 'semester', 'prerequisite',
}

## Exercise 4.1: Pre-processing

In [3]:
def createNgrams(l, N):
    """Return the tokens of `l` followed by all of its 2- to N-grams.

    For every start position (except the last token) the consecutive tokens
    are joined with single spaces, growing the phrase one token at a time up
    to N tokens or the end of the list. The input list is not modified; a new
    list is returned.
    """
    total = len(l)
    extra = []
    for start in range(total - 1):
        phrase = l[start]
        for nxt in range(start + 1, start + N):
            if nxt >= total:
                # ran past the end of the token list: no longer n-gram exists
                break
            phrase = phrase + " " + l[nxt]
            extra.append(phrase)
    return l + extra

In [4]:
cs = {}         # courseId -> list of preprocessed tokens (unigrams + bigrams)
wordcount = {}  # token -> total number of occurrences over all courses
for course in courses:
    # Delete punctuation and digits (they are removed, not replaced by a
    # space, so e.g. "real-world" becomes "realworld"), then lowercase.
    text = re.sub(r'[^\w\s]|[0-9]', '', course['description']).lower()

    # lemmatize every token exactly once, then drop the stopwords
    lemmas = (lz.lemmatize(w) for w in text.split())
    l = [lemma for lemma in lemmas if lemma not in stopwords]

    # add 2grams (more makes little sense)
    l = createNgrams(l, 2)

    # keep track of number of occurrences of words
    for w in l:
        wordcount[w] = wordcount.get(w, 0) + 1

    cs[course['courseId']] = l

In [5]:
# vocabulary: tokens occurring more than 10 and fewer than 1000 times overall
filterwords = {w for w, count in wordcount.items() if 10 < count < 1000}

In [6]:
# throw away the most frequent and the least frequent words
for course, tokens in cs.items():
    cs[course] = [w for w in tokens if 10 < wordcount[w] < 1000]

## Exercise 4.2: Term-document matrix

In [7]:
# course ids in a fixed order; the position is the column index of the matrices
courseslist = list(cs)

In [39]:
# course id -> position in courseslist, and the inverse mapping
coursesIdx = {course: idx for idx, course in enumerate(courseslist)}
idxCourses = dict(enumerate(courseslist))

In [11]:
# Vocabulary in a fixed order; the position is the row index of the matrices.
# Sorted because iterating a set of strings yields a different order from one
# interpreter run to the next, which would make the word indices (and the
# pickled idxWords mapping) irreproducible.
wordslist = sorted(filterwords)

In [40]:
# word -> position in wordslist, and the inverse mapping
wordsIdx = {word: idx for idx, word in enumerate(wordslist)}
idxWords = dict(enumerate(wordslist))

In [13]:
# term-frequency matrix: one row per word, one column per course
TF = np.zeros(shape=(len(wordslist), len(courseslist)))

In [18]:
# calculate TF matrix: TF[w, c] is the number of occurrences of word w in
# course c, divided by the count of the most frequent word of that course
row_of_word = {word: row for row, word in enumerate(wordslist)}
TF.fill(0)  # start from a clean matrix so re-running the cell gives the same result
for c, course in enumerate(courseslist):
    # only visit the words that actually occur in the course instead of
    # scanning the whole vocabulary for every course
    for word, count in Counter(cs[course]).items():
        row = row_of_word.get(word)
        if row is not None:  # ignore tokens that are not part of the vocabulary
            TF[row, c] = count

# A course whose tokens were all filtered out has an all-zero column; dividing
# it by its maximum (0) would turn the column into NaN, so divide by 1 instead.
col_max = np.max(TF, axis=0)
col_max[col_max == 0] = 1
TF /= col_max

In [19]:
# inverse-document-frequency vector: one entry per word
IDF = np.zeros(len(wordslist))

In [20]:
# calculate IDF array: IDF[w] = log2(#courses) - log2(#courses containing w)
# Document frequencies are gathered in a single pass over the corpus; each
# word is counted at most once per course by going through a set.
docfreq = Counter()
for course in courseslist:
    docfreq.update(set(cs[course]))

prelog = np.log2(len(courseslist))
for w, word in enumerate(wordslist):
    IDF[w] = - np.log2(docfreq[word]) + prelog

In [21]:
# calculate TFIDF matrix: scale every row (word) of TF by that word's IDF
TFIDF = TF * IDF[:, np.newaxis]

In [22]:
def topicsForCourse(index, n=15):
    """Return the `n` words with the highest TF-IDF score for one course.

    `index` is the column of the course in TFIDF; the words are returned in
    decreasing order of score.
    """
    scores = TFIDF[:, index]
    ascending = np.argsort(scores)
    # walk the ascending ranking backwards and keep the last n entries
    best_first = ascending[:-n-1:-1]
    return [wordslist[row] for row in best_first]

In [26]:
# the 15 highest-ranked words for the course COM-308
topicsForCourse(coursesIdx['COM-308'], n=15)

['online',
 'realworld',
 'social',
 'explore',
 'mining',
 'networking',
 'largescale',
 'service',
 'internet',
 'stream',
 'data',
 'ad',
 'clustering',
 'analytics',
 'community']

## Exercise 4.3: Document similarity search

In [27]:
def topCoursesForWord(word, n):
    """Return the ids of the `n` courses with the highest TF-IDF score for `word`.

    `word` must be a key of wordsIdx, i.e. a preprocessed term (unigram or
    bigram) that is part of the vocabulary. The courses are returned in
    decreasing order of score.

    Raises:
        KeyError: if `word` is not in the vocabulary.
    """
    try:
        row = wordsIdx[word]
    except KeyError:
        # same exception type as before, but say what went wrong
        raise KeyError(
            "{!r} is not in the vocabulary (unknown term, or removed "
            "during preprocessing)".format(word)
        ) from None
    best_first = np.argsort(TFIDF[row])[:-n-1:-1]
    return [courseslist[col] for col in best_first]

In [28]:
def sim(i, j):
    """Return the cosine similarity of the TF-IDF vectors of courses `i` and `j`.

    `i` and `j` are column indices into TFIDF. If either column has zero norm
    the similarity is undefined (0/0); 0.0 is returned in that case instead
    of NaN.
    """
    di = TFIDF[:, i]
    dj = TFIDF[:, j]
    denom = np.linalg.norm(di) * np.linalg.norm(dj)
    if denom == 0:
        return 0.0
    return np.dot(di, dj) / denom

In [31]:
def similaritiesCourses(courses):
    """Return the pairwise similarity matrix of the given courses.

    `courses` is a sequence of course ids (keys of coursesIdx). The result is
    a len(courses) x len(courses) array whose entry [a, b] is
    sim(courses[a], courses[b]) rounded to 3 decimals.

    Raises:
        KeyError: if a course id is not in coursesIdx.
    """
    indexes = [coursesIdx[c] for c in courses]

    # size the matrix from the input instead of assuming exactly 5 courses
    size = len(indexes)
    m = np.zeros((size, size))
    for row, ci in enumerate(indexes):
        for col, cj in enumerate(indexes):
            m[row, col] = round(sim(ci, cj), 3)
    return m

In [32]:
# rank the courses for the bigram 'markov chain' once and reuse the ranking
markov_top = topCoursesForWord('markov chain', 5)
print('Top 5 courses for \'markov chain\'', markov_top)
print('Similarity between top courses')
print(similaritiesCourses(markov_top))

Top 5 courses for 'markov chain' ['COM-516', 'MGT-484', 'MATH-332', 'FIN-408', 'COM-308']
Similarity between top courses
[[1.    0.444 0.469 0.254 0.131]
 [0.444 1.    0.528 0.221 0.116]
 [0.469 0.528 1.    0.263 0.119]
 [0.254 0.221 0.263 1.    0.05 ]
 [0.131 0.116 0.119 0.05  1.   ]]


In [33]:
# rank the courses for 'facebook' once and reuse the ranking
# NOTE(review): the recorded run of this cell ended in KeyError: 'facebook',
# presumably because the term is not in the filtered vocabulary - confirm
facebook_top = topCoursesForWord('facebook', 5)
print('Top 5 courses for \'facebook\'', facebook_top)
print('Similarity between top courses')
print(similaritiesCourses(facebook_top))

KeyError: 'facebook'

In [41]:
import pickle

# persist the preprocessed corpus and the two index -> label mappings
for filename, obj in (
    ('preprocessedcourses.pickle', cs),
    ('idxWords.pickle', idxWords),
    ('idxCourses.pickle', idxCourses),
):
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)