# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** J

**Names:**

* Rafael Bischof
* Jeniffer Lima Graf
* Alexander Sanchez

---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markodown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of codes in the notebook, you can also embed long python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [2]:
import pickle
import numpy as np
from scipy.sparse import csr_matrix
from utils import load_json, load_pkl
from string import punctuation
from collections import Counter
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import scipy as sc

ps = PorterStemmer()
lz = WordNetLemmatizer() 

courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

In [3]:
stopwords |= {'cathedra', 'ex', 'course', 'exam', 'project', 'homework', 'student', 'professor', 'school', 'learn', 'learning', 'final', 'midterm', 'assessment', 'semester', 'prerequisite'}

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/rbischof/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
sc.__version__

'0.19.1'

## Exercise 4.1: Pre-processing

In [6]:
def createNgrams(l, N):
    grams = []
    for i in range(len(l)-1):
        cword = l[i]
        for n in range(N-1):
            if i+n+1 < len(l):
                cword += " " + l[i+n+1]
                grams.append(cword)
    return l + grams

In [7]:
cs = {}
wordcount = {}
for course in courses:
    l = [lz.lemmatize(w) for w in re.sub(r'[^\w\s]|[0-9]', '', course['description']).lower().split() if lz.lemmatize(w) not in stopwords]
    
    # add 2grams (more makes little sense)
    l = createNgrams(l, 2)
    
    # keep track of number of occurrences of words
    for w in l:
        if w in wordcount:
            wordcount[w] += 1
        else:
            wordcount.update({w:1})
    
    cs.update({course['courseId']: l})

In [8]:
filterwords = set()
for w in wordcount:
    if (wordcount[w] > 2 and wordcount[w] < 1000):
        filterwords.add(w)

In [9]:
# thow away most frequent and least frequent words
for course in cs:
    cs[course] = [w for w in cs[course] if w in filterwords]

## Exercise 4.2: Term-document matrix

In [10]:
courseslist = list(cs.keys())

In [11]:
coursesIdx = {k: v for v, k in enumerate(courseslist)}
idxCourses = {v: k for v, k in enumerate(courseslist)}

In [12]:
wordslist = list(filterwords)

In [13]:
wordsIdx = {k: v for v, k in enumerate(wordslist)}
idxWords = {v: k for v, k in enumerate(wordslist)}

In [14]:
# calculate sparse TF matrix
TFs = []
rows = []
cols = []
for c in range(len(courseslist)):
    occs = Counter(cs[courseslist[c]])
    templist = []
    mx = 0
    for w in occs:
        mx = max(mx, occs[w])
        templist.append(occs[w])
        rows.append(wordsIdx[w])
        cols.append(c)
    TFs += [x / mx for x in templist]

In [15]:
IDF = np.zeros((len(wordslist)))

In [16]:
# calculate IDF array
prelog = np.log2(len(courseslist))
for w in range(len(wordslist)):
    occ = 0
    for c in range(len(courseslist)):
        if wordslist[w] in cs[courseslist[c]]:
            occ += 1
    IDF[w] = - np.log2(occ) + prelog

In [17]:
# calculate sparse TFIDF matrix
TFIDFs = [TFs[i] * IDF[rows[i]] for i in range(len(TFs))]

In [18]:
def topicsForCourseSparse(index, n=15, M=TFIDFs, r=rows, c=cols):
    top = []
    toprows = []
    for i in range(len(M)):
        if index == c[i]:
            top.append(M[i])
            toprows.append(r[i])
    indexes = np.argsort(top)[:-n-1:-1]
    topics = []
    for i in indexes:
        topics.append((top[i], wordslist[toprows[i]]))
    return topics

In [19]:
# get 15 highest ranked words for course IX
topicsForCourseSparse(coursesIdx['COM-308'])

[(4.8690461298102452, 'social networking'),
 (4.1089477813024056, 'online'),
 (4.1020865059328893, 'realworld'),
 (3.8406743688962703, 'social'),
 (3.7080820823665643, 'data mining'),
 (3.5638498912278198, 'explore'),
 (3.3690461298102452, 'mining'),
 (3.2080820823665643, 'networking'),
 (2.9126974198734965, 'hadoop'),
 (2.9126974198734965, 'community detection'),
 (2.8690461298102452, 'largescale'),
 (2.7177099196331111, 'recommender system'),
 (2.7177099196331111, 'recommender'),
 (2.7177099196331111, 'ecommerce'),
 (2.6072651517817387, 'service')]

The highest scores are obtained by terms that appear very prominently in the given course, but not so much in the rest of the corpus. These words therefore give us the most information about the course to distinguish it from others.

## Exercise 4.3: Document similarity search

In [34]:
def topCoursesForWordSparse(word, n=15, M=TFIDFs, r=rows, c=cols):
    top = []
    topcols = []
    for i in range(len(M)):
        if wordsIdx[word] == r[i]:
            top.append(M[i])
            topcols.append(c[i])
    indexes = np.argsort(top)[:-n-1:-1]
    topcs = []
    for i in indexes:
        topcs.append(courses[topcols[i]]['name'])
    return topcs

In [35]:
# We didn't manage to calculate the similarity between courses without using this library
tf_idf = sc.sparse.csc_matrix((TFIDFs,(rows,cols)))

In [36]:
def simSparse(i, j, M=TFIDFs, r=rows, c=cols):
    di = tf_idf.getcol(i).todense().reshape(-1)
    dj = tf_idf.getcol(j).todense()
    return np.dot(di, dj) / ( np.linalg.norm(di) * np.linalg.norm(dj) )

In [37]:
def similaritiesCourses(courses):
    indexes = []
    for c in courses:
        indexes.append(coursesIdx[c])
        
    m = np.zeros((5,5))
    for i1 in range(len(indexes)):
        for i2 in range(len(indexes)):
            m[i1, i2] = simSparse(indexes[i1], indexes[i2])
    return m

In [38]:
print('Top 5 courses for \'markov chain\'', topCoursesForWordSparse('markov chain', 5))
print('Similarity between top courses')
print(similaritiesCourses(topCoursesForWordSparse('markov chain', 5)))

Top 5 courses for 'markov chain' ['Markov chains and algorithmic applications', 'Applied probability & stochastic processes', 'Applied stochastic processes', 'Stochastic calculus I', 'Optimization and simulation']
Similarity between top courses


KeyError: 'Markov chains and algorithmic applications'

In [39]:
print('Top course for \'facebook\'', topCoursesForWordSparse('facebook', 5))
print('It is actually the only course that contains the word facebook.')

Top course for 'facebook' ['Computational Social Media']
It is actually the only course that contains the word facebook.


In [40]:
with open('data/preprocessedcourses.pickle', 'wb') as handle:
    pickle.dump(cs, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('data/idxWords.pickle', 'wb') as handle:
    pickle.dump(idxWords, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('data/idxCourses.pickle', 'wb') as handle:
    pickle.dump(idxCourses, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('data/wordsIdx.pickle', 'wb') as handle:
    pickle.dump(wordsIdx, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('data/coursesIdx.pickle', 'wb') as handle:
    pickle.dump(coursesIdx, handle, protocol=pickle.HIGHEST_PROTOCOL)
sc.sparse.save_npz("data/TFIDF.npz", tf_idf)