In [2]:
#Importing the dependencies
import re, string, unicodedata, itertools, random, os, glob, math, time, json
import pickle
import nltk
from nltk import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.spatial import distance_matrix
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

from __future__ import print_function
print(__doc__)

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib.cm as cm
plt.rcParams['figure.figsize'] = (24, 9)
plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to /home/rgomes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Automatically created module for IPython interactive environment


In [4]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[0-9]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stemming(words):
    """Reduce words to radical"""
    stemmer = SnowballStemmer('english')
    new_words = []
    for word in words:
        new_word = stemmer.stem(word)
        new_words.append(new_word)
    return new_words

def trimAndClear(words):    
    """Remove words with only 2 letters, whitespace and empty"""
    new_words = map(lambda w : w.strip(), words)
    new_words = list(filter(lambda w: w != '' and len(w) > 2, new_words))
    return new_words

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stemming(words)
    words = trimAndClear(words)
    return words

In [47]:
filename = './Process+Mining+Abstracts.txt'

def loadCorpus():
    voc = set()
    docs = []
       
    with open(filename, 'rb') as f:
        print('Loading', filename)

        for line in f:
            # readline
            line = line.decode("utf-8", "ignore")
            
            # read the email
            tokens = line.split(' ')
            
            words = normalize(tokens)
            if len(words) > 1:
                voc.update(words)
                text = ' '.join(words)
                docs.append(text)

    # save as pickle
    print('Saving', len(voc), 'unique words')
    tokens_file = open('./representations/tokens_vector_abstracts.pkl', 'wb')
    pickle.dump(list(voc), tokens_file)
    print('saved...')
    
    print('Saving', len(docs), 'documents')
    d_file = open('./representations/documents_abstracts.pkl', 'wb')
    pickle.dump(list(docs), d_file)
    print('saved...')
    
loadCorpus()

Loading ./20_news/Process+Mining+Abstracts.txt
Saving 5746 unique words
saved...
Saving 1387 documents
saved...


In [51]:
data_f = open('./representations/tokens_vector_abstracts.pkl', 'rb')
voc = pickle.load(data_f)
print(len(voc), 'tokens loaded')

data_f = open('./representations/documents_abstracts.pkl', 'rb')
docs = pickle.load(data_f)
print(len(docs), 'tokens loaded')

5746 tokens loaded
1387 tokens loaded


In [52]:
def tf(term, document):
    return freq(term, document)

def freq(term, document):
    return document.split().count(term)

In [53]:
tf_matrix = []

print('calculating tf matrix...')
for doc in docs:
    
    #print('The doc is "' + doc + '"')
    tf_vector = [tf(word, doc) for word in voc]
    #tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector)
    #print('The tf vector for Document %d is [%s]' % ((docs.index(doc)+1), tf_vector_string))
    tf_matrix.append(tf_vector)
    
print(np.shape(tf_matrix))
print(tf_matrix[0])

calculating tf matrix...
(1387, 5746)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [54]:
print('saving tf matrix...')
np.savetxt('./representations/tf_matrix_abstracts.txt', tf_matrix)
print('saved')

saving tf matrix...
saved


In [56]:
import math

def l2_normalizer(vec):
    denom = np.sum([el**2 for el in vec])
    return [(el / math.sqrt(denom)) for el in vec]

doc_term_matrix_l2 = []
for vec in tf_matrix:
    doc_term_matrix_l2.append(l2_normalizer(vec))

In [58]:
print(doc_term_matrix_l2[0])
print('saving tf matrix normalized...')
np.savetxt('./representations/tf_matrix_normalized_abstracts.txt', doc_term_matrix_l2)
print('saved')

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.043478260869565216, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

saved


In [60]:
def numDocsContaining(word, doclist):
    doccount = 0
    for doc in doclist:
        if freq(word, doc) > 0:
            doccount +=1
    return doccount 

def idf(word, doclist):
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log(n_samples / 1+df)

my_idf_vector = [idf(word, docs) for word in voc]
#print(my_idf_vector)

In [61]:
def build_idf_matrix(idf_vector):
    idf_mat = np.zeros((len(idf_vector), len(idf_vector)))
    np.fill_diagonal(idf_mat, idf_vector)
    return idf_mat

my_idf_matrix = build_idf_matrix(my_idf_vector)
#print(my_idf_matrix)

In [63]:
doc_term_matrix_tfidf = []

#performing tf-idf matrix multiplication
for tf_vector in tf_matrix:
    doc_term_matrix_tfidf.append(np.dot(tf_vector, my_idf_matrix))

#normalizing
doc_term_matrix_tfidf_l2 = []
for tf_vector in doc_term_matrix_tfidf:
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))
                                    
print(np.shape(np.matrix(doc_term_matrix_tfidf_l2)))
print('saving tf matrix normalized...')
np.savetxt('./representations/tfidf_matrix_abstracts.txt', doc_term_matrix_l2)
print('saved')

(1387, 5746)
saving tf matrix normalized...
saved
