In [1]:
#Importing the dependencies
import re, string, unicodedata, itertools, random, os, glob, math, time, json
import pickle
import nltk
from nltk import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.spatial import distance_matrix
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

from __future__ import print_function
print(__doc__)

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib.cm as cm
plt.rcParams['figure.figsize'] = (24, 9)
plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to /home/rgomes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Automatically created module for IPython interactive environment


# Pré processing the corpora
The first step that we used was to remove special characters
such as ”#”, ”@”, and ”/” and irrelevant information such
as email addresses and numbers.

Then, we used
tokenization the text to read words and check whether these
were relevant or irrelevant term. We ignored the subject header,
email addresses, numbers, and punctuation
The next step that we applied was the removal of stop
words.

The third step was a word stemmer.

In [2]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    new_words = []
    for word in words:
        new_word = re.sub(r'[0-9]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stemming(words):
    """Reduce words to radical"""
    stemmer = SnowballStemmer('english')
    new_words = []
    for word in words:
        new_word = stemmer.stem(word)
        new_words.append(new_word)
    return new_words

def trimAndClear(words):    
    """Remove words with only 2 letters, whitespace and empty"""
    new_words = map(lambda w : w.strip(), words)
    new_words = list(filter(lambda w: w != '' and len(w) > 2, new_words))
    return new_words

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stemming(words)
    words = trimAndClear(words)
    return words

In [3]:
def loadCorpus(files):
    voc = set()
    docs = []
    # build the vocabulary
    for filename in files:
       
        with open(filename, 'rb') as f:
            print('Loading', filename)
            index = 0
            tmp = []
            for line in f:
                # readline
                line = line.decode("utf-8", "ignore")

                if re.match('^(Newsgroup:)', line):
                    # email found

                    # save the full email readble
                    email = ' '.join(list(itertools.chain.from_iterable(tmp)))
                    if email != ' ':
                        docs.append(email)

                    index = 0
                    tmp = []

                index += 1

                # achei o header
                # pula 3 linhas
                if index > 4:
                    # read the email
                    tokens = line.split(' ')
                    words = normalize(tokens)

                    voc.update(words)
                    tmp.append(words)       

    # save as pickle
    print('Saving', len(voc), 'unique words')
    tokens_file = open('./representations/tokens_vector.pkl', 'wb')
    pickle.dump(list(voc), tokens_file)
    print('saved...')
    
    print('Saving', len(docs), 'documents')
    d_file = open('./representations/documents.pkl', 'wb')
    pickle.dump(list(docs), d_file)
    print('saved...')


In [4]:
def tf(term, document):
    return freq(term, document)

def freq(term, document):
    return document.split().count(term)

# Load the files

In [5]:
print('Welcome to Preprocessing')
files = [
    '20_news/sci.space.txt',
    '20_news/talk.religion.misc.txt',
    '20_news/talk.politics.misc.txt',
    '20_news/rec.sport.baseball.txt'
]
    
t0 = time.time()
loadCorpus(files)
print('Time to preprocess all files:', (time.time()-t0))

Welcome to Preprocessing
Loading 20_news/sci.space.txt
Loading 20_news/talk.religion.misc.txt
Loading 20_news/talk.politics.misc.txt
Loading 20_news/rec.sport.baseball.txt
Saving 34666 unique words
saved...
Saving 6768 documents
saved...
Time to preprocess all files: 312.31193804740906


In [None]:
data_f = open('./representations/tokens_vector.pkl', 'rb')
voc = pickle.load(data_f)
print(len(voc), 'tokens loaded')

data_f = open('./representations/documents.pkl', 'rb')
docs = pickle.load(data_f)
print(len(docs), 'tokens loaded')

34666 tokens loaded
6768 tokens loaded


In [None]:
tf_matrix = []

print('calculating tf matrix...')
for doc in docs:
    
    #print('The doc is "' + doc + '"')
    tf_vector = [tf(word, doc) for word in voc]
    #tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector)
    #print('The tf vector for Document %d is [%s]' % ((docs.index(doc)+1), tf_vector_string))
    tf_matrix.append(tf_vector)
    
print(np.shape(tf_matrix))

calculating tf matrix...


In [None]:
print('saving tf matrix...')
np.savetxt('./representations/tf_matrix.txt', tf_matrix)
print('saved')

In [None]:
import math

def l2_normalizer(vec):
    denom = np.sum([el**2 for el in vec])
    if (denom != 0):
        return [(el / math.sqrt(denom)) for el in vec]
    else: return [(el for el in vec)]

doc_term_matrix_l2 = []
for vec in tf_matrix:
    doc_term_matrix_l2.append(l2_normalizer(vec))

In [123]:
print('saving tf matrix normalized...')
np.savetxt('./representations/tf_matrix_normalized.txt', doc_term_matrix_l2)
print('saved')

[<generator object l2_normalizer.<locals>.<genexpr> at 0x7f767faef780>]
saving tf matrix normalized...


TypeError: Mismatch between array dtype ('object') and format specifier ('%.18e')

In [None]:
def numDocsContaining(word, doclist):
    doccount = 0
    for doc in doclist:
        if freq(word, doc) > 0:
            doccount +=1
    return doccount 

def idf(word, doclist):
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log(n_samples / 1+df)

my_idf_vector = [idf(word, t) for word in voc]
#print(my_idf_vector)

In [None]:
def build_idf_matrix(idf_vector):
    idf_mat = np.zeros((len(idf_vector), len(idf_vector)))
    np.fill_diagonal(idf_mat, idf_vector)
    return idf_mat

my_idf_matrix = build_idf_matrix(my_idf_vector)
#print(my_idf_matrix)

In [None]:
doc_term_matrix_tfidf = []

#performing tf-idf matrix multiplication
for tf_vector in tf_matrix:
    doc_term_matrix_tfidf.append(np.dot(tf_vector, my_idf_matrix))

#normalizing
doc_term_matrix_tfidf_l2 = []
for tf_vector in doc_term_matrix_tfidf:
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))
                                    
print(np.shape(np.matrix(doc_term_matrix_tfidf_l2)))

In [None]:
# --------------- TF PARAMETERS ---------- #

# Create a subplot with 1 row and 2 columns
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

h_tf = sorted(tf.items(), key=lambda x: x[1], reverse=True)
ax1.plot( list(map(lambda x: x[1], h_tf)) )
ax1.set_title('Tf Scores')
 
ax2.plot( list(filter( lambda y: y >= 0.001 ,map(lambda x: x[1], h_tf)) )  )
ax2.set_title('Tf Parameters : min 0.001 x max 0.004')
plt.show()

In [None]:
# --------------- TFIDF PARAMETERS ---------- #

# Create a subplot with 1 row and 2 columns
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

h_tfidf = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)
ax1.plot( list(map(lambda x: x[1], h_tfidf)) )
ax1.set_title('TfIdf Scores')
 
plt.plot( list(filter( lambda y: y > 0.0001 and y <= 0.001 ,map(lambda x: x[1], h_tfidf)) )  )
ax2.set_title('TfIdf Parameters : min 0.0001 x max 0.001')
plt.show()

plt.show()