# Code to perform the lemmatization/tokenization of the 20 NewsGroups dataset
* It saves a dictionary containing the tokenized docs (each one paired with its category) and the total number of docs

Imports

In [None]:
import codecs 
from glob import glob
import os
import pickle
import copy
from time import time

## 1. IMPORTING DOCS FROM 20 NEWSGROUPS DATASET

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Newsgroup names passed to the dataset loader, grouped here by their
# top-level hierarchy. The order of the entries is unchanged.
categories = [
    # alt.*
    'alt.atheism',
    # comp.*
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    # misc.*
    'misc.forsale',
    # rec.*
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    # sci.*
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    # soc.*
    'soc.religion.christian',
    # talk.*
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

## INPUT PARAMETERS

In [None]:
# Load the training split restricted to the newsgroups listed in `categories`.
# Headers, footers and quotes are requested to be removed, and the documents
# are shuffled with a fixed seed so that the order is reproducible.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)

#### TOTAL NUMBER OF DOCS

In [None]:
# The number of entries in `filenames` is taken as the total number of docs.
n_docs = len(newsgroups_train.filenames)
n_docs  # displayed as the notebook cell output

## COMPLETE LEMMATIZATION

In [None]:
# NOTE: the get_wordnet_pos helper defined further down in this cell casts a
# POS tag to the single-character code that is accepted by the lemmatizer.
import nltk.corpus
# English stop-word list from NLTK; LemmaTokenizer_func uses it to drop
# stop words from the lemmatized tokens.
stop_words = nltk.corpus.stopwords.words('english')

import re
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the beginning of the tag is inspected: tags starting with 'J', 'V',
    'N' or 'R' map to the ADJ, VERB, NOUN and ADV constants of
    ``nltk.corpus.reader.wordnet`` respectively.

    Args:
        treebank_tag: POS tag string (in this file, the tags produced by
            ``pos_tag``).

    Returns:
        The corresponding WordNet constant, or ``None`` when the tag starts
        with none of the four recognised letters.
    """
    # (tag prefix, name of the WordNet constant) pairs, checked in order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if treebank_tag.startswith(prefix):
            # The constant is looked up only once a prefix has matched.
            return getattr(nltk.corpus.reader.wordnet, constant_name)
    return None

from nltk import word_tokenize, pos_tag        
from nltk.stem import WordNetLemmatizer 
def LemmaTokenizer_func(doc,cat):
    """Lower-case, tokenize, POS-tag and lemmatize one document.

    After lemmatization, tokens are kept only if they contain at least two
    consecutive ASCII letters (this drops pure numbers and punctuation) and
    are not in the module-level ``stop_words`` list.

    Args:
        doc: Raw text of the document.
        cat: Category label of the document; passed through untouched so the
            result stays paired with its label when run in parallel.

    Returns:
        A ``(text, cat)`` tuple where ``text`` is the surviving lemmas joined
        by single spaces.
    """
    wnl = WordNetLemmatizer()
    # word_tokenize also splits punctuation into separate tokens.
    tagged_doc = pos_tag(word_tokenize(doc.lower()))

    lemmatized_doc = []
    for word, tag in tagged_doc:
        # Look up the WordNet POS once per token and reuse the result.
        wordnet_pos = get_wordnet_pos(tag)
        if wordnet_pos is not None:
            lemmatized_doc.append(wnl.lemmatize(word, wordnet_pos))
        else:
            # No WordNet equivalent for this tag: let the lemmatizer use its
            # default part of speech.
            lemmatized_doc.append(wnl.lemmatize(word))

    # Set membership is O(1) per token, versus a linear scan of the list.
    stop_word_set = frozenset(stop_words)
    kept_words = [
        word for word in lemmatized_doc
        if re.search('[a-zA-Z]{2,}', word) and word not in stop_word_set
    ]

    return (' '.join(kept_words), cat)

In [None]:
# Category name of every document, in the same order as newsgroups_train.data.
# Built as a real list: on Python 3 `map` returns a one-shot iterator, which
# would be empty the second time it is consumed (e.g. when the lemmatization
# cell is re-run) and would silently produce an empty result.
cat_docs_original = [newsgroups_train.target_names[x] for x in newsgroups_train.target]

In [None]:
from joblib import Parallel, delayed  
import multiprocessing
# Start the timer and use one worker per CPU core reported by the OS.
t0 = time()
num_cores = multiprocessing.cpu_count()

# Lemmatize every (document, category) pair in parallel; each job returns the
# tuple produced by LemmaTokenizer_func.
tokenized_docs = Parallel(n_jobs=num_cores)(
    delayed(LemmaTokenizer_func)(doc, cat_doc)
    for doc, cat_doc in zip(newsgroups_train.data, cat_docs_original)
)
print("Full lemmatization done in %0.3fs." % (time() - t0))

In [None]:
cwd = os.getcwd()  # Current working directory; results are written beneath it
results_dir_path = cwd + '/results/'

# Create the results directory on first use so that open() below does not
# fail just because the folder is missing.
if not os.path.isdir(results_dir_path):
    os.makedirs(results_dir_path)

# pickle writes bytes, so the file must be opened in binary mode ('wb');
# text mode ('w') makes pickle.dump raise TypeError on Python 3.
# The with-block closes the file even if pickle.dump raises.
output_path = os.path.join(results_dir_path,
                           'lemmatized_text_n_docs' + str(n_docs) + '.pkl')
with open(output_path, 'wb') as output:
    # Saved payload: the (lemmatized text, category) tuples and the doc count.
    pickle.dump({'tokenized_docs': tokenized_docs,
                 'n_docs': n_docs}, output)