In [None]:
# data preparation with apache spark - pyspark

In [1]:
import nltk
import pandas as pd
import numpy as np
import re
import codecs
import matplotlib.pyplot as plt

In [2]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# The `sc` and `spark` objects may already be pre-loaded in managed environments
# such as Amazon EMR, Databricks or Hortonworks; getOrCreate() reuses whatever
# already exists instead of building a duplicate.
sc = SparkContext.getOrCreate()
# Go through the builder rather than calling SparkSession(sc) directly, so that
# re-running this cell returns the active session instead of constructing a new one.
spark = SparkSession.builder.getOrCreate()

In [3]:
import os

# Input and output directories (paths).
# Directory holding the sample papers (*.txt) read by the cells below.
path_in="../datasets/papers_sample/"
# Directory that receives the cleaned text and term-frequency files.
path_out="/tmp/papers_out/"
# Create the output directory up front so the later open(path_out + ..., "w")
# calls do not fail with FileNotFoundError on a fresh machine.
os.makedirs(path_out, exist_ok=True)

In [4]:
# NLTK resources used by this notebook:
#   punkt     -> nltk.word_tokenize
#   stopwords -> nltk.corpus.stopwords
#   words     -> nltk.corpus.words (English vocabulary lookup)
#   wordnet   -> WordNetLemmatizer
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
for resource in ('punkt', 'stopwords', 'words', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/emontoya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emontoya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
from pyspark.sql.functions import udf,col,lower
from pyspark.sql.types import ArrayType, IntegerType, StringType

# Read every .txt file under path_in as (file path, full content) pairs.
myrdd = sc.wholeTextFiles(path_in+"*.txt")
# Turn the pair RDD into a two-column DataFrame.
mydf = myrdd.toDF(schema=['file','content'])

# Token counter. The return type is declared explicitly: a udf created without
# one defaults to StringType, which would turn the length columns into strings.
contar = udf(lambda tokens: len(tokens), IntegerType())

# Step 1: tokenize the raw content and count the tokens per document.
tokenizer = Tokenizer(inputCol="content", outputCol="tokens1")
dftokens = tokenizer.transform(mydf)
dftokens = dftokens.withColumn('longitud1',contar(col('tokens1'))).select(['tokens1','longitud1'])
dftokens.show(2)

# Step 2: remove stop words and count again.
removersw = StopWordsRemover(inputCol='tokens1', outputCol='tokens2')
dftokens = removersw.transform(dftokens)
dftokens = dftokens.withColumn('longitud2',contar(col('tokens2'))).select(['tokens2','longitud2'])

dftokens.show(2)

# Step 3: drop tokens of length <= 1. The array return type keeps 'tokens3' a
# real array of strings, so `contar` counts tokens rather than the characters
# of a stringified list.
filter_udp = udf(lambda x: [w for w in x if len(w) > 1], ArrayType(StringType()))


dftokens = dftokens.withColumn('tokens3',filter_udp(col('tokens2')))
dftokens = dftokens.withColumn('longitud3',contar(col('tokens3'))).select(['tokens3','longitud3'])
dftokens.show(2)

+--------------------+---------+
|             tokens1|longitud1|
+--------------------+---------+
|[1, , variations,...|    12175|
|[smooth, rényi, ...|     4242|
+--------------------+---------+
only showing top 2 rows

+--------------------+---------+
|             tokens2|longitud2|
+--------------------+---------+
|[1, , variations,...|     8392|
|[smooth, rényi, ...|     3122|
+--------------------+---------+
only showing top 2 rows

+--------------------+---------+
|             tokens3|longitud3|
+--------------------+---------+
|[variations, them...|     6196|
|[smooth, rényi, ...|     2243|
+--------------------+---------+
only showing top 2 rows



In [None]:
# Example of how NLTK tokenizes a piece of free text.
texto="texto libre weren't que permite--4 crear  las   hiso1iras epor--4 no se preocupe \n hola mundo cruel"
tokens = nltk.word_tokenize(texto)
# Token count first, then the tokens themselves, one per output line.
print(len(tokens), tokens, sep='\n')

In [None]:
# Tokenizing with plain Python string methods instead of NLTK -- which strategy
# looks better?
# Note the difference between the two calls: .split() treats any run of
# whitespace as one separator, while .split(' ') cuts at every single space and
# therefore yields empty strings (and keeps the newline as a token).
texto="texto libre weren't que permite--4 crear  las   hiso1iras epor--4 no se preocupe \n hola mundo cruel"
for tokens in (texto.split(), texto.split(' ')):
    print(len(tokens))
    print(tokens)

In [None]:
# A library other than NLTK for stop-word dictionaries -- which one is better?
# $ git clone --recursive git://github.com/Alir3z4/python-stop-words.git
#!pip install stop-words --user

from stop_words import get_stop_words

# The list is requested under both language identifiers, 'english' and 'en';
# the second assignment is the one that sticks.
stop_words = get_stop_words('english')
stop_words = get_stop_words('en')
# Size of the list first, then the list itself.
print(len(stop_words), stop_words, sep='\n')

In [None]:
# Stop words shipped with NLTK.
from nltk.corpus import stopwords

# Kept as a set: the cells below only use it for membership tests.
stop_words_nltk = {*stopwords.words('english')}
print(len(stop_words_nltk), stop_words_nltk, sep='\n')

In [None]:
# NLTK ships a word list per language that can be used to check whether a token
# belongs to that language's dictionary -- here, English.
from nltk.corpus import words as voc_en

# Fetch the word list once and index it in a set: membership tests against the
# raw list are a linear scan over the whole vocabulary.
_english_words = voc_en.words()
x = len(_english_words)
print('tamaño del diccionario en ingles del nltk: ',x)
_english_vocab = set(_english_words)


def is_dictionary_word(word):
    """Return True when `word` is a usable English dictionary term.

    A word qualifies when it is longer than one character, purely alphabetic,
    present in the NLTK English word list and absent from `stop_words`
    (the stop-word list loaded from the stop_words library).
    """
    return (
        len(word) > 1
        and word.isalpha()
        and word in _english_vocab
        and word not in stop_words
    )


# Check one word expected to be in the dictionary and one expected not to be.
for w in ("house", "pepito"):
    print(w, " true" if is_dictionary_word(w) else " false")

In [None]:
# Read one sample paper (.txt) fully into memory.
# The context manager closes the handle as soon as the content has been read,
# instead of leaving it open for the rest of the kernel session.
with open(path_in+"0704.3504.txt", "r", encoding='iso-8859-1') as input_file:
    filedata = input_file.read()

In [None]:
# Option 1:
# TOKENIZE with .split(),
# STRIP every non-alphanumeric character,
# DROP tokens of length <= 1 and lowercase the rest,
# REMOVE the NLTK stop words,
# then plot the 20 most frequent terms.

tokens = filedata.split()
print(len(tokens))
# Exercise: what happens if, instead of re.sub(r'[^A-Za-z0-9]+','',w), the
# tokens are filtered with  tokens=[word for word in tokens if word.isalpha()] ?
tokens = [
    cleaned.lower()
    for cleaned in (re.sub(r'[^A-Za-z0-9]+','',raw) for raw in tokens)
    if len(cleaned) > 1 and cleaned.lower() not in stop_words_nltk
]

fdist = nltk.FreqDist(tokens)
# len(fdist) is the number of distinct terms that survived the cleaning.
print('numero de palabras finales = ',len(fdist))
topwords = fdist.most_common(20)
print (topwords)

# Split the (term, count) pairs into bar labels and bar heights.
x,y = zip(*topwords)
plt.figure(figsize=(15, 10))
plt.bar(x, height=y)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Option 2:
# TOKENIZE with NLTK,
# LOWERCASE and STRIP every non-alphanumeric character,
# DROP tokens of length <= 1,
# REMOVE the NLTK stop words,
# then plot the 20 most frequent terms.

tokens = nltk.word_tokenize(filedata)
tokens = [re.sub(r'[^A-Za-z0-9]+','',w.lower()) for w in tokens]
# The length filter must run AFTER the strip: punctuation-only tokens such as
# "--" collapse to '' and would otherwise be counted as a (very frequent) term.
tokens = [w for w in tokens if len(w)>1 and w not in stop_words_nltk]

fdist = nltk.FreqDist(tokens)
topwords = fdist.most_common(20)
# len(fdist) is the number of distinct terms that survived the cleaning.
print('numero de palabras finales = ',len(fdist))
print (topwords)
# Split the (term, count) pairs into bar labels and bar heights.
x,y = zip(*topwords)
plt.figure(figsize=(15,10))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Stemming with NLTK.

from nltk.stem import LancasterStemmer, PorterStemmer

porter = PorterStemmer()
lancaster = LancasterStemmer()

# The Lancaster stemmer is the one applied; swap in porter.stem to compare.
# Note: `tokens` is overwritten, so re-running this cell stems the stems again.
tokens = list(map(lancaster.stem, tokens))

fdist = nltk.FreqDist(tokens)
topwords = fdist.most_common(20)
# len(fdist) is the number of distinct stems.
print('numero de palabras finales = ',len(fdist))

# Split the (stem, count) pairs into bar labels and bar heights.
x,y = zip(*topwords)
plt.figure(figsize=(15, 10))
plt.bar(x, height=y)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Lemmatization with NLTK.

from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# Every token is lemmatized as a verb (pos="v"); call lemmatize(w) without the
# pos argument to use the lemmatizer's default instead.
tokens = list(map(lambda tok: wordnet_lemmatizer.lemmatize(tok, pos="v"), tokens))

fdist = nltk.FreqDist(tokens)
topwords = fdist.most_common(20)
# len(fdist) is the number of distinct lemmas.
print('numero de palabras finales = ',len(fdist))

# Split the (lemma, count) pairs into bar labels and bar heights.
x,y = zip(*topwords)
plt.figure(figsize=(15, 10))
plt.bar(x, height=y)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Re-open the sample .txt paper for line-by-line reading, and open the file that
# will receive its cleaned text.
# Both handles stay open after this cell: the cleaning loop in the next cell
# iterates over `input_file` and writes to `output_file_clean`.
input_file = open(path_in+"0704.3504.txt", "r", encoding='iso-8859-1')
output_file_clean = open(path_out+"0704.3504_clean.txt", "w")

In [None]:
# Clean the paper line by line and write the surviving tokens to the output file.
try:
    for line in input_file:
        tokens = line.split()
        # Strip every non-alphanumeric character from each token.
        tokens = [re.sub(r'[^A-Za-z0-9]+','',w) for w in tokens]
        # Drop tokens of length <= 1 and lowercase the rest.
        tokens = [w.lower() for w in tokens if len(w)>1]
        # Keep purely alphabetic tokens only (this discards anything with digits).
        tokens = [w for w in tokens if w.isalpha()]
        tokens = [w for w in tokens if w not in stop_words_nltk]

        # Lines left without any token are not written at all.
        if tokens:
            # Same layout as the original loop produced: every token is
            # followed by a single space, then the line is terminated.
            line_clean = " ".join(tokens) + " \n"
            output_file_clean.write(line_clean)
finally:
    # Release both handles even if the cleaning fails part-way through.
    output_file_clean.close()
    input_file.close()

In [None]:
# Open the cleaned text file under path_out for reading; the handle is consumed
# by the next cell.
# NOTE(review): this file is written with the platform-default encoding but read
# here as iso-8859-1 -- harmless while its content is plain ASCII; confirm.
input_file_clean = open(path_out+"0704.3504_clean.txt", "r", encoding='iso-8859-1')

In [None]:
# Load the cleaned text and plot its 20 most frequent terms.
# The handle opened in the previous cell is closed as soon as it has been read,
# instead of staying open for the rest of the kernel session.
with input_file_clean:
    filedata = input_file_clean.read()
tokens = filedata.split()
fdist = nltk.FreqDist(tokens)
topwords = fdist.most_common(20)
# len(fdist) is the number of distinct terms in the cleaned file.
print('numero de palabras finales = ',len(fdist))
# Split the (term, count) pairs into bar labels and bar heights.
x,y = zip(*topwords)
plt.figure(figsize=(15,10))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Every (term, count) pair of the distribution, most frequent first.
word_freq = fdist.most_common()

In [None]:
import csv

# Export the full term-frequency table as CSV: a header row followed by one
# (word, count) row per term.
# newline='' is what the csv module documentation asks for on files handed to
# csv.writer; without it, platforms that translate "\n" emit blank rows.
with open(path_out+'0704.3504_tf.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(["word", "frecuency"])
    writer.writerows(word_freq)
# No explicit close() is needed: leaving the with-block already closed the file.

In [None]:
# Extract the top 20 words: the first 20 entries of word_freq.
top_words = word_freq[:20]
print(top_words)

In [None]:
import pandas as pd

# Tabular view of the top words; with no column names given, the columns get
# the default integer labels.
df = pd.DataFrame(data=top_words)
# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the top words, built directly from the (word, count) tuples.
x,y = zip(*top_words)
plt.figure(figsize=(15, 10))
plt.bar(x, height=y)
# Vertical tick labels so the words do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Same bar chart as before, this time driven by a DataFrame and with axis labels.
df = pd.DataFrame(data=top_words)
plt.figure(figsize=(15, 10))
# Column 0 supplies the bar labels, column 1 the bar heights.
plt.bar(x=df[0], height=df[1])
plt.xticks(rotation=45)
plt.xlabel("Word")
plt.ylabel("frecuency")
plt.show()