# Clasificación de palabras (por género de nombre)

In [1]:
import nltk, random
nltk.download('names')
from nltk.corpus import names 

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


**Función básica de extracción de atributos**

In [2]:
# definición de atributos relevantes
def atributos(palabra):
	return {'ultima_letra': palabra[-1]}

tagset = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])

In [3]:
tagset[:10]

[('Aamir', 'male'),
 ('Aaron', 'male'),
 ('Abbey', 'male'),
 ('Abbie', 'male'),
 ('Abbot', 'male'),
 ('Abbott', 'male'),
 ('Abby', 'male'),
 ('Abdel', 'male'),
 ('Abdul', 'male'),
 ('Abdulkarim', 'male')]

In [4]:
random.shuffle(tagset)
tagset[:10]

[('Gisella', 'female'),
 ('Darius', 'male'),
 ('Skippy', 'male'),
 ('Odelia', 'female'),
 ('Irvin', 'male'),
 ('Nathanil', 'male'),
 ('Briggs', 'male'),
 ('Vite', 'male'),
 ('Binny', 'female'),
 ('Harcourt', 'male')]

In [5]:
fset = [(atributos(n), g) for (n, g) in tagset]
train, test = fset[500:], fset[:500]

**Modelo de clasificación Naive Bayes**

In [None]:
# entrenamiento del modelo NaiveBayes
classifier = nltk.NaiveBayesClassifier.train(train)

 **Verificación de algunas predicciones**

In [None]:
classifier.classify(atributos('amanda'))

'female'

In [None]:
classifier.classify(atributos('peter'))

'male'

**Performance del modelo**

In [None]:
print(nltk.classify.accuracy(classifier, test))

0.734


In [None]:
print(nltk.classify.accuracy(classifier, train))

0.7647770016120365


**Mejores atributos**

In [6]:
def mas_atributos(nombre):
    atrib = {}
    atrib["primera_letra"] = nombre[0].lower()
    atrib["ultima_letra"] = nombre[-1].lower()
    for letra in 'abcdefghijklmnopqrstuvwxyz':
        atrib["count({})".format(letra)] = nombre.lower().count(letra)
        atrib["has({})".format(letra)] = (letra in nombre.lower())
    return atrib

In [8]:
mas_atributos('rodrigo')

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 1,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 1,
 'count(h)': 0,
 'count(i)': 1,
 'count(j)': 0,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 0,
 'count(o)': 2,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 2,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': True,
 'has(e)': False,
 'has(f)': False,
 'has(g)': True,
 'has(h)': False,
 'has(i)': True,
 'has(j)': False,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': False,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': True,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'primera_letra': 'r',
 'ultima_letra': 'o'}

In [9]:
fset = [(mas_atributos(n), g) for (n, g) in tagset]
train, test = fset[500:], fset[:500]
classifier2 = nltk.NaiveBayesClassifier.train(train)

In [10]:
print(nltk.classify.accuracy(classifier2, test))
print(nltk.classify.accuracy(classifier2, train))

0.766
0.7787479849543256


### Ejercicio de práctica

**Objetivo:** Construye un classificador de nombres en español usando el siguiente dataset: 
https://github.com/jvalhondo/spanish-names-surnames

1. **Preparación de los datos**: con un `git clone` puedes traer el dataset indicado a tu directorio en Colab, luego asegurate de darle el formato adecuado a los datos y sus features para que tenga la misma estructura del ejemplo anterior con el dataset `names` de nombres en ingles. 

* **Piensa y analiza**: ¿los features en ingles aplican de la misma manera para los nombres en español?

In [None]:
# escribe tu código aquí


2. **Entrenamiento y performance del modelo**: usando el classificador de Naive Bayes de NLTK entrena un modelo sencillo usando el mismo feature de la última letra del nombre, prueba algunas predicciones y calcula el performance del modelo. 

In [None]:
# escribe tu código aquí


3. **Mejores atributos:** Define una función como `atributos2()` donde puedas extraer mejores atributos con los cuales entrenar una mejor version del clasificador. Haz un segundo entrenamiento y verifica como mejora el performance de tu modelo. ¿Se te ocurren mejores maneras de definir atributos para esta tarea particular?

In [None]:
# escribe tu código aquí


# Clasificación de documentos (email spam o no spam)

In [11]:
!git clone https://github.com/pachocamacho1990/datasets

Cloning into 'datasets'...
remote: Enumerating objects: 39, done.[K
remote: Total 39 (delta 0), reused 0 (delta 0), pack-reused 39[K
Unpacking objects: 100% (39/39), done.


In [None]:
import pandas as pd
import numpy as np
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
df = pd.read_csv('datasets/email/csv/spam-apache.csv', names = ['clase','contenido'])
df['tokens'] = df['contenido'].apply(lambda x: word_tokenize(x))
df.head()

Unnamed: 0,clase,contenido,tokens
0,-1,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...","[<, !, DOCTYPE, HTML, PUBLIC, ``, -//W3C//DTD,..."
1,1,> Russell Turpin:\n> > That depends on how the...,"[>, Russell, Turpin, :, >, >, That, depends, o..."
2,-1,Help wanted. We are a 14 year old fortune 500...,"[Help, wanted, ., We, are, a, 14, year, old, f..."
3,-1,Request A Free No Obligation Consultation!\nAc...,"[Request, A, Free, No, Obligation, Consultatio..."
4,1,Is there a way to look for a particular file o...,"[Is, there, a, way, to, look, for, a, particul..."


In [None]:
df['tokens'].values[0]

In [None]:
all_words = nltk.FreqDist([w for tokenlist in df['tokens'].values for w in tokenlist])
top_words = all_words.most_common(200)

def document_features(document):
    document_words = set(document)
    features = {}
    for word in top_words:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [None]:
document_features(df['tokens'].values[0])

In [None]:
fset = [(document_features(texto), clase) for texto, clase in zip(df['tokens'].values, df['clase'].values)]
random.shuffle(fset)
train, test = fset[:200], fset[200:]

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train)

In [None]:
print(nltk.classify.accuracy(classifier, test))

0.48


In [None]:
classifier.show_most_informative_features(5)

Most Informative Features
    contains(('us', 53)) = False               1 : -1     =      1.0 : 1.0
 contains(('could', 66)) = False               1 : -1     =      1.0 : 1.0
 contains(('money', 72)) = False               1 : -1     =      1.0 : 1.0
  contains(('days', 40)) = False               1 : -1     =      1.0 : 1.0
  contains(('need', 43)) = False               1 : -1     =      1.0 : 1.0


In [None]:
df[df['clase']==-1]['contenido']

0      <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tr...
2      Help wanted.  We are a 14 year old fortune 500...
3      Request A Free No Obligation Consultation!\nAc...
10     >\n>“µ×è¹µÑÇ ¡ÑºâÅ¡¸ØÃ¡Ô¨º¹ÍÔ¹àµÍÃìà¹çµ” \n>àµ...
                             ...                        
243    ##############################################...
244    Wanna see sexually curious teens playing with ...
246    REQUEST FOR URGENT BUSINESS ASSISTANCE\n------...
248    Email marketing works!  There's no way around ...
249    Email marketing works!  There's no way around ...
Name: contenido, Length: 125, dtype: object

## Ejercicio de práctica


¿Como podrías construir un mejor clasificador de documentos?

0. **Dataset más grande:** El conjunto de datos que usamos fue muy pequeño, considera usar los archivos corpus que estan ubicados en la ruta: `datasets/email/plaintext/` 

1. **Limpieza:** como te diste cuenta no hicimos ningun tipo de limpieza de texto en los correos electrónicos. Considera usar expresiones regulares, filtros por categorias gramaticales, etc ... . 

---

Con base en eso construye un dataset más grande y con un tokenizado más pulido. 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# escribe tu código aquí:


2. **Validación del modelo anterior:**  
---

una vez tengas el nuevo conjunto de datos más pulido y de mayor tamaño, considera el mismo entrenamiento con el mismo tipo de atributos del ejemplo anterior, ¿mejora el accuracy del modelo resultante?

In [None]:
# escribe tu código aquí:


3. **Construye mejores atributos**: A veces no solo se trata de las palabras más frecuentes sino de el contexto, y capturar contexto no es posible solo viendo los tokens de forma individual, ¿que tal si consideramos bi-gramas, tri-gramas ...?, ¿las secuencias de palabras podrián funcionar como mejores atributos para el modelo?. Para ver si es así,  podemos extraer n-gramas de nuestro corpus y obtener sus frecuencias de aparición con `FreqDist()`, desarrolla tu propia manera de hacerlo y entrena un modelo con esos nuevos atributos, no olvides compartir tus resultados en la sección de comentarios. 

In [None]:
# escribe tu código aquí:


# Se cambian los features para dar mejor precision en la prediccion de clasificacion

In [12]:
import os
import nltk
import random
from nltk import word_tokenize
from nltk.collocations import *
import pandas as pd
nltk.download("punkt")

!git clone https://github.com/pachocamacho1990/datasets
! unzip datasets/email/plaintext/corpus1.zip
! unzip datasets/email/plaintext/corpus2.zip
! unzip datasets/email/plaintext/corpus3.zip

#Funciones para cargar los datasets

# Get Text and labels from folders with plain text files
def get_text_labels_from_folders(folderBase, folderLabels):
  data = []
  labels = []

  for folderLabel in folderLabels:
    for file in os.listdir('{}/{}'.format(folderBase, folderLabel)):
      with open('{}/{}/{}'.format(folderBase, folderLabel, file), encoding='latin-1') as f:
        data.append(f.read())
        labels.append(folderLabel)

  return data, labels

def set_label_num(label_str):
  if label_str == "spam":
      return 1
  else:
      return 0
 
dataCorpus1, labelsCorpus1 = get_text_labels_from_folders('corpus1', ["spam", "ham"])
dataCorpus2, labelsCorpus2 = get_text_labels_from_folders('corpus2', ["spam", "ham"])
dataCorpus3, labelsCorpus3 = get_text_labels_from_folders('corpus3', ["spam", "ham"])
data = dataCorpus1 + dataCorpus2 + dataCorpus3
labels = labelsCorpus1 + labelsCorpus2 + labelsCorpus3

dataframe = pd.DataFrame({'text': data, 'labels': labels})
dataframe = dataframe.sample(frac = 1) 
dataframe['tokens'] = dataframe['text'].apply(lambda x: word_tokenize(x))
dataframe['labels_num'] = dataframe['labels'].apply(lambda x: set_label_num(x))

#Funciones para filtrar palabras y obtener colocaciones de n gramas

def filter_words_by_threshold(text_tokenized, threshold = 3, ):
  words = []
  words = [word for word in text_tokenized if len(word) > threshold]
  return words

def get_n_grams_collocations_from_words(words, freq_filter = 10, n_best= 10,
                                       n_gran_measure = nltk.collocations.BigramAssocMeasures()):
  finder = BigramCollocationFinder.from_words(words)
  finder.apply_freq_filter(freq_filter)
  email_spam_collocations = finder.nbest(n_gran_measure.pmi, n_best)
  return email_spam_collocations

#Obtener las colocaciones y palabras más comunes en el datasets de spam

spamCorpus1, _ = get_text_labels_from_folders('corpus1', ["spam"])
spamCorpus2, _ = get_text_labels_from_folders('corpus2', ["spam"])
spamCorpus3, _ = get_text_labels_from_folders('corpus3', ["spam"])
spamCorpuses = spamCorpus1 + spamCorpus2 + spamCorpus3

filtered_words = []
for text in spamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

email_spam_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
all_spam_words = nltk.FreqDist([w for w in filtered_words])
top_spam_words = all_spam_words.most_common(200)

#Obtener las colocaciones y palabras comunes en el datasets de ham

hamCorpus1, _ = get_text_labels_from_folders('corpus1', ["ham"])
hamCorpus2, _ = get_text_labels_from_folders('corpus2', ["ham"])
hamCorpus3, _ = get_text_labels_from_folders('corpus3', ["ham"])
hamCorpuses = hamCorpus1 + hamCorpus2 + hamCorpus3

filtered_words = []
for text in hamCorpuses:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words

email_ham_collocations = get_n_grams_collocations_from_words(filtered_words, 120, 40)
all_ham_words = nltk.FreqDist([w for w in filtered_words])
top_ham_words = all_ham_words.most_common(200)

#Filtrar palabras repetidas de palabras más comunes en spam y ham.

top_ham_words_iterator = top_ham_words
for word in top_ham_words_iterator:
  if word in top_ham_words and word in top_spam_words: 
    top_ham_words.remove(word)
    top_spam_words.remove(word)

#Obtener palabras más comunes de todo el dataset

filtered_words = []
for text in data:
  filtered_words += filter_words_by_threshold(word_tokenize(text))
filtered_words
all_words = nltk.FreqDist([w for w in filtered_words])
top_words = all_words.most_common(200)
top_words

#Obtener los atributos

def document_attributes(document):
  document_words = set(document)
  atrib = {}
  for word in top_words:
    atrib['contains({})'.format(word)] = (word in document_words)
  
  for word in top_spam_words:
    atrib['contains_spam_word({})'.format(word)] = (word in document_words)

  for word in top_ham_words:
    atrib['contains_ham_word({})'.format(word)] = (word in document_words)

  
  for word in document_words:
    has_spam_word = False
    has_ham_word = False

    for bigram_position_0, bigram_position_1 in email_spam_collocations:
        if word == bigram_position_0 or word == bigram_position_1:
          has_spam_word = True
          break

    for bigram_position_0, bigram_position_1 in email_ham_collocations:
        if word == bigram_position_0 or word == bigram_position_1:
          has_ham_word = True
          break

      
    atrib['spam_word({})'.format(word)] = has_spam_word
    atrib['ham_word({})'.format(word)] = has_ham_word
    
    
  filtered_words = filter_words_by_threshold(document)
  bigrams = get_n_grams_collocations_from_words(filtered_words, n_best=10, freq_filter=5)

  for i in range(len(bigrams)):
    atrib['bigram_collocation({})'.format(i)] = bigrams[i]
    
  return atrib

#Separar dataset de prueba y entrenamiento

fset = [(document_attributes(text), labels) for text, labels in zip(dataframe['tokens'], dataframe['labels_num'].values)]
random.shuffle(fset)
print(len(fset))
train, test = fset[:13078], fset[13078:]

#Entrenar y calcular accuracy

classifier = nltk.NaiveBayesClassifier.train(train)
print(nltk.classify.accuracy(classifier, test))


[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
  inflating: corpus3/spam/2275.2004-12-22.BG.spam.txt  
  inflating: corpus3/spam/0687.2004-09-19.BG.spam.txt  
  inflating: corpus3/spam/1151.2004-10-21.BG.spam.txt  
  inflating: corpus3/spam/4203.2005-04-02.BG.spam.txt  
  inflating: corpus3/spam/4263.2005-04-06.BG.spam.txt  
  inflating: corpus3/spam/4731.2005-05-07.BG.spam.txt  
  inflating: corpus3/spam/3005.2005-01-25.BG.spam.txt  
  inflating: corpus3/spam/4744.2005-05-08.BG.spam.txt  
  inflating: corpus3/spam/1371.2004-11-02.BG.spam.txt  
  inflating: corpus3/spam/3408.2005-02-12.BG.spam.txt  
  inflating: corpus3/spam/4523.2005-04-24.BG.spam.txt  
  inflating: corpus3/spam/2991.2005-01-24.BG.spam.txt  
  inflating: corpus3/spam/4529.2005-04-24.BG.spam.txt  
  inflating: corpus3/spam/1376.2004-11-02.BG.spam.txt  
  inflating: corpus3/spam/2971.2005-01-23.BG.spam.txt  
  inflating: corpus3/spam/0767.2004-09-24.BG.spam.txt  
  inflating: corpus3/sp