# Librerías

In [92]:
import warnings
warnings.filterwarnings("ignore")

import os
from collections import Counter
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pprint
from nltk import FreqDist
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import gensim
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot, hashing_trick, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import HashingVectorizer
import lightgbm as lgb
from sklearn import pipeline
import gc
from tqdm import tqdm_notebook
import datetime
import dask.dataframe as dd

# Logging

In [None]:
import logging

# Create the log directory if needed; exist_ok replaces the separate existence check
# (and removes the window between checking and creating).
os.makedirs('../logs/', exist_ok=True)

NAME = 'Exploracion'

# One log file per day and notebook name, e.g. ../logs/20190101_Exploracion.log
LOG_NAME = '../logs/{}_{}.log'.format(datetime.datetime.now().strftime("%Y%m%d"), NAME)
logging.basicConfig(filename=LOG_NAME, level=logging.WARNING, format='%(asctime)s %(message)s')

logging.warning("")
logging.warning("Comienzo script")

# Datos

In [69]:
train = pd.read_csv('../data/train.csv', usecols=['comment_text', 'target'], nrows=100000)

In [70]:
# train2 = dd.read_csv('../data/train.csv', usecols=['comment_text', 'target']).head(50000)

In [71]:
train.shape

(100000, 2)

In [72]:
train.head(10)

Unnamed: 0,target,comment_text
0,0.0,"This is so cool. It's like, 'would you want yo..."
1,0.0,Thank you!! This would make my life a lot less...
2,0.0,This is such an urgent design problem; kudos t...
3,0.0,Is this something I'll be able to install on m...
4,0.893617,haha you guys are a bunch of losers.
5,0.666667,ur a sh*tty comment.
6,0.457627,hahahahahahahahhha suck it.
7,0.0,FFFFUUUUUUUUUUUUUUU
8,0.0,The ranchers seem motivated by mostly by greed...
9,0.0,It was a great show. Not a combo I'd of expect...


In [73]:
text_data = train['comment_text']
text_data.head(10)

0    This is so cool. It's like, 'would you want yo...
1    Thank you!! This would make my life a lot less...
2    This is such an urgent design problem; kudos t...
3    Is this something I'll be able to install on m...
4                 haha you guys are a bunch of losers.
5                                 ur a sh*tty comment.
6                          hahahahahahahahhha suck it.
7                                  FFFFUUUUUUUUUUUUUUU
8    The ranchers seem motivated by mostly by greed...
9    It was a great show. Not a combo I'd of expect...
Name: comment_text, dtype: object

Conteo

In [74]:
%%time
# Hand-made features, one value per comment in `text_data`.
# Occurrences of '*', '!' and '?':
count_ast = [phrase.count('*') for phrase in text_data]
count_ex = [phrase.count('!') for phrase in text_data]
count_qu = [phrase.count('?') for phrase in text_data]
# Comment length in characters.
len_pr = [len(phrase) for phrase in text_data]
# Length of the longest whitespace-separated word. `default=0` keeps an empty or
# whitespace-only comment from raising "max() arg is an empty sequence".
len_max_word = [max((len(x) for x in phrase.split()), default=0) for phrase in text_data]

CPU times: user 714 ms, sys: 39 µs, total: 714 ms
Wall time: 712 ms


In [75]:
data_extra = pd.DataFrame({'ast': count_ast,
                          'ex': count_ex,
                          'qu': count_qu,
                          'len_pr': len_pr,
                          'len_word': len_max_word})

data_extra.head(10)

Unnamed: 0,ast,ex,qu,len_pr,len_word
0,0,1,2,101,7
1,0,3,0,114,17
2,0,1,0,86,11
3,0,0,2,84,9
4,0,0,0,36,7
5,1,0,0,20,8
6,0,0,0,27,18
7,0,0,0,19,19
8,0,0,0,120,9
9,0,0,0,80,8


Data Cleaning

In [76]:
# Characters stripped from the comments by preprocess2.
# The original literal contained '\×', an invalid escape sequence (a warning today, an error in
# future Python versions). The backslash itself is already listed via '\\', so dropping the
# stray one leaves the set of removed characters unchanged.
CHARS_TO_REMOVE = '!¡"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€×™√²—'
MAX_LEN = 1000
stop_words = list(stopwords.words('english'))

In [77]:
def preprocess(data, sw):
    """Remove stop words from every text of a pandas Series.

    Parameters
    ----------
    data : pandas.Series
        Texts to clean; every entry is cast to ``str`` first.
    sw : iterable of str
        Stop words. Each word of the text is lower-cased before the lookup, so the
        stop words are expected in lower case.

    Returns
    -------
    pandas.Series
        The texts rebuilt from the surviving whitespace-separated words, joined by
        single spaces (original casing of the kept words is preserved).
    """
    # Build the lookup once: testing membership against a list scans it for every word.
    stop_set = frozenset(sw)

    def clean_special_chars(text, sw):
        return ' '.join(word for word in text.split() if word.lower() not in sw)

    return data.astype(str).apply(lambda x: clean_special_chars(x, stop_set))

In [78]:
def preprocess2(data, chars):
    """Delete from each text of a Series every character that appears in ``chars``.

    Entries are cast to ``str`` first; the result is a new Series of cleaned strings.
    """
    def clean_special_chars2(text, chars):
        kept = filter(lambda symbol: symbol not in chars, text)
        return ''.join(kept)

    as_text = data.astype(str)
    return as_text.apply(lambda entry: clean_special_chars2(entry, chars))

In [79]:
%%time
text_data_clean = preprocess(text_data, stop_words)

CPU times: user 6.38 s, sys: 15.9 ms, total: 6.39 s
Wall time: 6.4 s


In [80]:
%%time
text_data_clean2 = preprocess2(text_data_clean, CHARS_TO_REMOVE)

CPU times: user 1.05 s, sys: 4.03 ms, total: 1.05 s
Wall time: 1.05 s


In [None]:
text_data_clean2[:5]

In [None]:
list_len = [len(x.split()) for x in text_data_clean2]
maximo = max(list_len)
print(maximo)

In [None]:
def preprocess3(data, m):
    """Apply Keras ``one_hot(text, m)`` to each text of a Series.

    Entries are cast to ``str`` first; ``m`` is passed through unchanged as the second
    argument of ``one_hot``. Returns a Series holding the per-text results.
    """
    def clean_special_chars3(text, m):
        return one_hot(text, m)

    as_text = data.astype(str)
    return as_text.apply(lambda entry: clean_special_chars3(entry, m))

In [None]:
%%time
text_data_clean3 = preprocess3(text_data_clean2, maximo)

In [None]:
text_data_clean3[:5]

In [101]:
vectorizer = HashingVectorizer(n_features=2500)

In [93]:
vectorizer = TfidfVectorizer(max_features=2000).fit(text_data_clean2)

In [102]:
text_data_clean3 = vectorizer.transform(text_data_clean2).toarray()

In [103]:
X_words = pd.DataFrame(text_data_clean3)

In [104]:
X_train = pd.concat([X_words, data_extra], axis=1)
X_train.shape

(100000, 2505)

In [105]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2495,2496,2497,2498,2499,ast,ex,qu,len_pr,len_word
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,2,101,7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,3,0,114,17
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,86,11
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,2,84,9
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,36,7


In [None]:
tokenizer = Tokenizer(lower=True)

In [None]:
%%time
tokenizer.fit_on_texts(text_data_clean2)

In [None]:
%%time
X_train = tokenizer.texts_to_sequences(text_data_clean2)

In [None]:
%%time
X_train = pad_sequences(X_train, maxlen=maximo)

In [None]:
X_train = pd.DataFrame(X_train)

In [None]:
X_train.head(10)

In [None]:
# drop_cols = [i for i in range(MAX_LEN) if len(X_train.iloc[:,i].value_counts()) < 5000]

In [None]:
# drop_cols = list()
# for i in range(MAX_LEN):
#     lgth = len(X_train.iloc[:,i].value_counts())
#     if lgth == 1:
#         drop_cols.append(str(i))

In [None]:
# X_train.drop(drop_cols, axis=1, inplace=True)

In [112]:
# Restrict the training matrix to the selected feature columns.
# NOTE(review): `ft_sel` is only assigned in a cell further down the notebook, so this cell
# raises NameError on a fresh Restart & Run All — confirm the intended cell order.
X_train = X_train[ft_sel]
X_train.shape

(100000, 37)

In [113]:
y_train = np.where(train['target'] >= 0.5, 1, 0)

In [114]:
k = 3

train_ids = X_train.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)

3

In [115]:
params={
    'min_data_in_leaf':20,
        'max_depth':-1,
        'metric':'auc',
        'n_estimators':1000,
        'learning_rate':0.07,
        'num_leaves':75,
        'colsample_bytree':0.3,
        'objective':'binary',
        'n_jobs':-1,
        'seed':42,
        'bagging_fraction':0.8,
        'lambda_l1':0,
        'lambda_l2':0}

In [116]:
# Stratified k-fold training of a LightGBM classifier, accumulating feature importances.
lgb_model = lgb.LGBMClassifier(**params)

# One accumulator slot per column of X_train.
ft_importances = np.zeros(X_train.shape[1])

counter = 1
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    # Positional row selection for this fold.
    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train[train_index], y_train[test_index]

    # The same estimator object is re-fitted on every fold; the validation split is the
    # evaluation set used for early stopping.
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=50,
                  early_stopping_rounds=50)

    # Release the fold slices before the next iteration.
    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()

    # Sum the importances of this fold's model.
    ft_importances += lgb_model.feature_importances_

    counter += 1

Fold 1

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.772819
Early stopping, best iteration is:
[45]	valid_0's auc: 0.773137
Fold 2

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.767803
[100]	valid_0's auc: 0.769148
Early stopping, best iteration is:
[87]	valid_0's auc: 0.769967
Fold 3

Training until validation scores don't improve for 50 rounds.
[50]	valid_0's auc: 0.772458
Early stopping, best iteration is:
[45]	valid_0's auc: 0.772985


In [110]:
columnas = X_train.columns
imp = pd.DataFrame({'feature': columnas, 'importance': ft_importances/k})
df_imp_sort = imp.sort_values('importance', ascending=False)

df_imp_sort.head(30)

Unnamed: 0,feature,importance
2503,len_pr,148.333333
2504,len_word,120.666667
1021,1021,46.666667
1450,1450,42.666667
410,410,40.333333
1693,1693,40.0
1683,1683,38.0
2501,ex,37.333333
1108,1108,36.0
2003,2003,35.666667


In [None]:
df_imp_sort.tail(2)

In [117]:
# Keep the features whose importance in df_imp_sort is above 10.
ft_sel = [
    feature
    for feature, importance in zip(df_imp_sort['feature'], df_imp_sort['importance'])
    if importance > 10
]

ft_sel[:5]

['len_pr', 'len_word', 1021, 1450, 410]

In [45]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD
import tensorflow as tf
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [46]:
def auroc(y_true, y_pred):
    """Metric function that evaluates scikit-learn's ``roc_auc_score`` on (y_true, y_pred).

    The Python function is wrapped with ``tf.py_func`` and declared to return ``tf.double``.
    """
    # NOTE(review): the notebook's own output warns that tf.py_func is deprecated in TF 2
    # in favour of tf.py_function.
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

In [47]:
callbacks = [EarlyStopping(monitor='val_auroc',
                           min_delta=0.0,
                           patience=1,
                           verbose=0,
                           mode='max',
                           restore_best_weights=True)]

In [48]:
def create_baseline():
    """Build and compile the dense binary classifier.

    Layer widths are 64 -> 256 -> 1024 -> 512 (ReLU) followed by a single sigmoid unit.
    The input width is read from the module-level ``X_train``; the model is compiled with
    binary cross-entropy, Adam and the ``auroc`` metric.
    """
    init = {'kernel_initializer': 'normal'}

    network = Sequential()
    network.add(Dense(64, input_dim=X_train.shape[1], activation='relu', **init))
    for width in (256, 1024, 512):
        network.add(Dense(width, activation='relu', **init))
    network.add(Dense(1, activation='sigmoid', **init))

    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auroc])
    return network

In [49]:
estimator = KerasClassifier(build_fn=create_baseline, epochs=30, batch_size=1024, verbose=1, validation_split=0.33,
                           callbacks=callbacks)

In [50]:
model = create_baseline()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, use
    tf.py_function, which takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    


In [51]:
estimator.fit(X_train, y_train)

Instructions for updating:
Use tf.cast instead.
Train on 33500 samples, validate on 16500 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30


<keras.callbacks.History at 0x7f783120cf98>

In [None]:
vectorizer = HashingVectorizer(stop_words='english', strip_accents='unicode')
analyzer = vectorizer.build_analyzer()

def tokenize_corpus(td, mode='d'):
    """Lazily tokenize the texts in ``td`` with the module-level ``analyzer``.

    With ``mode='d'`` one token list is yielded per text; with any other value the
    tokens are yielded one by one as a single flat stream.
    """
    for text in td:
        doc_tokens = analyzer(text)
        if mode == 'd':
            yield doc_tokens
        else:
            yield from doc_tokens

In [None]:
%%time
tokens = tokenize_corpus(text_data_clean, mode='t')
dist = FreqDist(tokens)

In [None]:
mc_words = dict(dist.most_common(200))

In [None]:
%%time
# Binary presence matrix: one row per comment, one column per most-common token.
# The vocabulary holds lower-cased analyzer tokens, so presence is tested against the
# analyzer tokens of each comment. The previous `word in phrase` was a case-sensitive
# substring test on the raw text (e.g. "he" matched inside "the", "trump" missed "Trump").
X_train = list()
vocab = list(mc_words.keys())
for phrase in text_data:
    present = set(analyzer(phrase))
    X_train.append(np.asarray([1 if word in present else 0 for word in vocab]))
X_train = np.asarray(X_train)

In [None]:
X_train = pd.DataFrame(X_train, index=None)

In [None]:
y_train = np.where(train['target'] >= 0.5, 1, 0)

In [None]:
# del text_data
# del tokens
# del dist
# del mc_words
# gc.collect()

In [None]:
k = 3

train_ids = X_train.index
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
skf.get_n_splits(train_ids, y_train)

In [None]:
params={'min_data_in_leaf':20,
        'max_depth':-1,
        'metric':'auc',
        'n_estimators':1000,
        'learning_rate':0.1,
        'num_leaves':75,
        'colsample_bytree':1,
        'objective':'binary',
        'n_jobs':-1,
        'seed':42,
        'bagging_fraction':1,
        'lambda_l1':0,
        'lambda_l2':0}

In [None]:
# Same stratified k-fold LightGBM loop as earlier in the notebook, run on the current X_train.
lgb_model = lgb.LGBMClassifier(**params)

# One accumulator slot per column of X_train.
ft_importances = np.zeros(X_train.shape[1])

counter = 1
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}\n'.format(counter))

    # Positional row selection for this fold.
    X_fit, X_val = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_fit, y_val = y_train[train_index], y_train[test_index]

    # The validation split is the evaluation set used for early stopping.
    lgb_model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_val, y_val)],
                  verbose=10,
                  early_stopping_rounds=20)

    # Release the fold slices before the next iteration.
    del X_fit
    del X_val
    del y_fit
    del y_val
    del train_index
    del test_index
    gc.collect()

    # Sum the importances of this fold's model.
    ft_importances += lgb_model.feature_importances_

    counter += 1

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(
    strip_accents = 'unicode',
    lowercase = True,
    analyzer='word',
    stop_words = 'english',
    ngram_range = (1,2)
)

In [None]:
# TF-IDF features feeding a LightGBM classifier.
# The original call ended with a stray positional `e` after the keyword argument, which is
# a SyntaxError, so the cell could not run at all.
model = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('LGBM', lgb.LGBMClassifier(metric='auc'))
])

## Estadísticas de Texto

En este apartado vamos a analizar, con ayuda de NLTK, cómo es la distribución del texto a lo largo de todo el corpus. Estaremos interesados en características como los tokens más frecuentes, la longitud del corpus, la longitud del vocabulario, etcétera.

In [None]:
vectorizer = HashingVectorizer(stop_words='english', strip_accents='unicode')
analyzer = vectorizer.build_analyzer()

def tokenize_corpus(mode='d'):
    """Lazily tokenize the module-level ``text_data`` with the module-level ``analyzer``.

    With ``mode='d'`` one token list is yielded per text; with any other value the
    tokens are yielded one by one as a single flat stream.
    """
    for text in text_data:
        doc_tokens = analyzer(text)
        if mode == 'd':
            yield doc_tokens
        else:
            yield from doc_tokens

### Distribución de Frecuencias

La mayoría de las herramientas que trabajan con texto en Python (NLTK, gensim, scikit-learn...) necesitan manejar una estructura de datos que implementa una distribución de frecuencias. Esta da lugar a una representación conocida como **Bag of Words (BoW)**, en la que simplemente, por cada documento o a nivel global del corpus, se mantiene un contador con el número de apariciones de cada palabra o token.

In [None]:
%%time
tokens = tokenize_corpus(mode='t')
dist = FreqDist(tokens)

In [None]:
dist

In [None]:
mc_words = dict(dist.most_common(200))

In [None]:
text_data.shape[0]

In [None]:
%%time
# Binary presence rows: one row per comment, one entry per most-common token.
# The vocabulary holds lower-cased analyzer tokens, so presence is tested against the
# analyzer tokens of each comment. The previous `word in phrase` was a case-sensitive
# substring test on the raw text (e.g. "he" matched inside "the").
X = list()
vocab = list(mc_words.keys())
for phrase in text_data:
    present = set(analyzer(phrase))
    X.append([1 if word in present else 0 for word in vocab])

In [None]:
# The 100 most frequent tokens. The rows arrive in the order produced by most_common()
# (most frequent first); the previous bare `df.sort_values(...)` discarded its result and
# therefore had no effect, so it is dropped.
df = pd.DataFrame(dist.most_common(100), columns=['Token', 'Frecuencia'])
df.head(20)

In [None]:
# The 100 least frequent tokens (tail of the most_common() ordering). The previous bare
# `df.sort_values(...)` discarded its result and therefore had no effect, so it is dropped.
df = pd.DataFrame(dist.most_common()[-100:], columns=['Token', 'Frecuencia'])
df.head(20)

# Diseñando Nuestro Modelo

## Diccionario

Nuestro modelo de tópicos estará basado en una representación BoW del corpus. Únicamente tendremos en cuenta la frecuencia global de los términos y no una frecuencia de documentos tipo TF-IDF. Lo primero que necesitamos construir es un diccionario con nuestro vocabulario. Empezaremos con un vocabulario sin filtros, para comprobar qué resultados obtenemos y si nuestro estudio previo ha tenido sentido a la hora de ayudarnos con el filtrado posterior.

Empezamos a utilizar gensim para construir el diccionario. 

In [None]:
stream = tokenize_corpus()
%time dictionary = gensim.corpora.Dictionary(stream)
dictionary.save('../data/original.dict')

In [None]:
data = [[dictionary.num_docs, dictionary.num_pos, len(dictionary.token2id)]]
df = pd.DataFrame(data)
df.columns=['Numero de frases analizadas', 'Numero de tokens analizados', 'Numero de tokens únicos actuales']
df.head()

## Corpus

Necesitamos declarar un iterable para acceder en streaming a la representación BoW de cada uno de nuestros documentos (comentarios). Este iterable será utilizado de forma eficiente por gensim para entrenar el modelo de forma iterativa en un número determinado de pasadas.

In [None]:
class MovieCorpus(object):
    """Streaming bag-of-words corpus: yields one ``doc2bow`` vector per tokenized document."""

    def __init__(self, path, dictionary):
        # path: forwarded as the only positional argument to tokenize_corpus().
        # dictionary: object providing doc2bow() and supporting len().
        self.__path = path
        self.__dictionary = dictionary

    def __iter__(self):
        # NOTE(review): tokenize_corpus is defined several times in this notebook with
        # different signatures; confirm the active one accepts a single positional path.
        for tokens in tokenize_corpus(self.__path):
            yield self.__dictionary.doc2bow(tokens)

    def __len__(self):
        # NOTE(review): this is the size of the dictionary, not the number of documents
        # produced by __iter__ — confirm this is what callers of len(corpus) expect.
        return len(self.__dictionary)

In [None]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    Return the terms of the ``topn`` leading (term, weight) pairs of one topic.

    The pairs come from ``lda_model.show_topic(topic_number, topn=topn)``. When
    ``output`` is true, each pair is also printed as a 20-character term column
    followed by the weight with three decimals.
    """
    topic_terms = list(lda_model.show_topic(topic_number, topn=topn))

    if output:
        for word, weight in topic_terms:
            print(u'{:20} {:.3f}'.format(word, round(weight, 3)))

    return [word for word, _ in topic_terms]

def print_lda_model(lda_model, num_topics=20):
    """Print the top 10 terms of topics ``0 .. num_topics - 1`` of ``lda_model``.

    A column header is printed first; each topic then gets a title line followed by the
    term/weight rows produced by ``explore_topic``. Returns ``None``.
    """
    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
    for i in range(num_topics):
        print('\n')
        print('Topic '+str(i)+' |---------------------\n')
        explore_topic(lda_model, topic_number=i, topn=10, output=True)
        # The original had a bare `print`, a Python 2 statement that is a no-op expression
        # in Python 3; call it so the blank separator line is actually emitted.
        print()

In [None]:
tokens_list = [x for x in dist.keys()]

In [None]:
# gensim's Dictionary and doc2bow work on documents given as lists of tokens. The original
# passed the raw comment strings and called `.decode()` on a list (AttributeError), so
# tokenize the comments once with the notebook's analyzer and reuse the result.
tokenized_docs = [analyzer(text) for text in text_data]
common_dictionary = Dictionary(tokenized_docs)
common_corpus = [common_dictionary.doc2bow(doc_tokens) for doc_tokens in tokenized_docs]

In [None]:
# dictionary = gensim.corpora.Dictionary.load('../data/original.dict')
# corpus = MovieCorpus('../data/original.dict', text_data)
# gensim.corpora.MmCorpus.serialize('../data/corpus.mm', corpus)
# corpus = gensim.corpora.MmCorpus('../data/corpus.mm')
%time lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary)

In [None]:
print_lda_model(lda_model)

In [None]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

## Evaluando Modelo LDA

Necesitamos poder comparar los distintos modelos que vamos a ir generando para comprobar que van mejorando con las acciones que tomamos. Existen formas muy diversas de evaluar un modelo LDA: sirve cualquiera que sea válida para evaluar clusters procedentes de algoritmos de clustering.

Los clusters se suelen evaluar midiendo la coherencia de sus componentes. En nuestro caso concreto, cada tópico tendrá mayor calidad si:

* Los documentos dominados por los mismos tópicos han de ser similares entre sí
* Los documentos dominados por tópicos diferentes y poco solapados han de ser distintos entre sí

Afortunadamente gensim proporciona sus propias herramientas para medir la coherencia que usamos a continuación.

In [None]:
from gensim.models.coherencemodel import CoherenceModel
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

## Filtrando Tokens Frecuentes

In [None]:
mc100 = [mc[0] for mc in dist.most_common(100)]
terms_id = lda_model.get_topic_terms(2)
terms_str = [dictionary.id2token[id[0]] for id in terms_id if id[0] in dictionary.id2token]
list(set(mc100) & set(terms_str))

In [None]:
def dictionary_filter_most_frequent(dictionary, dist, n=200):
    """Remove the ``n`` most frequent tokens of ``dist`` from ``dictionary`` (in place).

    Parameters
    ----------
    dictionary : object exposing ``token2id``, ``filter_tokens(bad_ids=...)`` and ``compactify()``
    dist : object exposing ``most_common(n)`` returning (token, count) pairs
    n : int
        How many of the most frequent tokens to remove.

    Tokens that are no longer present in the dictionary are skipped, so the function
    can be called again (e.g. when its notebook cell is re-run) without raising KeyError.
    """
    token2id = dictionary.token2id
    mc_ids = [token2id[token] for token, _ in dist.most_common(n) if token in token2id]
    dictionary.filter_tokens(bad_ids=mc_ids)
    dictionary.compactify()

In [None]:
print('Longitud del vocabulario actual: {}'.format(len(dictionary.token2id)))
dictionary_filter_most_frequent(dictionary, dist)
# Filter out words that occur less than 10 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)
print('Longitud del vocabulario Filtrado: {}'.format(len(dictionary.token2id)))

## Normalizando Tokens

Además del lowercase, vamos a eliminar también los plurales

In [None]:
def normalize_dictionary(dictionary):
    """Drop plural tokens whose singular form is also in ``dictionary`` (in place).

    For every token ending in 's', TextBlob's ``Word.singularize`` proposes a singular;
    when it differs from the token and is itself a dictionary token, the plural is
    removed. Returns the list of removed token ids.
    """
    from textblob import Word
    token2id = dictionary.token2id
    plurals = []
    for token in dictionary.values():
        if not token.endswith('s'):
            continue
        singular = Word(token).singularize()
        # Membership test instead of a truthiness test on the id: 0 is a valid token id
        # and was previously treated as "singular not found".
        if token != singular and singular in token2id:
            plurals.append(token2id[token])

    dictionary.filter_tokens(bad_ids=plurals)
    dictionary.compactify()
    return plurals

In [None]:
print('Numero de tokens únicos actuales: {}'.format(len(dictionary.token2id)))
plurals = normalize_dictionary(dictionary)
print('Numero de plurales detectados: {}'.format(len(plurals)))
print('Numero de tokens únicos actuales: {}'.format(len(dictionary.token2id)))

In [None]:
dictionary.save('normalized.dict')

In [None]:
dictionary = gensim.corpora.Dictionary.load('normalized.dict')
corpus = MovieCorpus("./resources/aclImdb/all", dictionary)
gensim.corpora.MmCorpus.serialize('corpus1.mm', corpus)
corpus = gensim.corpora.MmCorpus('corpus1.mm')
%time lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

## Limitando el Vocabulario por Frecuencia

In [None]:
def dictionary_keep_n_frequent(dictionary, dist, n=5000):
    """Keep in ``dictionary`` only the ``n`` most frequent tokens of ``dist`` (in place).

    Tokens are visited from most to least frequent; those absent from the dictionary
    are skipped, and the walk stops once ``n`` ids have been collected. The collected
    ids are passed to ``dictionary.filter_tokens(good_ids=...)``.
    """
    token2id = dictionary.token2id
    mf = []
    for token, _ in dist.most_common():
        token_id = token2id.get(token)
        # Compare against None explicitly: 0 is a valid token id and the previous
        # truthiness test silently dropped it.
        if token_id is None:
            continue
        mf.append(token_id)
        if len(mf) == n:
            break
    dictionary.filter_tokens(good_ids=mf)

#### 10 Topics

In [None]:
dictionary = gensim.corpora.Dictionary.load('normalized.dict')
dictionary_keep_n_frequent(dictionary, dist)
corpus = MovieCorpus("./resources/aclImdb/all", dictionary)
gensim.corpora.MmCorpus.serialize('corpus3.mm', corpus)
corpus = gensim.corpora.MmCorpus('corpus3.mm')
%time lda_model= gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary)
print_lda_model(lda_model, 10)

#### 20 Topics

In [None]:
%time lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary)
print_lda_model(lda_model, 20)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

#### 50 topics

In [None]:
%time lda_model= gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word=dictionary)
print_lda_model(lda_model, 50)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

## Limite de Reviews por Película

Un primer filtro que podemos aplicar para evitar sesgos (bias) es un límite sobre el número de reviews para una misma película. Usaremos un parámetro configurable con un valor inicial de 10 después de estudiar la primera gráfica.

In [None]:
# Build, for every directory containing a 'urls.urls' file, a map from line index to the
# movie id extracted from that line.
# NOTE(review): `walk_corpus` and `id_pattern` are not defined anywhere in the visible part
# of this notebook — confirm where they come from.
ids_by_path = {}
for urls_file in walk_corpus('./resources/aclImdb/all/', 'urls.urls'):
    dirname = os.path.dirname(urls_file)
    with open(urls_file) as f:
            ids_map = {}
            lines = f.readlines()
            for index, line in enumerate(lines):
                # First capture group of id_pattern is taken as the movie id.
                movie_id = id_pattern.search(line).group(1)
                ids_map[index] = movie_id
            ids_by_path[dirname] = ids_map
            
# Captures the leading number of names shaped like "<number>_<number>".
line_id_pattern = re.compile('([0-9]+)_[0-9]+')

In [None]:
def tokenize_corpus(path, pattern, min_df=1, mode='d', limit=10):
    """Tokenize the files found by ``walk_corpus(path, pattern)``, capping reviews per movie.

    Parameters
    ----------
    path, pattern
        Forwarded to ``walk_corpus`` to enumerate the corpus files.
    min_df : int
        Accepted for interface compatibility; not used by this function.
    mode : str
        ``'d'`` yields one token list per file; any other value yields single tokens.
    limit : int
        Maximum number of files tokenized for the same movie id.

    The movie id of a file is looked up in the module-level ``ids_by_path`` using the
    file's directory and the number captured by ``line_id_pattern``.
    """
    movie_counter = Counter()

    for corpus_file in walk_corpus(path, pattern):
        dirname = os.path.dirname(corpus_file)
        line_id = int(line_id_pattern.search(corpus_file).group(1))
        movie_id = ids_by_path[dirname][line_id]
        # The counter starts at 0, so the previous `<= limit` test let limit + 1 files
        # through for each movie; stop as soon as `limit` have been emitted.
        if movie_counter[movie_id] >= limit:
            continue
        movie_counter[movie_id] += 1
        with open(corpus_file, 'r') as next_file:
            tokens = analyzer(next_file.read())
        if mode == 'd':
            yield tokens
        else:
            yield from tokens

In [None]:
%time dist_limited = FreqDist(tokenize_corpus('./resources/aclImdb/all/', '*.txt', mode='t'))
print(dist_limited)
pp.pprint(dist_limited.most_common(100))

In [None]:
%time dictionary = gensim.corpora.Dictionary(tokenize_corpus('./resources/aclImdb/all/', '*.txt'))
data = [[dictionary.num_docs, dictionary.num_pos, len(dictionary.token2id)]]
df = pd.DataFrame(data)
df.columns=['Numero de reviews analizadas', 'Numero de tokens analizados', 'Numero de tokens únicos actuales']
df.head()

In [None]:
dictionary.save('limited.dict')

### Aplicando normalización y filtrado de vocabulario

In [None]:
dictionary_filter_most_frequent(dictionary, dist_limited)
dictionary.filter_extremes(no_below=10, no_above=0.5)
plurals = normalize_dictionary(dictionary)
dictionary.save('limited.normalized.dict')
corpus = MovieCorpus("./resources/aclImdb/all", dictionary)
gensim.corpora.MmCorpus.serialize('corpus4.mm', corpus)
corpus = gensim.corpora.MmCorpus('corpus4.mm')
%time lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary)
print_lda_model(lda_model)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

In [None]:
dictionary_keep_n_frequent(dictionary, dist_limited)
corpus = MovieCorpus("./resources/aclImdb/all", dictionary)
gensim.corpora.MmCorpus.serialize('corpus5.mm', corpus)
corpus = gensim.corpora.MmCorpus('corpus5.mm')
%time lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary)
lda_model.save('limited.normalized.filtered.model')
print_lda_model(lda_model)

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

## Filtrando Vocabulario Polarizado

En todos los modelos anteriores hemos visto que existen varios tipos de palabras que aparecen de forma recurrente en varios tópicos, pero que aportan escaso valor a la hora de categorizar por temáticas. Algunos ejemplos de estos tipos de palabras son los nombres propios y los verbos, que podemos filtrar.

In [None]:
def dictionary_filter_neutral(dictionary, polarity=0.5):
    """Keep in ``dictionary`` only the tokens judged neutral by TextBlob (in place).

    A token is kept when the absolute polarity of its capitalized form is at most
    ``polarity`` and its first part-of-speech tag is neither 'NNP' nor a tag starting
    with 'VB'. Returns the list of kept token ids.
    """
    from textblob import TextBlob
    neutrals = []
    for token in dictionary.values():
        # Capitalize the first letter. Previously `upper` was only assigned for tokens
        # longer than one character, so a short token either raised UnboundLocalError
        # (first iteration) or was silently analysed as the previous token.
        upper = token[0].upper() + token[1:] if len(token) > 1 else token
        blob = TextBlob(upper)
        tags = blob.pos_tags
        if not tags:
            # No tag available to apply the proper-noun/verb rule; treat as not neutral
            # instead of failing on tags[0].
            continue
        first_tag = tags[0][1]
        if abs(blob.polarity) <= polarity and first_tag != 'NNP' and not first_tag.startswith('VB'):
            neutrals.append(dictionary.token2id[token])

    dictionary.filter_tokens(good_ids=neutrals)
    dictionary.compactify()
    return neutrals

In [None]:
dictionary = gensim.corpora.Dictionary.load('normalized.dict')
print('Número de palabras iniciales: {}'.format(len(dictionary)))
neutrals = dictionary_filter_neutral(dictionary, 0.0)
print("Número de palabras neutrales: {}".format(len(neutrals)))

In [None]:
dictionary_keep_n_frequent(dictionary, dist)
dictionary.save('final.dict')
corpus = MovieCorpus("./resources/aclImdb/all", dictionary)
gensim.corpora.MmCorpus.serialize('corpus6.mm', corpus)
corpus = gensim.corpora.MmCorpus('corpus6.mm')
%time lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word=dictionary)
print_lda_model(lda_model)

In [None]:
lda_model.save('neutral.model')

In [None]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

In [None]:
cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
cm.get_coherence_per_topic()

## Probando otros Modelos de Representación: TF-IDF

In [None]:
dictionary = gensim.corpora.Dictionary.load('normalized.dict')
dictionary_keep_n_frequent(dictionary, dist)
corpus = MovieCorpus("./resources/aclImdb/all", dictionary)
tfidf = gensim.models.TfidfModel(corpus)
gensim.corpora.MmCorpus.serialize('corpus7.mm', corpus)
corpus = gensim.corpora.MmCorpus('corpus7.mm')
%time lda_model_tfidf = gensim.models.ldamodel.LdaModel(tfidf[corpus], num_topics=20, id2word=dictionary)
pp.pprint(lda_model_tfidf.print_topics(20))

# Usando Nuestro Modelo como Profiler

In [None]:
import requests
import json
# The OMDb API key is read from the OMDB_API_KEY environment variable instead of being
# committed inside the notebook; a missing variable fails fast with KeyError.
r = requests.get(
    "http://www.omdbapi.com/",
    params={"i": "tt0379889", "apikey": os.environ["OMDB_API_KEY"]},
    timeout=10,  # do not hang the notebook if the service is unreachable
)
# `pp` is never defined in this notebook; use the imported pprint module directly.
pprint.pprint(r.json())

## Analizando una Review Positiva

Para analizar las reviews, primero vamos a tokenizar el texto y lo vamos a convertir en una representación Bag of Words con proyección a nuestro diccionario. Esta representación es la que podemos pasar a nuestro modelo LDA para que nos devuelva la distribución de tópicos más probable sobre nuestro texto inicial

In [None]:
dictionary = gensim.corpora.Dictionary.load('final.dict')
good_review_text = """I just saw this at the Toronto International Film Festival in the beautiful Elgin Theatre. 
I was blown away by the beautiful cinematography, the brilliant adaptation of a very tricky play and last 
but not least, the bravura performance of Al Pacino, who was born to play this role, 
which was perfectly balanced by an equally strong performance from Jeremy Irons.<br /><br />
The film deftly explores the themes of love vs loyalty, law vs justice, and passion vs reason. 
Some might protest that the content is inherently anti-semitic, 
however they should consider the historical context of the story, 
and the delicate and nuanced way in which it is told in this adaptation"""
good_review_tokens = analyzer(good_review_text)
lda_model.get_document_topics(dictionary.doc2bow(good_review_tokens))

Comprobamos cuáles son los 10 tokens más prominentes del tópico asignado con más probabilidad, el **tópico 11**

In [None]:
def get_topic_tokens(model, topic_id, n_tokens=10):
    """Return the first element of each pair given by ``model.show_topic(topic_id, n_tokens)``."""
    return list(map(lambda pair: pair[0], model.show_topic(topic_id, n_tokens)))
tokens = get_topic_tokens(lda_model, 6, 20)
tokens

In [None]:
shared_tokens = list(set(good_review_tokens) & set(tokens))

### Análisis de Sentimiento sobre los Keywords de los Tópicos

In [None]:
pd.options.display.max_colwidth = -1
from IPython.display import display, HTML

def explore_opinions(text, keywords):
    """Collect the sentiment of every sentence of ``text`` that mentions a keyword.

    The text is split into sentences with TextBlob. For each sentence and each
    keyword found among that sentence's words, one row is emitted.

    Args:
        text: Raw text to analyse.
        keywords: Iterable of tokens to look for in each sentence.

    Returns:
        pandas.DataFrame: Columns ``Token``, ``Sentence``, ``Sentiment Polarity``
        and ``Sentiment Subjectivity`` (elements 0 and 1 of the sentence's
        ``sentiment``). The frame is empty, but keeps the four columns, when
        no sentence contains any keyword.
    """
    from textblob import TextBlob

    columns = ['Token', 'Sentence', 'Sentiment Polarity', 'Sentiment Subjectivity']
    rows = []
    for sentence in TextBlob(text).sentences:
        sentence_words = sentence.words  # looked up once per sentence, not once per keyword
        for token in keywords:
            if token in sentence_words:
                sentiment = sentence.sentiment
                rows.append([token, str(sentence), sentiment[0], sentiment[1]])

    # Passing the column names to the constructor (rather than assigning `df.columns`
    # afterwards) keeps this working when there are no matches: an empty frame has zero
    # columns, and assigning four names to it raises a length-mismatch ValueError.
    return pd.DataFrame(rows, columns=columns)
        
# Render the opinions table for the positive review as HTML: literal "\n" sequences in the
# generated markup become <br> line breaks and every occurrence of 'adaptation' is bolded.
good_opinions_html = explore_opinions(good_review_text, shared_tokens).to_html()
good_opinions_html = good_opinions_html.replace("\\n","<br>")
good_opinions_html = good_opinions_html.replace('adaptation', '<strong>adaptation</strong>')
display(HTML(good_opinions_html))

## Analizando una Review Negativa

In [None]:
# Negative review to profile with the LDA model.
bad_review_text = """I have to admit that although I'm a fan of Shakespeare, 
I was never really familiar with this play. And what I really can't say is whether this is a poor adaptation, 
or whether the play is just a bad choice for film. 
There are some nice pieces of business in it, but the execution is very clunky and the plot is obvious. 
The theme of the play is on the nature of debt, using the financial idea of debt and justice as a 
metaphor for emotional questions. That becomes clear when the issue of the rings becomes more important than 
the business with Shylock, which unfortunately descends into garden variety anti-Semitisim despite 
the Bard's best attempts to salvage him with a couple nice monologues.<br /><br />
Outside of Jeremy Irons' dignified turn, I didn't think there was a decent performance in the bunch. 
Pacino's Yiddish consists of a slight whine added to the end of every pronouncement, and 
some of the better Shylock scenes are reduced to variations on the standard "Pacino gets angry" 
scene that his fans know and love. But Lynn Collins is outright embarrassing, to the point where I 
would have thought they would have screen-tested her right out of the picture early on. 
When she goes incognito as a man, it's hard not to laugh at all the things we're not supposed to laugh at. 
With Joseph Fiennes standing there trying to look sincere and complicated, it's hard not to make 
devastating comparisons to Gwyneth Paltrow's performance in "Shakespeare in Love." 
The big problem however that over-rides everything in this film is just a lack of emotional focus. 
It's really hard to tell whether this film is trying to be a somewhat serious comedy or a strangely silly drama. 
Surely a good summer stock performance would wring more laughs from the material than this somber production. 
The actors seem embarrassed to be attempting humor, and unsure of where to place dramatic and comedic emphasis. 
All of this is basically the fault of the director, Michael Radford, who seems to think that the material 
is a great deal heavier than it appears to me."""
# Tokenise the review with `analyzer` (defined earlier in the notebook).
bad_review_tokens = analyzer(bad_review_text)
# A notebook cell only renders its last expression, so the topic distribution is displayed
# explicitly; as a bare intermediate statement its result would be computed and thrown away.
display(lda_model.get_document_topics(dictionary.doc2bow(bad_review_tokens)))
# Cell output: tokens of the review that are also among the topic keywords in `tokens`.
list(set(bad_review_tokens) & set(tokens))

In [None]:
# Use the negative review's own overlap with the topic keywords. `shared_tokens` holds the
# overlap computed for the positive review, so passing it here would only surface keywords
# that happen to occur in both reviews. Sorted so the row order is stable between sessions.
bad_shared_tokens = sorted(set(bad_review_tokens) & set(tokens))

# Render the opinions table as HTML: literal "\n" sequences in the generated markup become
# <br> line breaks and every occurrence of 'adaptation' is bolded.
bad_opinions_html = explore_opinions(bad_review_text, bad_shared_tokens).to_html()
display(HTML(bad_opinions_html.replace("\\n","<br>").replace('adaptation', '<strong>adaptation</strong>')))

## Analizando Reviews Nuevas fuera del Corpus

In [None]:
# First Breaking Bad review to profile with the LDA model.
bb_text = """Drug wars, meth, the lot. I thought no thank you. 
I kept hearing how good it was and I kept saying: "No thank you" 
Last January I got sick, one of those illnesses you can't quite figure out. 
Maybe it was pre and post election depression, I don't know. But I stayed in bed for almost 
10 days and then it happened. I saw the first episode and I was immediately and I mean immediately, 
hooked. I saw the entire series in 9 days. Voraciously. Now I had time to reflect. Why I wonder. 
When I think about it the first thing that comes to mind is not a thing it's Bryan Cranston. 
I know the concept was superb as was the writing but Bryan Cranston made it all real. 
His performance, the creation of Walter White will be studied in the Acting classes of the future. 
He is the one that pulls you forward - as well as backwards and sideways - then I realized that his 
creation acquired the power that it acquired, in great part thanks to the extraordinary cast of supporting players. 
I could write a page for each one of them but I'm just going to mention Aaron Paul. 
I ended up loving him. I developed a visceral need to see him find a way out. Well, what can I tell you. 
I know that one day, maybe when my kids are old enough, I shall see "Breaking Bad" again. I can't wait."""

# Tokenise with `analyzer` (defined earlier in the notebook), build the bag of words with
# `dictionary` and show the review's topic distribution as the cell output.
bb_review_tokens = analyzer(bb_text)
bb_review_bow = dictionary.doc2bow(bb_review_tokens)
lda_model.get_document_topics(bb_review_bow)

In [None]:
# Inspect the terms of topic 16; the value is shown as the cell output.
topic_16_terms = lda_model.show_topic(16)
topic_16_terms

In [None]:
# Second Breaking Bad review to profile with the LDA model.
bb_text_2 = """What do you get when you have a chemistry teacher in a mid life crisis, dying of cancer, 
and washing cars as a second job to make ends meet for his middle class family? One of the greatest television 
dramas of all time with crazy plot twists, brilliant performances, and unforgettable characters and cinematography.
There is so much to like about the masterpiece that is Breaking Bad. Take your pick: the acting, 
the writing, the story lines, the plot, the suspense the cliff hangers, the action scenes, the camera work, 
the characters, the character arcs, the realism, the satirical style, any season, the end, the casting, the 
dark humor and humor relief, the scenery, the contrast between background and foreground to establish artistic 
effect (the sun shiny clear blue skies of the NM desert behind the gruesome organized crime and violence of the 
underworld), the mixing of favorite genres (crime caper, dark comedy, western, noir, horror, suspense, action, 
drama, thriller, Shakespearean tragedy, dystopia, psychological character study..), the lines/quotes...
the list goes on.
What's amazing about Breaking Bad is it begins so humble and quiet, and as it continues to let its' story unfold,
it explodes. It gets better and better each season until the end in the final season, we don't know if we're watching a
television show or an Academy Award winning motion picture. The show dares to go where no one would have thought 
it would go- into a transcendent realm of classic cinema- and it pulls it off beautifully."""

# Tokenise with `analyzer` (defined earlier in the notebook). Note that this rebinds
# `bb_review_tokens`, which previously held the tokens of the first Breaking Bad review.
bb_review_tokens = analyzer(bb_text_2)
# Build the bag of words with the model's own dictionary (`id2word`) and show the review's
# topic distribution as the cell output.
bb_review_bow = lda_model.id2word.doc2bow(bb_review_tokens)
lda_model.get_document_topics(bb_review_bow)

In [None]:
# One id drives both the printed overlap and the displayed terms so they describe the same
# topic (the overlap used to be computed for topic 2 while the terms of topic 10 were shown).
# NOTE(review): confirm that 2 is the topic meant to be inspected for this review.
topic_id = 2
# Tokens of the review that are among the topic's top terms.
print(list(set(bb_review_tokens) & set(get_topic_tokens(lda_model, topic_id))))
# The topic's terms; shown as the cell output.
lda_model.show_topic(topic_id)

In [None]:
# One id drives both the printed overlap and the displayed terms so they describe the same
# topic (the overlap used to be computed for topic 4 while the terms of topic 9 were shown).
# NOTE(review): confirm that 4 is the topic meant to be inspected for this review.
topic_id = 4
# Tokens of the review that are among the topic's top terms.
print(list(set(bb_review_tokens) & set(get_topic_tokens(lda_model, topic_id))))
# The topic's terms; shown as the cell output.
lda_model.show_topic(topic_id)

In [None]:
# One id drives both the printed overlap and the displayed terms so they describe the same
# topic (the overlap used to be computed for topic 18 while the terms of topic 1 were shown).
# NOTE(review): confirm that 18 is the topic meant to be inspected for this review.
topic_id = 18
# Tokens of the review that are among the topic's top terms.
print(list(set(bb_review_tokens) & set(get_topic_tokens(lda_model, topic_id))))
# The topic's terms; shown as the cell output.
lda_model.show_topic(topic_id)