In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import time
import csv
import os
import string
import textblob
import io
import nltk

import sklearn.cluster as cluster
import hdbscan # https://hdbscan.readthedocs.io/en/latest/parameter_selection.html

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC

from sklearn.decomposition import PCA
from sklearn.decomposition import RandomizedPCA

from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_curve, auc
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, silhouette_score
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity

from sklearn import preprocessing
from sklearn import decomposition, ensemble

# from tensorflow.python.framework import ops
# ops.reset_default_graph()

%matplotlib inline
# sns.set_context('poster')
# sns.set_color_codes()


### Preprocessing

In [None]:
from text_processing import *

In [None]:
# cluster.birch

### Data
- inspect data (ex: df.head(), df.groupby('labels').count())
- select data source

In [None]:
# `pickle` is not imported in any visible cell of this notebook (it presumably
# leaked in through `from text_processing import *`), so import it explicitly.
import pickle

# Load the stored collection of datasets; it is indexed by dataset name further down.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
# NOTE(review): `full_path_data_store` is not defined in the visible cells; it is
# concatenated as-is, so it has to end with a path separator -- confirm.
with open(full_path_data_store + 'df_datasets.pickle', 'rb') as handle:
    df_datasets = pickle.load(handle)

In [None]:
# split dataframe into text and labels
# df = df_merged
def pre_split_data(df):
    """Pull the three columns used by the rest of the notebook out of ``df``.

    Parameters
    ----------
    df : object exposing ``texts``, ``labels`` and ``processed_text`` attributes
        (e.g. a pandas DataFrame with columns of those names).

    Returns
    -------
    tuple
        ``(df.texts, df.labels, df.processed_text)`` in that order.
    """
    return df.texts, df.labels, df.processed_text


# Select one dataset from `df_datasets` and unpack its raw texts, labels and
# preprocessed texts into notebook-level variables.
# NOTE(review): `df_name_` is not assigned in any visible cell -- presumably it
# comes from `text_processing` or must be set by hand; confirm.
texts,labels,processed_texts=pre_split_data(df_datasets[df_name_])

In [None]:
# Split raw text, preprocessed text and labels in ONE call so that the raw and
# the processed partitions contain exactly the same documents (two independent,
# unseeded calls shuffle differently). The seed makes the split reproducible.
(train_x, valid_x,
 train_xp, valid_xp,
 train_y_text_label, valid_y_text_label) = train_test_split(
    texts, processed_texts, labels, random_state=42)

# The raw and processed views share their rows, hence also their label partitions.
train_yp_text_label, valid_yp_text_label = train_y_text_label, valid_y_text_label

# Label-encode the target. The encoder is fitted ONCE on the full label set and
# then only applied: re-fitting it on every partition (as before) can map the
# same class to different integers whenever a partition lacks one of the classes.
encoder = preprocessing.LabelEncoder()
encoder.fit(labels)
train_y = encoder.transform(train_y_text_label)
valid_y = encoder.transform(valid_y_text_label)
train_yp = encoder.transform(train_yp_text_label)
valid_yp = encoder.transform(valid_yp_text_label)

### Binarize data labels
- for binary classification problem

In [None]:
# run if binary experiment
# input() returns the typed text as a str, so `binary` holds the string the
# user entered (e.g. '1'), never the integer 1.

binary=input('enter 1 to binarize ')

def binarize(train_labels=None, valid_labels=None,
             train_labels_p=None, valid_labels_p=None, positive_class=1):
    """Collapse encoded labels into a one-vs-rest 0/1 target.

    The previous version assigned to ``train_y`` (etc.) inside the function
    while also reading them, which makes those names local and raises
    ``UnboundLocalError`` on the first line. The label arrays are now taken
    as optional arguments; any argument left as ``None`` falls back to the
    notebook-level variable it replaces (``train_y``, ``valid_y``,
    ``train_yp``, ``valid_yp``), so ``binarize()`` keeps working as a
    zero-argument call.

    Parameters
    ----------
    train_labels, valid_labels, train_labels_p, valid_labels_p : array-like, optional
        Encoded labels for the raw/processed train/validation partitions.
    positive_class : int, default 1
        Encoded label mapped to 1; every other label maps to 0.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_y, valid_y, train_yp, valid_yp)`` as integer 0/1 arrays.

    Raises
    ------
    KeyError
        If an argument is omitted and the matching notebook variable does not exist.
    """
    def _one_vs_rest(values, fallback_name):
        # Resolve the notebook-level default lazily, only when it is needed.
        if values is None:
            values = globals()[fallback_name]
        return (np.asarray(values) == positive_class).astype('int')

    return (_one_vs_rest(train_labels, 'train_y'),
            _one_vs_rest(valid_labels, 'valid_y'),
            _one_vs_rest(train_labels_p, 'train_yp'),
            _one_vs_rest(valid_labels_p, 'valid_yp'))

# `binary` comes from input(), which returns a string: comparing it with the
# integer 1 is always False, so the labels were never binarized. Compare the
# stripped text with '1' instead (str() keeps this working if `binary` is
# ever set to an int by hand).
if str(binary).strip() == '1':
    train_y,valid_y,train_yp,valid_yp=binarize()

## Feature Engineering

- https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

### Hashing Vectors

In [None]:
# Hash lower-cased English word unigrams into a fixed 20-column feature space,
# L2-normalising every row.
vectorizer = HashingVectorizer(
    analyzer="word",
    ngram_range=(1,1),
    n_features=20,
    lowercase=True,
    stop_words='english',
    tokenizer=None,
    norm='l2',
)
xtrain_vector = vectorizer.fit_transform(train_x)
# To inspect the result: xtrain_vector.shape / xtrain_vector.toarray()

### Count Vectors

In [None]:
# Bag-of-words counts: a token is any run of one or more word characters.
# The vocabulary is learned from the complete `texts` collection.
count_vect = CountVectorizer(token_pattern=r'\w{1,}', analyzer='word')
count_vect.fit(texts)

# Apply the fitted vocabulary to the training and validation partitions.
xtrain_count, xvalid_count = [count_vect.transform(part) for part in (train_x, valid_x)]

### TF-IDF Vector Space

#### Tfidf vectorizer:
- Strips out “stop words”
- Filters out terms that occur in more than half of the docs (max_df=0.5)
- Filters out terms that occur in only one document (min_df=2).
- Selects the 10,000 most frequently occurring words in the corpus.
- Normalizes the vector (L2 norm of 1.0) to normalize the effect of document length on the tf-idf values. 

In [None]:
# Word-level tf-idf: vocabulary and idf weights are learned from all of `texts`,
# then applied to the training and validation partitions.
tfidf_vect = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    min_df=2,
    max_df=0.5,
    max_features=10000,
    use_idf=True,
)
tfidf_vect.fit(texts)
xtrain_tfidf, xvalid_tfidf = [tfidf_vect.transform(part) for part in (train_x, valid_x)]

# Optional extra argument that is currently not used: token_pattern=r'\w{1,}'

In [None]:
# N-gram-level tf-idf: same settings as the word-level vectorizer, but the
# features are word bigrams and trigrams (ngram_range=(2,3)).
tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word',
    ngram_range=(2,3),
    stop_words='english',
    min_df=2,
    max_df=0.5,
    max_features=10000,
    use_idf=True,
)
tfidf_vect_ngram.fit(texts)
xtrain_tfidf_ngram, xvalid_tfidf_ngram = [
    tfidf_vect_ngram.transform(part) for part in (train_x, valid_x)
]

In [None]:
# Character-level tf-idf over character 2- and 3-grams.
# NOTE(review): scikit-learn documents `stop_words` as applying only to
# analyzer='word'; it is kept here unchanged for parity with the cells above.
tfidf_vect_ngram_chars = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2,3),
    stop_words='english',
    min_df=2,
    max_df=0.5,
    max_features=10000,
    use_idf=True,
)
tfidf_vect_ngram_chars.fit(texts)
xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars = [
    tfidf_vect_ngram_chars.transform(part) for part in (train_x, valid_x)
]

### Topic Models
- https://radimrehurek.com/gensim/models/ldamulticore.html
- from gensim.test.utils import common_corpus, common_dictionary
- lda = LdaMulticore(common_corpus, id2word=common_dictionary, num_topics=10)

#### LDA version 1

In [None]:
from sklearn import decomposition, ensemble
# Train a 20-topic LDA model on the count matrix of the training partition.
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_
# `get_feature_names()` no longer exists in recent scikit-learn releases, where
# `get_feature_names_out()` replaces it; use whichever the installed version has.
_feature_names = getattr(count_vect, 'get_feature_names_out', None) or count_vect.get_feature_names
vocab = _feature_names()

# Summarise every topic by its `n_top_words` highest-weighted vocabulary terms,
# strongest first, joined into one space-separated string.
n_top_words = 10
topic_summaries = []
for topic_dist in topic_word:
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

#### LDA version 2

In [None]:
# LDA (gensim version) -- background reading:
# https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

from gensim import corpora, models, similarities 

# Remove proper names from every document.
# NOTE(review): `strip_proppers_POS` and `stopwords` are not defined in the visible
# cells -- presumably provided by `from text_processing import *`; confirm.
preprocess = [strip_proppers_POS(doc) for doc in texts]
# %time tokenized_text = [tokenizer(text) for text in preprocess]

# Drop stop words from every preprocessed document.
# CAUTION: this rebinds `texts` (the raw documents used by the vectorizer cells
# above) to a list of word lists, so re-running earlier cells after this one
# operates on different data.
# NOTE(review): the inner loop iterates over each `text` directly, so each item of
# `preprocess` is assumed to already be a sequence of tokens -- confirm.
%time texts = [[word for word in text if word not in stopwords] for text in preprocess]

In [None]:
# Build the gensim vocabulary from the tokenised documents and drop tokens that
# appear in more than 80% of them (no_below=1 keeps every rare token).
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.8)
# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(text) for text in texts]
# Train the 20-topic model. (A stray leading `c ` on this line used to make the
# whole cell a SyntaxError.)
# NOTE: a later cell runs `import lda`, which rebinds the name `lda` to a module.
lda = models.LdaModel(corpus, num_topics=20, id2word=dictionary, update_every=5, chunksize=10000, passes=100)
lda.print_topics(20, num_words=20)
# Keep the unformatted (topic id, top words) listing for later inspection.
topics_matrix = lda.show_topics(formatted=False, num_words=20)
# Displayed as the cell result: shape of the topic-term matrix.
lda.get_topics().shape

### Clustering
- multi-class classification problem approached without labels

In [None]:
def cluster_score(cluster_labels,labels=train_y, data=None):
    """Print clustering quality metrics for a cluster assignment.

    Parameters
    ----------
    cluster_labels : array-like
        Cluster id predicted for every sample.
    labels : array-like, default ``train_y``
        Reference class labels. The default is bound when this cell is run.
    data : array-like, optional
        Feature matrix the clustering was computed on; needed only for the
        silhouette coefficient. The function used to read a notebook-level
        variable ``data`` that no visible cell defines (NameError). When the
        argument is omitted, that variable is still used if it exists;
        otherwise the silhouette line reports that it was skipped.

    Returns
    -------
    None
        The metrics are printed.
    """
    print("Homogeneity: {:.2f}".format(homogeneity_score(labels, cluster_labels)))
    print("Completeness: %0.3f" % completeness_score(labels, cluster_labels))
    print("V-measure: %0.3f" % v_measure_score(labels, cluster_labels))
    print("Adjusted Rand-Index: %.3f"
      % adjusted_rand_score(labels, cluster_labels))
    if data is None:
        # Backward-compatible fallback to the notebook-level `data`, if any.
        data = globals().get('data')
    if data is None:
        print("Silhouette Coefficient: n/a (no feature matrix supplied)")
    else:
        print("Silhouette Coefficient: %0.3f"
          % silhouette_score(data, cluster_labels, sample_size=1000))
    print()

In [None]:
# Compute the pairwise cosine similarity between the tf-idf rows of the training
# documents and turn it into a dissimilarity (1 - similarity). `dist` is the
# input of the dimensionality-reduction cells below.
%time dist = 1 - cosine_similarity(xtrain_tfidf)

#### Dimensionality reduction

- Step 1

In [None]:
# Unlike PCA, the TruncatedSVD estimator does not center the data before computing
# the singular value decomposition, which lets it work efficiently on scipy.sparse input.
# Here it reduces `dist` to 20 components; random_state is fixed for repeatable runs.
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=20, n_iter=7, random_state=42)
%time pos_svd = svd.fit_transform(dist)  

In [None]:
from sklearn.decomposition import PCA

# `RandomizedPCA` was deprecated and later removed from scikit-learn; the
# documented replacement is PCA with the randomized SVD solver.
pca = PCA(n_components=20, svd_solver='randomized')
pos_pca = pca.fit_transform(dist)
# x_pca, y_pca = pos_pca[:,0],pos_pca[:,1]

- Step 2

In [None]:
from sklearn.manifold import TSNE
# Embed the 20-component SVD projection into 2 dimensions for plotting.
# NOTE(review): no random_state is passed here, unlike the TruncatedSVD and MDS
# cells -- confirm whether a repeatable embedding is wanted.
%time X_embedded = TSNE(n_components=2).fit_transform(pos_svd)

In [None]:
# MDS is memory-hungry and slow once the distance matrix grows beyond roughly 700x700.
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # one row per sample, one column per component: (n_samples, n_components)

# x_mds, y_mds = pos[:, 0], pos[:, 1]
# Two components, because the points are plotted in a two-dimensional plane.
# dissimilarity="precomputed", because `dist` already is a distance matrix.
# `random_state` is fixed so that the plot is reproducible.

In [None]:
# Only the last expression of a cell is displayed, so the first shape used to be
# silently discarded; show both as a tuple instead.
X_embedded.shape, train_y.shape

In [None]:
from sklearn.preprocessing import Normalizer

# Rescale the SVD projection with a default-configured Normalizer; `A` is the
# normalised copy.
row_normalizer = Normalizer()
A = row_normalizer.fit_transform(pos_svd)

#### Visualize

In [1]:
# https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html
# data =  A
import sklearn.cluster as cluster


# Shared scatter styling for every clustering plot.
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

def plot_clusters(data, algorithm, args, kwds):
    """Cluster ``data`` with ``algorithm``, plot the result and print its scores.

    Parameters
    ----------
    data : array-like
        Samples to cluster; the first two columns are used as plot coordinates.
    algorithm : class
        Clusterer class exposing ``fit_predict`` (instantiated as
        ``algorithm(*args, **kwds)``).
    args : tuple
        Positional arguments for the clusterer constructor.
    kwds : dict
        Keyword arguments for the clusterer constructor.

    Returns
    -------
    None
        Draws on the current matplotlib figure and prints metrics computed
        against the notebook-level ``train_y``.
    """
    # Time only the fit itself.
    start_time = time.time()
    labels = algorithm(*args, **kwds).fit_predict(data)
    end_time = time.time()
    # One colour per cluster id; negative ids are drawn in black.
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    plt.text(-0.5, 0.7, 'Clustering took {:.2f} s'.format(end_time - start_time), fontsize=14)
    cluster_score(labels,train_y)
    # classification_report expects (y_true, y_pred); the arguments used to be
    # swapped, which reported the cluster ids as if they were the ground truth.
    # NOTE(review): cluster ids are arbitrary, so this report is only meaningful
    # when they happen to line up with the encoded classes.
    print(classification_report(train_y, labels))


In [None]:
# Disabled experiment: the GridSearchCV set-up below lives inside a bare string
# literal, so none of it is executed.
'''
gcv = GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=50, shuffle=True),
       error_score='raise',
       estimator=cluster.KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_iter': [100, 200, 300, 400, 500], 'n_init': [10, 15, 20], 'tol': [1e-07, 1e-06, 1e-05, 0.0001], 'n_clusters': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
gcv.fit(data)     
'''

In [2]:
# K-means with 16 clusters on the LDA document-topic matrix.
# NOTE(review): the recorded output is a NameError -- `X_topics` is only assigned
# in the LDA cells, which have to be run before this one.
plot_clusters(X_topics, cluster.KMeans, (), {'n_clusters':16})

NameError: name 'X_topics' is not defined

In [None]:
# NOTE(review): `data` is not assigned in any visible cell (the only candidate,
# `# data =  A`, is commented out) -- confirm which matrix these calls should use.
# Mini-batch k-means, 2 clusters.
plot_clusters(data, cluster.MiniBatchKMeans, (), {'n_clusters':2, 'init':'k-means++', 'n_init':1})

In [None]:
# Affinity propagation.
plot_clusters(data, cluster.AffinityPropagation, (), {'preference':-5.0, 'damping':0.95})

In [None]:
# Mean shift; 0.175 is passed as the first positional constructor argument.
plot_clusters(data, cluster.MeanShift, (0.175,), {'cluster_all':False})

In [None]:
# Spectral clustering, 2 clusters.
plot_clusters(data, cluster.SpectralClustering, (), {'n_clusters':2})

In [None]:
# Agglomerative clustering with Ward linkage, 16 clusters.
plot_clusters(data, cluster.AgglomerativeClustering, (), {'n_clusters':16, 'linkage':'ward'})

In [None]:
# DBSCAN with eps=0.05.
plot_clusters(data, cluster.DBSCAN, (), {'eps':0.05})

In [None]:
# HDBSCAN.
plot_clusters(data, hdbscan.HDBSCAN, (), {'min_cluster_size':20, 'min_samples':25})

# prediction:
#test_labels, strengths = hdbscan.approximate_predict(clusterer, test_points)
#test_labels

In [None]:
from clustering_functions import *
from text_processing import *

In [None]:
def get_similarity_matrix(content_as_str):
    """Vectorise documents with tf-idf and compute their pairwise cosine similarity.

    Parameters
    ----------
    content_as_str : iterable of str
        Documents to vectorise.

    Returns
    -------
    tuple
        ``(similarity_matrix, tfidf_matrix)``: the cosine similarity between
        all pairs of documents and the tf-idf matrix it was computed from.
    """
    # 1- to 3-grams produced by the notebook-level `tokenizer`; terms present in
    # more than 80% or fewer than 20% of the documents are discarded.
    vectorizer_options = dict(
        tokenizer=tokenizer,
        ngram_range=(1,3),
        stop_words='english',
        min_df=0.2,
        max_df=0.8,
        max_features=200000,
        use_idf=True,
    )
    tfidf_matrix = TfidfVectorizer(**vectorizer_options).fit_transform(content_as_str)
    return cosine_similarity(tfidf_matrix), tfidf_matrix

In [None]:
# Similarity matrix (`B`) and tf-idf matrix for the first 2000 training documents only.
B,tfidf_matrix = get_similarity_matrix(train_x[:2000])

In [None]:
import lda

In [None]:
# Source tutorial: https://shuaiw.github.io/2016/12/22/topic-modeling-and-tsne-visualzation.html

from sklearn.datasets import fetch_20newsgroups

# Keep only the body of each post: strip headers, footers and quoted replies.
remove = ('headers', 'footers', 'quotes')

# Download / load both partitions of the 20-newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)

# One cleaned string per post (train followed by test): lower-case the text,
# split on whitespace and keep only the purely alphabetic tokens.
# (The source tutorial reports 18,846 posts in total.)
news = [
    ' '.join(token for token in raw.lower().split() if token.isalpha())
    for raw in newsgroups_train.data + newsgroups_test.data
]

In [None]:
# Topic model trained with the `lda` package (imported in the cell above).
# Alternative that is not used here:
# decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)

from sklearn.feature_extraction.text import CountVectorizer

n_topics = 20  # number of topics
n_iter = 500   # number of iterations

# Count matrix of the cleaned posts: English stop words removed, min_df=5.
cvectorizer = CountVectorizer(stop_words='english', min_df=5)
cvz = cvectorizer.fit_transform(news)

# Fit the model; X_topics holds one row per post and one column per topic.
lda_model = lda.LDA(n_iter=n_iter, n_topics=n_topics)
X_topics = lda_model.fit_transform(cvz)

In [None]:
# Dominant topic of every document: the column index of the largest entry in
# its row of X_topics.
_lda_keys = [X_topics[row].argmax() for row in range(X_topics.shape[0])]

In [None]:
import numpy

# Keep only the documents whose strongest topic weight exceeds the threshold.
threshold = 0.5
_idx = np.amax(X_topics, axis=1) > threshold  # boolean mask of documents to keep

# `_lda_keys` and `news` are sliced positionally next to `X_topics` in the
# plotting cells, so they must be filtered with the SAME mask. Previously only
# `X_topics` was filtered, which paired every remaining point with the topic
# key and text of an unrelated document.
assert len(_lda_keys) == len(_idx) == len(news), \
    "X_topics, _lda_keys and news are out of sync -- re-run the LDA cells first"
X_topics = X_topics[_idx]
_lda_keys = [key for key, keep in zip(_lda_keys, _idx) if keep]
news = [doc for doc, keep in zip(news, _idx) if keep]

In [None]:
from sklearn.manifold import TSNE

# Project the document-topic matrix onto 2 dimensions with t-SNE.
#  - angle: values close to 1 trade accuracy for speed
#  - init='pca': PCA initialization usually leads to better results
tsne_options = dict(n_components=2, init='pca', angle=.99, random_state=0, verbose=1)
tsne_model = TSNE(**tsne_options)

# one 2-D point per row of X_topics
tsne_lda = tsne_model.fit_transform(X_topics)

In [None]:
import numpy as np
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

# How many keywords are shown per topic.
n_top_words = 5

# Lookup table with 20 hex colours; indexing it with topic ids yields one
# colour per document.
colormap = np.array((
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78",
    "#2ca02c", "#98df8a", "#d62728", "#ff9896",
    "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
    "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7",
    "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
))

In [None]:
# Disabled alternative for collecting per-topic words: the code below lives
# inside a bare string literal, so none of it is executed.
'''
topic_words={}
for topic, comp in enumerate(lda_model.components_):
    # for the n-dimensional array "arr":
    # argsort() returns a ranked n-dimensional array of arr, call it "ranked_array"
    # which contains the indices that would sort arr in a descending fashion
    # for the ith element in ranked_array, ranked_array[i] represents the index of the
    # element in arr that should be at the ith index in ranked_array
    # ex. arr = [3,7,1,0,3,6]
    # np.argsort(arr) -> [3, 2, 0, 4, 5, 1]
    # word_idx contains the indices in "topic" of the top num_top_words most relevant
    # to a given topic ... it is sorted ascending to begin with and then reversed (desc. now)    
    word_idx = np.argsort(comp)[::-1][:n_top_words]

    # store the words most relevant to the topic
    topic_words[topic] = [X_topics[i] for i in word_idx]
'''

In [None]:
# Summarise every topic by its `n_top_words` highest-weighted vocabulary terms.
topic_summaries = []
topic_word = lda_model.topic_word_  # topic-by-word weight matrix of the fitted model
# `get_feature_names()` no longer exists in recent scikit-learn releases, where
# `get_feature_names_out()` replaces it; use whichever the installed version has.
_feature_names = getattr(cvectorizer, 'get_feature_names_out', None) or cvectorizer.get_feature_names
vocab = _feature_names()
for i, topic_dist in enumerate(topic_word):
  # indices sorted by ascending weight, then the last n_top_words taken in reverse
  topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
  topic_summaries.append(' '.join(topic_words))

In [None]:
from bokeh.plotting import figure, output_file, show


In [None]:
title = '20 newsgroups LDA viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# Everything a point needs -- position, colour and the hover fields -- goes into
# ONE data source that the scatter glyph references by column name.
# Previously the scatter had no source (so the hover fields could not resolve)
# and an extra circle glyph plotted the news text against the topic key.
source = bp.ColumnDataSource(data={
                   "x": tsne_lda[:, 0],
                   "y": tsne_lda[:, 1],
                   "color": colormap[_lda_keys][:num_example],
                   "content": news[:num_example],
                   "topic_key": _lda_keys[:num_example],
                   })

plot_lda.scatter(x='x', y='y', color='color', source=source)



In [None]:
# Label position of every topic: the t-SNE point of the first document in
# `_lda_keys` that carries that topic. Rows stay NaN until they are filled.
n_topic_columns = X_topics.shape[1]
topic_coord = np.full((n_topic_columns, 2), np.nan)
for topic_num in _lda_keys:
    still_missing = np.isnan(topic_coord).any()
    if still_missing:
        first_position = _lda_keys.index(topic_num)
        topic_coord[topic_num] = tsne_lda[first_position]
    else:
        # every topic has a coordinate -- nothing left to do
        break

# Write each topic's keyword summary at its label position.
for topic_id in range(n_topic_columns):
    label_x, label_y = topic_coord[topic_id, 0], topic_coord[topic_id, 1]
    plot_lda.text(label_x, label_y, [topic_summaries[topic_id]])

# Configure what the hover tool displays.
hover = plot_lda.select({'type': HoverTool})
hover.tooltips = {"topic_key": "@content - topic: @topic_key"}

# Write the plot to '<title>.html'.
save(plot_lda, '{}.html'.format(title))

In [None]:
from bokeh.io import output_notebook

# Send bokeh output to the notebook, then display the LDA plot inline.
output_notebook()
show(plot_lda)

In [None]:
from IPython.display import HTML

# Embed the file written by save(plot_lda, '{}.html'.format(title)). The path is
# derived from `title` instead of a hard-coded absolute home-directory path, so
# the cell also works on other machines and checkouts.
HTML(filename='{}.html'.format(title))

In [None]:
title = '20 newsgroups LDA viz'
num_example = len(X_topics)

plot_lda = bp.figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

# Raw coordinate/colour arrays must not be mixed with an explicit `source`:
# put every per-point value into the data source and reference it by column name.
plot_lda.scatter(x='x', y='y', color='color',
                 source=bp.ColumnDataSource({
                   "x": tsne_lda[:, 0],
                   "y": tsne_lda[:, 1],
                   "color": colormap[_lda_keys][:num_example],
                   "content": news[:num_example],
                   "topic_key": _lda_keys[:num_example]
                   }))

In [None]:
# Use the first document found for each topic as the position of that topic's
# keyword label (rows stay NaN until they are filled).
topic_coord = np.empty((X_topics.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
  if not np.isnan(topic_coord).any():
    # every topic has a coordinate -- stop scanning
    break
  topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]

# plot crucial words
# (`xrange` exists only on Python 2 and raises NameError on Python 3; `range`
# matches the rest of this notebook.)
for i in range(X_topics.shape[1]):
  plot_lda.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

# save the plot
save(plot_lda, '{}.html'.format(title))