In [None]:
## Imports (code & data)
import re
import pandas as pd
import yake_helper_funcs as yhf
from datetime import datetime, timedelta
from math import sqrt, floor
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
import numpy as np
import itertools
from matplotlib import pyplot as plt
import removing_polite_posts as rpp
from flashtext.keyword import KeywordProcessor
import string
import nltk
import math

# raw forum messages, one row per post
forum_posts = pd.read_csv("../input/meta-kaggle/ForumMessages.csv")

# Pre-tuned word vectors: whitespace-separated text, first row skipped,
# no header row. Column 0 holds the word, so it becomes the index and the
# remaining columns are the vector components.
vectors = pd.read_csv(
    "../input/fine-tuning-word2vec-2-0/kaggle_word2vec.model",
    delim_whitespace=True,
    skiprows=[0],
    header=None,
).set_index(0)

print(forum_posts.head())

In [None]:
!pip install yake

In [None]:
## Utility functions

# get vectors for each word in post
# TODO: can we vectorize this?
def vectors_from_post(post):
    """Look up the word vectors for the words of one post.

    post: iterable of words.
    Returns the rows of the global `vectors` table whose index label is one
    of those words. Words that are not in the table are simply absent from
    the result, and rows come back in the table's order, not the post's.
    """
    wanted_words = list(post)
    in_vocabulary = vectors.index.isin(wanted_words)
    return vectors[in_vocabulary]


# create document embeddings from post
def doc_embed_from_post(post):
    """Embed a post as the column-wise mean of its words' vectors.

    post: iterable of words, forwarded to `vectors_from_post`.
    Returns the per-column mean of the matching vector rows; when no word
    of the post is in the vector table the mean is taken over zero rows.
    """
    return vectors_from_post(post).mean()

# explore our posts by cluster
def get_keyword_set_by_cluster(number):
    """Return the keyword sets of all posts assigned to cluster `number`.

    Reads the globals `clustering` and `keyword_sets`; the i-th keyword set
    is paired with the i-th cluster label.

    number: cluster label to select.
    Returns: list of the matching entries of `keyword_sets`.
    """
    # `clustering` may be a fitted estimator (labels in `.labels_`) or the
    # plain label array that `fit_predict` returns; accept both.
    labels = getattr(clustering, "labels_", clustering)
    return [keywords
            for keywords, label in zip(keyword_sets, labels)
            if label == number]

# get sample post info by #
def get_post_info_by_cluster(number, data, cluster):
    """Select the rows of `data` that belong to cluster `number`.

    number:  cluster label to select.
    data:    DataFrame/Series with one row per clustered post, in the same
             order as the labels.
    cluster: either a fitted estimator exposing `.labels_` or the label
             array itself (what `fit_predict` returns).
    Returns: the subset of `data` whose label equals `number`.
    """
    # Fall back to treating `cluster` as the labels when it has no
    # `.labels_`, so results of `.fit(...)` and `.fit_predict(...)` both work.
    labels = np.asarray(getattr(cluster, "labels_", cluster))
    # boolean ndarray mask: rows are matched by position, not index label
    return data[labels == number]

# remove HTML stuff
# https://medium.com/@jorlugaqui/how-to-strip-html-tags-from-a-string-in-python-7cb81a2bbf44
def remove_html_tags(text):
    """Delete every `<...>` span from `text` and return the result.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)

# remove "good", "nice", "thanks", etc
def remove_thanks(text):
    """Lower-case `text` and delete stand-alone courtesy words.

    Removes the whole words "nice", "good", "hi", "hello" and any word
    starting with "thank" ("thank", "thanks", "thankful", ...).

    text: string to clean.
    Returns: the lower-cased string with those words deleted; surrounding
    whitespace and punctuation are left untouched.
    """
    text = text.lower()

    # \b anchors restrict the match to whole words, so "this" or "machine"
    # are not damaged by the "hi" rule, and only the thank-word itself is
    # removed rather than everything up to the last whitespace of the line.
    return re.sub(r"\b(?:nice|good|hi|hello|thank\w*)\b", "", text)

def polite_post_index(forum_posts):
    '''Find short forum posts that contain a polite phrase.

    Each post is lower-cased and has its punctuation deleted; a post is
    reported when the cleaned text is shorter than 100 characters and
    contains at least one phrase from the polite list.

    forum_posts: iterable of post strings.
    Returns: list of 0-based positions in iteration order (positions, not
    index labels).
    '''
    # phrases that mark a post as a polite, low-content reply
    polite_phrases = ["no problem", "thanks", "thx", "thank", "great",
                      "nice", "interesting", "awesome", "perfect",
                      "amazing", "well done", "good job"]

    phrase_matcher = KeywordProcessor()
    phrase_matcher.add_keywords_from_list(polite_phrases)

    # translation table that deletes every punctuation character
    drop_punctuation = str.maketrans({a: None for a in string.punctuation})

    matching_positions = []
    for position, raw_post in enumerate(forum_posts):
        cleaned = raw_post.lower().translate(drop_punctuation)

        # only short posts are candidates
        if len(cleaned) >= 100:
            continue

        if phrase_matcher.extract_keywords(cleaned.lower(), span_info=True):
            matching_positions.append(position)

    return matching_positions

In [None]:
import pandas as pd

In [None]:
## Hyperparameters

# number of clusters currently based on the square root of the # of posts
print(len(vectors))
days_of_posts = 20

# Top2Vec

In [None]:
!pip install llvmlite --ignore installed

In [None]:
!pip uninstall -y llvmlite
!pip install llvmlite --ignore-installed
!pip install numba==0.52.0

In [None]:
pip install top2vec

# Preprocessing posts

In [None]:
# For sample posts, get forum title and topic title
# based on queries from https://www.kaggle.com/pavlofesenko/strategies-to-earn-discussion-medals
# topic and forum lookup tables; their Title columns are renamed so the two
# titles stay distinguishable after merging
topics = pd.read_csv('../input/meta-kaggle//ForumTopics.csv').rename(columns={'Title': 'TopicTitle'})
forums = pd.read_csv('../input/meta-kaggle/Forums.csv').rename(columns={'Title': 'ForumTitle'})

print(forum_posts.head())

# attach the topic title (and the topic's forum id) to each post
df1 = (
    forum_posts[['ForumTopicId', 'PostDate', 'Message']]
    .merge(topics[['Id', 'ForumId', 'TopicTitle']], left_on='ForumTopicId', right_on='Id')
    .drop(['ForumTopicId', 'Id'], axis=1)
)

# attach the forum title; this rebinds forum_posts, so the cell cannot be
# re-run without first re-running the cell that loads ForumMessages.csv
forum_posts = (
    df1
    .merge(forums[['Id', 'ForumTitle']], left_on='ForumId', right_on='Id')
    .drop(['ForumId', 'Id'], axis=1)
)

In [None]:
# parse dates
# parse dates
forum_posts['Date'] = pd.to_datetime(forum_posts.PostDate, format="%m/%d/%Y %H:%M:%S")

# posts from the last X days
start_time = datetime.now() - timedelta(days=days_of_posts)

# Recent posts only. .copy() gives an independent frame, so the Message
# assignment below writes to our own data instead of a slice of forum_posts
# (which triggers SettingWithCopyWarning).
sample_post_info = forum_posts.loc[forum_posts.Date > start_time].copy()

# remove html tags (remember to convert to str first)
sample_post_info['Message'] = sample_post_info.Message\
    .astype(str)\
    .apply(remove_html_tags)
sample_posts = sample_post_info.Message.copy()

# remove polite posts (make sure you remove HTML tags first)
# polite_post_index returns positions, so translate them to index labels
# before dropping
# NOTE(review): an earlier note here said posts weren't being dropped -
# verify on real data that polite_posts is non-empty
polite_posts = sample_posts.index[polite_post_index(sample_posts)]
sample_posts = sample_posts.drop(polite_posts)
sample_post_info = sample_post_info.drop(polite_posts)

# Reindex from 0 *after* dropping. reset_index returns a new object, so the
# result has to be assigned; both objects then share a gap-free 0..n-1 index
# that lines up with row positions in the embedding matrix.
sample_posts = sample_posts.reset_index(drop=True)
sample_post_info = sample_post_info.reset_index(drop=True)

# number of posts
num_of_posts = sample_posts.shape[0]

# Number of clusters is square root of the # of posts (rounded down)
number_clusters = floor(sqrt(num_of_posts))

print(sample_posts.head())

In [None]:
# extract keywords & tokenize
#keywords = yhf.keywords_yake(sample_posts, )
keywords_tokenized = yhf.tokenizing_after_YAKE(sample_posts)
keyword_sets = [set(post) for post in keywords_tokenized]
print("keywords_tokenized\n", keywords_tokenized[0])
print("keyword_sets\n", keyword_sets[0])

# Get word vectors for keywords in post

In [None]:
# create empty array for document embeddings
# Embedding width is taken from the vector table (one column per dimension)
# rather than hard-coded, so the cell keeps working if the vectors change.
embedding_dim = vectors.shape[1]

# one row per post, initialised to zeros
doc_embeddings = np.zeros([num_of_posts, embedding_dim])

# get document embeddings for posts
for i in range(num_of_posts):
    embeddings = np.array(doc_embed_from_post(keyword_sets[i]))
    # An embedding containing NaN (e.g. the mean over zero matching
    # vectors) is not stored; that post keeps its all-zero row.
    if not np.isnan(embeddings).any():
        doc_embeddings[i, :] = embeddings
    if i == 0:
        print(embeddings.shape)


# Bert Topic

In [None]:
pip install pip install awscli

In [None]:
pip install responses

In [None]:
pip install flaky

In [None]:
pip install pandas==0.23.4

In [None]:
pip install tensorflow==1.12.3

In [None]:
pip install plotly==3.10.0

In [None]:
pip install botocore==1.12.253

In [None]:
pip install s3transfer==0.2.1

In [None]:
!pip install llvmlite --ignore-installed
!pip install bertopic[visualization]

In [None]:
print(type(sample_posts))
print(sample_posts.shape)
listSent = sample_posts.tolist()

In [None]:
!pip uninstall -y numba

In [None]:
!pip install numba --ignore-installed
!pip install umap-learn==0.4.6

In [None]:
import sys
print(sys.version)

In [None]:
from bertopic import BERTopic
model = BERTopic()
topics, probabilities = model.fit_transform(listSent, doc_embeddings)

In [None]:
model.visualize_topics()

In [None]:
model.visualize_distribution(probabilities[153])

In [None]:
model.get_topic(119)

# Top2Vec code

In [None]:
print(type(sample_posts))
print(sample_posts.shape)
listSent = sample_posts.tolist()

In [None]:
from top2vec import Top2Vec
model1 = Top2Vec(documents=listSent, speed="learn", workers=8)

In [None]:
model1.get_num_topics()

In [None]:
# Query the Top2Vec model (`model1`); `model` is the BERTopic instance and
# has no search_topics / generate_topic_wordcloud methods.
topic_words, word_scores, topic_scores, topic_nums = model1.search_topics(keywords=["Neural"], num_topics=5)
for topic in topic_nums:
    model1.generate_topic_wordcloud(topic)

In [None]:
# use the Top2Vec model (`model1`), not the BERTopic `model`
documents, document_scores, document_ids = model1.search_documents_by_topic(topic_num=35, num_docs=5)

In [None]:
# use the Top2Vec model (`model1`), not the BERTopic `model`
documents, document_scores, document_ids = model1.search_documents_by_topic(topic_num=12, num_docs=5)

# show each returned document with its id and score
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

# Clustering!

In [None]:
# Cluster the document embeddings with spectral clustering.
# NOTE: fit_predict returns the array of cluster labels, so `clustering` is
# that label array, not a fitted estimator with a `.labels_` attribute.
# NOTE(review): no `affinity` is passed; scikit-learn documents `n_neighbors`
# as ignored for the default affinity - confirm whether
# affinity="nearest_neighbors" was intended.
clustering = SpectralClustering(n_clusters=number_clusters, 
                                assign_labels="discretize",
                                n_neighbors=number_clusters).fit_predict(doc_embeddings)

In [None]:
print(doc_embeddings.shape)
plt.scatter(doc_embeddings[:,0], doc_embeddings[:, 1], c=clustering,
            s=50, cmap='viridis');

In [None]:
for i in range(number_clusters):
    
    print(f"Cluster {i}:\n")
    print(get_post_info_by_cluster(i, 
                                   data = sample_post_info,
                                   cluster = clustering))
    print("\n")
    


In [None]:
# for i in range(number_clusters):
    
#     print(f"Cluster {i}:\n")
#     print(get_keyword_set_by_cluster(i))
#     print("\n")

**Clustering using KMeans**

In [None]:
kmeans = KMeans(n_clusters=number_clusters, random_state=0).fit(doc_embeddings)
for i in range(number_clusters):
    
    print(f"Cluster {i}:\n")
    print(get_post_info_by_cluster(i, 
                                   data = sample_post_info,
                                   cluster = kmeans))
    print("\n")

In [None]:
print("kmeans.labels_\n")
# a bare expression is only displayed when it is the last line of a cell,
# so the counts have to be printed explicitly here
print(pd.Series(kmeans.labels_).value_counts())
print("sample_post")
print(type(sample_posts))


**Elbow method for optimal K**

In [None]:
distortions = []
size = int(number_clusters/2)
K = range(1, size)
for i in K:
    kmeanModel = KMeans(i, random_state=0).fit(doc_embeddings)
    distortions.append(kmeanModel.inertia_)

In [None]:
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# k chosen by reading the elbow plot; used both to fit the model and to
# iterate over the clusters afterwards
after_elbow_size = 4
kmeans = KMeans(after_elbow_size, random_state=0).fit(doc_embeddings)

In [None]:
for i in range(after_elbow_size):
    
    print(f"Cluster {i}:\n")
    print(get_post_info_by_cluster(i, 
                                   data = sample_post_info,
                                   cluster = kmeans))
    print("\n")
print("kmeans.labels_\n")
pd.Series(kmeans.labels_).value_counts()

In [None]:
testMessage = "What is deep learning and neural network. Is RNN a higher version of NN? what courses to learn from. Support vector machine are a part of neural nets, and LSTM comes under RNN"
#testMessage = "Neural network courses and competitions. Can i use LSTM for sentiment analysis"
test_sample = [testMessage]

# Cell-local names: the corpus-wide `keywords_tokenized` / `keyword_sets`
# are row-aligned with the clustered posts and must not be overwritten.
test_keywords_tokenized = yhf.tokenizing_after_YAKE(test_sample)
test_keyword_sets = [set(post) for post in test_keywords_tokenized]

embeddings = np.array(doc_embed_from_post(test_keyword_sets[0]))
print("embeddings\n")
print(embeddings.shape)

# Same fallback as for the corpus: an embedding containing NaN (e.g. no
# keyword found in the vector table) is replaced by the zero vector.
if np.isnan(embeddings).any():
    doc_embeddings1 = np.zeros(embeddings.shape)
else:
    doc_embeddings1 = embeddings

# predict() expects a 2-D (n_samples, n_features) array; pass the guarded
# vector so NaN never reaches the model
new_embeddings = doc_embeddings1.reshape(1, -1)
clust = kmeans.predict(new_embeddings)
print(clust)
print(get_post_info_by_cluster(clust[0], 
                                   data = sample_post_info,
                                   cluster = kmeans))

TEST PCA

In [None]:
from sklearn.datasets import make_blobs
# create blobs
data = make_blobs(n_samples=200, n_features=2, centers=4, cluster_std=1.6, random_state=50)
# create np array for data points
points = data[0]
# create scatter plot
plt.scatter(data[0][:,0], data[0][:,1], c=data[1], cmap='viridis')
plt.xlim(-15,15)
plt.ylim(-15,15)

X = data[0]
X[2]

In [None]:
from sklearn.decomposition import PCA
n_clusters = 5
sklearn_pca = PCA(n_components = 2)
Y_sklearn = sklearn_pca.fit_transform(doc_embeddings)
kmeans = KMeans(n_clusters= n_clusters, max_iter=600, algorithm = 'auto')
%time fitted = kmeans.fit(Y_sklearn)
prediction = kmeans.predict(Y_sklearn)

plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1],c=prediction ,s=50, cmap='viridis')

centers2 = fitted.cluster_centers_
plt.scatter(centers2[:, 0], centers2[:, 1],c='black', s=300, alpha=0.6);

trying PCA for spectral clustering? will it work??

In [None]:
from sklearn.decomposition import PCA
n_clusters = 5
sklearn_pca = PCA(n_components = 2)
# fit_transform needs the data matrix; project the document embeddings to 2-D
Y_sklearn = sklearn_pca.fit_transform(doc_embeddings)

spectralClusters = SpectralClustering(n_clusters=number_clusters, 
                                assign_labels="discretize",
                                n_neighbors=number_clusters)
prediction = spectralClusters.fit_predict(Y_sklearn)

plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1],c=prediction ,s=50, cmap='viridis')

#centers2 = fitted.cluster_centers_
#plt.scatter(centers2[:, 0], centers2[:, 1],c='black', s=300, alpha=0.6);

trying just k means for few sentences

In [None]:
from gensim.models import Word2Vec
  
from nltk.cluster import KMeansClusterer
import nltk
import numpy as np 
  
from sklearn import cluster
from sklearn import metrics
  
# training data
  
sentences = [['this', 'is', 'the', 'one','good', 'machine', 'learning', 'book'],
            ['this', 'is',  'another', 'book'],
            ['one', 'more', 'book'],
            ['weather', 'rain', 'snow'],
            ['yesterday', 'weather', 'snow'],
            ['forecast', 'tomorrow', 'rain', 'snow'],
            ['this', 'is', 'the', 'new', 'post'],
            ['this', 'is', 'about', 'more', 'machine', 'learning', 'post'],  
            ['and', 'this', 'is', 'the', 'one', 'last', 'post', 'book']]
  
sentences = sample_posts.tolist()    
 
model = Word2Vec(sentences, min_count=1)
 
  
def sent_vectorizer(sent, model):
    """Average the vectors of the tokens in `sent`.

    sent:  iterable of tokens.
    model: mapping-like object where `model[token]` returns the token's
           vector and raises KeyError for unknown tokens.
    Returns: the element-wise mean of the vectors found, or an empty array
    when none of the tokens is known to `model`.
    """
    found_vectors = []
    for w in sent:
        try:
            found_vectors.append(model[w])
        except KeyError:
            # Unknown token: skip it. Any other error (e.g. a model object
            # that does not support indexing) is left to surface instead of
            # silently producing empty vectors.
            continue

    if not found_vectors:
        # nothing to average; avoid dividing by zero
        return np.asarray([])

    return np.sum(found_vectors, axis=0) / len(found_vectors)
  
  
X=[]
for sentence in sentences:
    X.append(sent_vectorizer(sentence, model))   
 
print ("========================")
print (X)
  
 
# note with some version you would need use this (without wv) 
#  model[model.vocab] 
print (model[model.wv.vocab])
 
 
  
 
print (model.similarity('post', 'book'))
print (model.most_similar(positive=['machine'], negative=[], topn=2))
  
  
 
  
  
NUM_CLUSTERS=5
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance, repeats=25)
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
print (assigned_clusters)
  
  
  
for index, sentence in enumerate(sentences):    
    print (str(assigned_clusters[index]) + ":" + str(sentence))
 
     
     
     
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
kmeans.fit(X)
  
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
  
print ("Cluster id labels for inputted data")
print (labels)
print ("Centroids data")
print (centroids)
  
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
  
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
  
print ("Silhouette_score: ")
print (silhouette_score)
 
 
import matplotlib.pyplot as plt
 
from sklearn.manifold import TSNE
 
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
 
Y=model.fit_transform(X)
 
 
plt.scatter(Y[:, 0], Y[:, 1], c=assigned_clusters, s=290,alpha=.5)
 
 
for j in range(len(sentences)):    
   plt.annotate(assigned_clusters[j],xy=(Y[j][0], Y[j][1]),xytext=(0,0),textcoords='offset points')
   print ("%s %s" % (assigned_clusters[j],  sentences[j]))
 
 
plt.show()

In [None]:
plt.scatter(Y[:, 0], Y[:, 1], c=assigned_clusters, s=290,alpha=.5)

# Label every point with its cluster id. The post is fetched by position
# (.iloc): j is a row position, and sample_posts[j] would be a label lookup
# that fails when the Series index is not exactly 0..n-1.
for j in range(len(sample_posts)):
    plt.annotate(assigned_clusters[j],xy=(Y[j][0], Y[j][1]),xytext=(0,0),textcoords='offset points')
    print ("%s %s" % (assigned_clusters[j],  sample_posts.iloc[j]))

plt.show()

# Refining clustering

Steps:

1. Drop empty clusters
2. Identify large clusters (2 times more than expected)
3. Recluster those clusters (# clusters = sqrt # posts)



In [None]:
# count of posts/cluster
# `clustering` may be a fitted estimator or the label array returned by
# fit_predict; read the labels from whichever it is
cluster_labels = getattr(clustering, "labels_", clustering)

# count of posts/cluster
cluster_counts = pd.Series(cluster_labels).value_counts()

# get clusters bigger than expected
max_cluster_size = number_clusters * 2
big_clusters = cluster_counts[cluster_counts > max_cluster_size]

In [None]:
# sub-cluster first (biggest) cluster
# value_counts sorts descending, so the first entry is the biggest cluster
cluster_label = big_clusters.index[0]

# `clustering` may be a fitted estimator or the label array returned by
# fit_predict; read the labels from whichever it is
parent_labels = np.asarray(getattr(clustering, "labels_", clustering))
in_big_cluster = parent_labels == cluster_label

sub_sample = sample_post_info[in_big_cluster]
sub_cluster_embeddings = doc_embeddings[in_big_cluster]

number_sub_clusters = floor(sqrt(sub_sample.shape[0]))

# .fit (not fit_predict) keeps the estimator, whose .labels_ later cells read
sub_cluster = SpectralClustering(n_clusters=number_sub_clusters, 
                                 assign_labels="discretize", 
                                 n_neighbors=number_sub_clusters).fit(sub_cluster_embeddings)

In [None]:
# see how it looks
for i in range(number_sub_clusters):

    print(f"Cluster {i}:\n")
    print(get_post_info_by_cluster(i, data = sub_sample, 
                                   cluster = sub_cluster))
    print("\n")

In [None]:
pd.Series(sub_cluster.labels_).value_counts()

# Word clouds

In [None]:
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# TODO why do I see thank you?
posts_as_string = sample_post_info\
    .Message\
    .to_string(index=False)

# shouldn't have to do this b/c I removed polite posts earlier
posts_as_string = remove_thanks(posts_as_string)

# Generate a word cloud image
wordcloud = WordCloud().generate(posts_as_string)

# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Going forward

Biggest problem: redundant clusters

Possible solutions: 

* Remove very short posts
* Don't include posts on kernels
* Build filter for removing short "thanks!" type posts
* Start w/ sentiment analysis & put all very high sentiment posts in a single bin

# Visualization brain storming

Slides on text visualization: https://courses.cs.washington.edu/courses/cse512/15sp/lectures/CSE512-Text.pdf

* Bigram based method, reporting the two terms with the median frequency
* term saliency, normalize by freq of most common term log(tf_w) / log(tf_the) (and then some sort of regression?)
* Termite-based model: Topics as columns, terms as rows and weight visualization of term distinctiveness as KL divergence p(T|term)/p(T|any_term)

In [None]:
# next week: get saliency measure by day, 
# look at the shift between saliency on a day & in the corpus as a whole to pick summary words

In [None]:
# TODO: 
# make sure to match preprocessing (lower cased)
# for each cluster, find the normalized saliency measure 
# rank words based on difference in normalized saliency in whole corpus

# edge cases:
# OOV words, add smoothing or set corpus freq. to 0
# 

In [None]:
frequency_table = pd.read_csv("../input/kaggle-forum-term-frequency-unstemmed/kaggle_lex_freq.csv",
                             error_bad_lines=False)

In [None]:
def get_cluster_saliency_dict(cluster_number):
    """Compute a per-word saliency score for one sub-cluster.

    The messages of sub-cluster `cluster_number` (taken from the globals
    `sub_sample` / `sub_cluster`) are joined into one text, tokenized and
    counted. A word's saliency is log(frequency) / log(frequency of the
    most common word), with 0.0001 added inside both logarithms.

    cluster_number: sub-cluster label to analyse.
    Returns: (cluster_dict, fdist) where cluster_dict maps word -> saliency
    and fdist is the nltk.FreqDist of the kept words. cluster_dict is empty
    when no word survives the filters.
    """
    # create corpus from a cluster
    text = get_post_info_by_cluster(cluster_number, data = sub_sample, cluster = sub_cluster)\
        .Message.astype(str).str.cat(sep=' ')

    # Tokenize, then drop single-character tokens (mostly punctuation),
    # numbers and the "nbsp" residue; the filters see the original casing,
    # and lower-casing is applied to the tokens that are kept.
    words = [
        token.lower()
        for token in nltk.word_tokenize(text)
        if len(token) > 1 and not token.isnumeric() and token != "nbsp"
    ]

    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)

    cluster_dict = dict()

    ranked = fdist.most_common()
    if ranked:
        # The normaliser is the same for every word, so compute it once
        # instead of re-querying the most common word inside the loop.
        log_top_frequency = math.log(ranked[0][1] + 0.0001)
        for word, frequency in ranked:
            cluster_dict[word] = math.log(frequency + 0.0001) / log_top_frequency

    return(cluster_dict, fdist)

In [None]:
def get_surprising_words(cluster_number, frequency_table):
    """Return the words that are more salient in a cluster than in the corpus.

    A word's surprisal is its cluster saliency minus its corpus saliency;
    words missing from `frequency_table` use .0001 as the corpus value.

    cluster_number:  sub-cluster label, forwarded to
                     `get_cluster_saliency_dict`.
    frequency_table: DataFrame with `word` and `saliency` columns.
    Returns: Series of the words whose surprisal is greater than 0.
    """
    cluster_dict, fdist = get_cluster_saliency_dict(cluster_number)

    # Build the corpus lookup once instead of scanning the whole table for
    # every word. setdefault keeps the first row when a word is listed more
    # than once.
    corpus_saliency = {}
    for corpus_word, corpus_value in zip(frequency_table.word, frequency_table.saliency):
        corpus_saliency.setdefault(corpus_word, corpus_value)

    words = []
    surprisal = []

    for word, _ in fdist.most_common():
        words.append(word)
        # words absent from the corpus table fall back to .0001
        surprisal.append(cluster_dict[word] - corpus_saliency.get(word, .0001))

    cluster_surprisal_measures = pd.DataFrame(list(zip(words, surprisal)), 
                                              columns =['Words', 'Surprisal']) 

    surprising_words = cluster_surprisal_measures.Words[cluster_surprisal_measures.Surprisal > 0]
    
    return(surprising_words)

In [None]:
get_surprising_words(1, frequency_table)

In [None]:
get_post_info_by_cluster(1, data = sub_sample, cluster = sub_cluster).Message

In [None]:
get_surprising_words(0, frequency_table)

In [None]:
get_post_info_by_cluster(0, data = sub_sample, cluster = sub_cluster).Message