In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
import re
from nltk.stem.snowball import PorterStemmer

# Read the blog corpus from a comma-separated file decoded as ISO-8859-1 and
# preview the first rows.
df = pd.read_csv('BLOGS.csv', sep = ',' , encoding = 'ISO-8859-1') #your csv file
df.head()

In [None]:
df.shape

In [None]:
df['Content'].str.len()

In [None]:
# nltk.stem.snowball.PorterStemmer takes a single `ignore_stopwords` flag, not a
# language name. Passing "english" was read as a truthy flag, which makes NLTK
# try to load a stop-word list named after the class ("porter"). The flag is
# therefore left at its default (False).
stemmer = PorterStemmer()
 
# Lower-case stop words checked by set membership. The list is a general
# English stop-word list followed (from "student" onwards) by hand-added,
# corpus-specific terms and page-widget residue such as
# "sharefacebooktwitter...".
# `my_stop_words` is a second name bound to the very same set object.
STOPWORDS = my_stop_words = set(["a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves", "student", "students", "school", "schools", "teach", "teacher", 
    "teachers", "teaching", "time", "year", "work", "use","using" ,"uses",
    "like", "make", "need", "think", "question", "lesson", "lessons", "post", "blog",
    "rss","feed","subscribe","able","really","new","thanks","just","learn","learning",
    "education","tweethttps","did","does","got","sharefacebooktwittergooglelinkedinpinteresttumblr",
    "twitterfacebooklike","facebooktwittergoogleprintmoreemaillinkedinreddit","people",
    "http","ive","dont","link"])

In [None]:
# Replace every "jQuery ... })" span in the post text with a single space.
# regex=True must be explicit: from pandas 2.0 on, Series.str.replace treats the
# pattern as a literal string by default, which would silently match nothing.
regex = r"jQuery(.*)}\)"
df['Content'] = df['Content'].str.replace(regex, " ", regex=True)
df.shape

In [None]:
# A token is kept when it *starts* with at least three letters/hyphens
# (re.match anchors only at the beginning of the token). Written as a raw string:
# the original non-raw '\-' is an invalid string escape that newer Python
# versions warn about. Compiled once instead of once per token.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Tokenise, filter and stem a single document.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    ``STOPWORDS`` or not matching ``_TOKEN_PATTERN`` are discarded; the
    survivors are stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [
        t for t in tokenized_text
        if t not in STOPWORDS and _TOKEN_PATTERN.match(t)
    ]
    return [stemmer.stem(t) for t in cleaned_text]

In [None]:
# drop_duplicates returns a new frame; the original call discarded that result
# (inplace=False, no assignment), so posts with identical Content were never
# removed. Assign it back so the de-duplication actually takes effect.
df = df.drop_duplicates(subset = ['Content'])
df.shape

In [None]:
# Keep only the posts whose Content is at least 280 characters long.
mask = df['Content'].str.len().ge(280)
df = df.loc[mask]
df.shape

In [None]:
df.head(20)

In [None]:
from sklearn.utils import shuffle

# Seed the shuffle so the row order is identical on every run; without it the
# document order (and everything derived from it) changes on each execution.
df = shuffle(df, random_state=100)

In [None]:
X = df['Content']  #X has now changed.
X.shape

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10 #your choice of topics
 
# Earlier TF-IDF variant, kept for reference only (not executed):
#vectorizer = TfidfVectorizer(stop_words=my_stop_words, lowercase=True, analyzer='word', 
                             #token_pattern='[a-zA-Z][a-zA-Z]{2,}')
# Raw term counts; clean_text is passed as the tokenizer, so tokenising,
# stop-word filtering and stemming all happen inside that function.
vectorizer = CountVectorizer(lowercase=True, analyzer='word', tokenizer = clean_text)
                             #token_pattern='[a-zA-Z][a-zA-Z]{2,}')
# Document-term matrix built from the Content column held in X.
data_vectorized = vectorizer.fit_transform(X)
 
# Build a Latent Dirichlet Allocation Model (seeded via random_state)
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state = 100)
# lda_Z holds one row per document and one column per topic.
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

In [None]:
# sklearn.externals.joblib has been removed from scikit-learn; use the
# standalone joblib package and fall back to the old path only on legacy installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the document-topic matrix under the file name that the later
# joblib.load cell reads.
joblib.dump(lda_Z,  'BLOGSLDA10.pkl')

In [None]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (one weight row per topic).
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn versions, ``get_feature_names()``.
    top_n : int, default 10
        Number of ``(term, weight)`` pairs printed per topic, strongest first.

    Returns
    -------
    None
        Output is written to stdout.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back to the legacy name when it is absent.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    # Fetched once: the original rebuilt the full vocabulary for every printed term.
    feature_names = get_names()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
        
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

In [None]:
import numpy as np

def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the ``n_words`` highest-weighted vocabulary terms of each topic.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``. Defaults to the module-level ``vectorizer``
        as it exists when this function is defined.
    lda_model : object
        Fitted model exposing ``components_`` (one weight row per topic).
        Defaults to the module-level ``lda_model`` at definition time.
    n_words : int, default 20
        Number of keywords returned per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of keywords per topic, ordered by descending weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # when the replacement is unavailable.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    keywords = np.array(list(get_names()))
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes the ascending argsort yield the largest first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Ten strongest keywords per topic, laid out as a table: one row per topic
# ("Topic <n>"), one column per keyword rank ("Word <n>").
topic_keywords = show_topics(vectorizer=vectorizer, lda_model = lda_model, n_words=10)

df_topic_keywords = pd.DataFrame(topic_keywords).rename(
    columns=lambda rank: 'Word ' + str(rank),
    index=lambda topic: 'Topic ' + str(topic),
)
df_topic_keywords

In [None]:
df_topic_keywords.to_csv('BLOGSkeywordsSKLCV10.csv', sep = ',')

In [None]:
# Axis labels: one "Topic<k>" column per model component, one "Doc<n>" row per
# entry of X.
topicnames = ["Topic" + str(k) for k in range(lda_model.n_components)]
docnames = ["Doc" + str(n) for n in range(len(X))]

# Document-topic weights rounded to two decimals
df_document_topic = pd.DataFrame(lda_Z.round(2), index=docnames, columns=topicnames)

# Dominant topic = position of the largest (rounded) weight in each row
dominant_topic = df_document_topic.values.argmax(axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return a CSS colour rule: green when ``val`` exceeds 0.1, otherwise black."""
    if val > .1:
        return 'color: {col}'.format(col='green')
    return 'color: {col}'.format(col='black')

def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, otherwise 400."""
    if val > .1:
        return 'font-weight: {weight}'.format(weight=700)
    return 'font-weight: {weight}'.format(weight=400)

# Apply Style
# Both functions are applied cell by cell to the first 15 rows; no column subset
# is given, so the 'dominant_topic' column is styled by the same > 0.1 rule.
# NOTE(review): Styler.applymap is deprecated in recent pandas in favour of
# Styler.map — confirm against the installed version.
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

In [None]:
df_document_topicsa = df_document_topic.head(15)
df_document_topicsa.to_csv('BLOGSDominantTopics10.csv')

In [None]:
# Number of documents per dominant topic. reset_index turns the counted values
# into a regular column; both column labels are then overwritten explicitly, so
# the names produced by reset_index do not matter.
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

In [None]:
df_topic_distribution.to_csv('BLOGSsklCV10TD.csv', sep = ',')

In [None]:
# sklearn.externals.joblib has been removed from scikit-learn; use the
# standalone joblib package and fall back to the old path only on legacy installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only
# load a 'BLOGSLDA10.pkl' that was written by this notebook.
lda_Z = joblib.load('BLOGSLDA10.pkl')

In [None]:
# Construct the k-means clusters
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Cluster the rows of lda_Z into 10 groups (seeded, so labels are repeatable)
clusters = KMeans(n_clusters=10, random_state=100).fit_predict(lda_Z)

# Build the Singular Value Decomposition(SVD) model
# NOTE(review): unlike KMeans above, no random_state is passed here — confirm
# whether the 2-D projection needs to be reproducible between runs.
svd_model = TruncatedSVD(n_components=2)  # 2 components
lda_Z_svd = svd_model.fit_transform(lda_Z)

# X and Y axes of the plot using SVD decomposition
x = lda_Z_svd[:, 0]
y = lda_Z_svd[:, 1]

# Weights of every lda_Z column, for each of the two components
print("Component's weights: \n", np.round(svd_model.components_, 2))

# Share of the variance in lda_Z explained by each of the two components
# (printed as a ratio, not multiplied by 100)
print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))

In [None]:
import matplotlib.pyplot as plt

# Scatter of the two SVD components, coloured by k-means cluster label.
plt.figure(figsize=(12, 12))
plt.scatter(x, y, c=clusters, cmap = 'Paired')
# The original called xlabel twice, so 'Component 2' was overwritten and the
# y-axis was never labelled. Label each axis with the component it shows.
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.title("BLOGS Segregation of Topic Clusters, t=10", )
# No extension is given, so matplotlib falls back to its default output format.
plt.savefig('BLOGSsegregationSKLCV12')

In [None]:
from sklearn.manifold import TSNE

# a t-SNE model
# angle value close to 1 means sacrificing accuracy for speed
# pca initialization usually leads to better results
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

# Reduce every row of lda_Z (one value per column) to a 2-D point
tsne_lda = tsne_model.fit_transform(lda_Z)

In [None]:
from bokeh.plotting import figure, show, save
from bokeh.models import HoverTool
from bokeh.io import output_notebook
import matplotlib.colors as mcolors

n_top_words = 5 # number of keywords we show

# 25 hex colours, stored as a NumPy array so that a whole list of integer
# indices can be mapped to colours in one indexing operation.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
    "#331a00", "#1a0033", "#ff80bf", "#80ffff", "#cdffff"
])

# Unused alternative palette, kept for reference:
#cols = [color for name, color in mcolors.XKCD_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

In [None]:
# Dominant topic (index of the largest weight) for every row of lda_Z.
# Kept as a plain list because later code calls _lda_keys.index(...).
_lda_keys = [doc_topics.argmax() for doc_topics in lda_Z]

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the legacy name only when it is absent.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
vocab = list(_get_names())
# Converted once, instead of rebuilding the array for every topic.
_vocab_array = np.array(vocab)

topic_summaries = []
topic_word = lda_model.components_  # all topic words
for topic_dist in topic_word:
    # Ascending argsort read backwards -> the n_top_words heaviest terms first.
    topic_words = _vocab_array[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(topic_words))

In [None]:
# Plot title; a later cell also formats it into the name of the saved HTML file.
title = 'BLOGS Topic Modelling SKL t=10'
num_example = len(lda_Z)

#title = df['Title']

# NOTE(review): plot_width/plot_height and the "previewsave" tool name are older
# Bokeh spellings (newer releases use width/height and "save") — confirm against
# the installed Bokeh version.
plot_lda = figure(plot_width=1400, plot_height=1100,
                     title=title,
                     tools="pan,wheel_zoom,box_zoom,reset,previewsave")#,
                     #x_axis_type=None, y_axis_type=None, min_border=1)

# One point per document at its t-SNE coordinates, coloured by dominant topic.
plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[_lda_keys][:num_example])
# Disabled hover-data source, kept for reference:
#source=bp.ColumnDataSource
#({
                   #"content": X[:num_example],
                   #"topic_key": _lda_keys[:num_example]
                   #})

In [None]:
# Label position for each topic: the t-SNE coordinates of the first document
# whose dominant topic it is. Topics that are never dominant stay at NaN.
topic_coord = np.full((lda_Z.shape[1], 2), np.nan)
_placed_topics = set()
for doc_idx, topic_num in enumerate(_lda_keys):
    if not np.isnan(topic_coord).any():
        break  # every topic already has a position
    if topic_num not in _placed_topics:
        _placed_topics.add(topic_num)
        topic_coord[topic_num] = tsne_lda[doc_idx]

# Write each topic's keyword summary at its label position
for topic_idx in range(lda_Z.shape[1]):
    label_x, label_y = topic_coord[topic_idx]
    plot_lda.text(label_x, label_y, [topic_summaries[topic_idx]])

save(plot_lda, '{}.html'.format(title))


In [None]:
show(plot_lda)

In [None]:
#Hurrah!