In [1]:
import os
import re
import shutil
import argparse
import numpy as np
import pandas as pd
from heapq import nlargest
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import HiveContext
from collections import defaultdict
from pyspark.sql.functions import udf
from pyspark.context import SparkContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, Tokenizer
from pyspark.mllib.linalg import Vectors, SparseVector, DenseVector

In [2]:
# version 1
# def read_line(line):
#     if line is None:
#         return None
#     line = line.split('\t')[-1]
#     line = re.sub('[^A-Za-z_]', '', line)
#     line = re.sub(' +', ' ', line.replace('_', ' '))
#     return line.strip()

In [3]:
# version 1
# def read_data(sc, data_path):
#     if sc is None or data_path is None:
#         return None
#     documents = sc.textFile(data_path).map(lambda line : read_line(line))
#     schema = StructType([StructField("activity", StringType(), True)])
#     use this if the row has only one column
#     documents = documents.map (lambda x: Row(x))
#     documents = SQLContext(sc).createDataFrame(documents, schema)
    
#     tokenizer = Tokenizer(inputCol="activity", outputCol="tokens")
#     documents = tokenizer.transform(documents)

#     remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
#     documents = remover.transform(documents)
#     return documents

In [4]:
def read_line(row):
    """Turn one comma-separated input line into (raw field, cleaned field).

    Only the text before the first comma is used. The cleaned copy drops
    every character that is not an ASCII letter or an underscore (spaces
    included), turns underscores into spaces, collapses runs of spaces and
    trims both ends.

    Returns None when ``row`` is None, otherwise a 2-tuple of strings.
    """
    if row is not None:
        raw_activity = row.split(',')[0]
        letters_only = re.sub('[^A-Za-z_]', '', raw_activity)
        # Underscores act as the word separator in the raw data.
        spaced = letters_only.replace('_', ' ')
        collapsed = re.sub(' +', ' ', spaced)
        return raw_activity, collapsed.strip()
    return None

In [5]:
def read_data(sc, data_path):
    """Load the text file at ``data_path`` into a tokenized DataFrame.

    Each line is parsed with ``read_line`` into the string columns
    ``raw_activity`` and ``activity``. ``activity`` is then tokenized into
    ``tokens``, and ``StopWordsRemover`` produces ``filtered_tokens``.

    Returns None when ``sc`` or ``data_path`` is None, otherwise the
    DataFrame with all four columns.
    """
    if sc is None or data_path is None:
        return None

    activity_schema = StructType([
        StructField("raw_activity", StringType(), True),
        StructField("activity", StringType(), True),
    ])
    parsed_lines = sc.textFile(data_path).map(read_line)
    activities = SQLContext(sc).createDataFrame(parsed_lines, activity_schema)

    tokenized = Tokenizer(inputCol="activity", outputCol="tokens").transform(activities)
    stop_word_filter = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
    return stop_word_filter.transform(tokenized)

In [6]:
def tf_idf_transform(documents):
    """Add term-count and IDF-weighted feature columns to ``documents``.

    A ``CountVectorizer`` is fitted on ``filtered_tokens`` and writes
    ``feature_counts``; an ``IDF`` model fitted on those counts writes
    ``features``.

    Returns None when ``documents`` is None, otherwise the tuple
    ``(vocabulary, transformed_documents)`` where ``vocabulary`` comes from
    the fitted count-vectorizer model.
    """
    if documents is None:
        return None

    count_model = CountVectorizer(
        inputCol="filtered_tokens", outputCol="feature_counts").fit(documents)
    counted = count_model.transform(documents)

    idf_model = IDF(inputCol="feature_counts", outputCol="features").fit(counted)
    weighted = idf_model.transform(counted)

    return count_model.vocabulary, weighted
    

In [7]:
def clustering(documents, output_path, k=2, max_iter=20):
    """Fit k-means on the ``features`` column and record the model cost.

    The directory ``<cwd>/<output_path>/<k>`` is wiped and recreated, the
    model is fitted, and ``cost.txt`` in that directory receives the cluster
    count and the value returned by ``computeCost``.

    Args:
        documents: DataFrame with a ``features`` column.
        output_path: Output directory, relative to the working directory.
        k: Number of clusters.
        max_iter: Iteration cap handed to ``KMeans``.

    Returns:
        None when ``documents`` or ``output_path`` is None, otherwise
        ``(clustersTable, clusterCenters)``: the transformed DataFrame and
        the model's cluster centres.
    """
    if documents is None or output_path is None:
        return None
    # Same <cwd>/<output_path>/<k> layout the keyword writers build.
    output_path = os.getcwd() + "/" + output_path + "/" + str(k)
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    kmeans = KMeans(featuresCol="features").setK(k).setMaxIter(max_iter)
    km_model = kmeans.fit(documents)
    clustersTable = km_model.transform(documents)
    clusterCenters = km_model.clusterCenters()
    # NOTE: computeCost is deprecated in Spark 2.4 and removed in 3.0;
    # km_model.summary.trainingCost is the replacement on newer versions.
    wssse = km_model.computeCost(documents)

    # Open the file only once the numbers exist, and let `with` close it:
    # the handle used to be opened before the fit and was never closed.
    with open(os.path.join(output_path, "cost.txt"), "w") as out_file:
        out_file.write("Cluster {}".format(k))
        out_file.write("\n")
        out_file.write("cost:" + str(wssse))
        out_file.write("\n")
    return clustersTable, clusterCenters

In [8]:
def extract(row):
    """Flatten a row into one tuple: its prediction, then each feature value.

    ``row.features`` is converted with ``toArray().tolist()`` so the feature
    values come out as plain Python numbers.
    """
    label = row.prediction
    feature_values = row.features.toArray().tolist()
    return tuple([label] + list(feature_values))

In [9]:
def sparse_add(v1, v2):
    """Return the element-wise sum of two sparse vectors as a sparse vector.

    Both arguments must expose ``size``, ``indices`` and ``values``; their
    sizes must be equal (enforced with an assert). Only explicitly stored
    entries are visited.
    """
    assert v1.size == v2.size
    totals = defaultdict(float)  # a missing index starts at 0.0
    # v1 is accumulated first, then v2 is added on top.
    for vec in (v1, v2):
        positions, stored = vec.indices, vec.values
        for slot in range(positions.size):
            totals[positions[slot]] += stored[slot]
    return Vectors.sparse(v1.size, dict(totals))

In [10]:
def sparse_divide(v):
    """Divide every stored entry of a sparse vector by a count.

    ``v`` is a pair ``(vector, count)``. The result is a new sparse vector
    of the same size whose stored values are the originals divided by
    ``float(count)``.
    """
    vector = v[0]
    scaled = {
        vector.indices[slot]: vector.values[slot] / float(v[1])
        for slot in range(vector.indices.size)
    }
    return Vectors.sparse(vector.size, scaled)

In [11]:
def get_top_index(v, k):
    """Return up to ``k`` indices of ``v`` with the largest stored values.

    Only explicitly stored entries of the sparse vector are considered. The
    indices come back ordered from the largest value downwards.
    """
    value_by_index = {
        v.indices[slot]: v.values[slot] for slot in range(v.indices.size)
    }
    return nlargest(k, value_by_index, key=value_by_index.get)
    

In [12]:
# Very Basic implementation of top k words
# def get_top_keywords(clustersTable, vocab, output_path, n_terms=10, k=2):
#     if clustersTable is None or vocab is None or output_path is None:
#         return None
#     if not os.path.exists(output_path):
#         os.makedirs(output_path)
#     out_path = os.getcwd() + "/" + output_path + "/" + str(k)
#     out_file = open(os.path.join(out_path, "out.txt"), "w")
#     clusters = clustersTable.select("features","prediction")
#     clusters = clusters.rdd.map(extract).toDF(["prediction"]) 
#     clusters = clusters.groupby("prediction").mean()
#     clusters = clusters.drop("prediction")
#     clusters = clusters.drop("avg(prediction)")
#     clusters_array = np.array(clusters.collect())
#     for idx, row in enumerate(clusters_array):
#         out_file.write("cluster {}".format(idx))
#         out_file.write("\n")
#         out_file.write(",".join([vocab[t] for t in np.argsort(row)[-n_terms:]]))
#         out_file.write("\n")
#     out_file.close()
#     return None         

In [13]:
# Dense implementation of top-k words: averages each cluster's feature vectors as dense arrays
def get_top_keywords(clustersTable, vocab, output_path, n_terms=10, k=2):
    """Write the highest-weighted vocabulary terms of every cluster.

    Feature vectors are converted to dense arrays and averaged per
    prediction. For each cluster, the ``n_terms`` entries with the largest
    mean weight are looked up in ``vocab`` and written, smallest of the
    selected weights first, to ``<cwd>/<output_path>/<k>/out.txt``.

    Args:
        clustersTable: DataFrame with ``prediction`` and ``features`` columns.
        vocab: Sequence mapping a feature index to its term.
        output_path: Output directory, relative to the working directory.
        n_terms: Terms to list per cluster; values <= 0 list none.
        k: Cluster count, used only to pick the output sub-directory.

    Returns:
        None when ``clustersTable``, ``vocab`` or ``output_path`` is None,
        otherwise the collected list of ``(prediction, mean_features)``.
    """
    if clustersTable is None or vocab is None or output_path is None:
        return None
    out_path = os.getcwd() + "/" + output_path + "/" + str(k)
    # Create the directory the file is actually written to; creating only
    # `output_path` left the <k> sub-directory missing.
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    clusters = clustersTable.select("prediction", "features")
    # (prediction, features) rows act as key/value pairs: sum the vectors and
    # a row count per cluster, then divide to get the mean vector.
    clusters_array = clusters.rdd \
        .mapValues(lambda v: (v.toArray(), 1)) \
        .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
        .mapValues(lambda v: v[0] / v[1]).collect()

    # Open the file after the Spark job so a failed job leaves no empty
    # file; `with` closes the handle even if a write fails.
    with open(os.path.join(out_path, "out.txt"), "w") as out_file:
        for row in clusters_array:
            # [-0:] would select every term, so handle n_terms <= 0 explicitly.
            top_terms = np.argsort(row[1])[-n_terms:] if n_terms > 0 else []
            out_file.write("cluster {}".format(row[0]))
            out_file.write("\n")
            out_file.write(",".join([vocab[t] for t in top_terms]))
            out_file.write("\n")
    return clusters_array

In [14]:
# Sparse implementation of top-k words: averages each cluster's feature vectors without densifying them
def get_top_keywords_test(clustersTable, vocab, output_path, n_terms=10, k=2):
    """Write the highest-weighted terms of every cluster, using sparse maths.

    Feature vectors are summed per prediction with ``sparse_add``, averaged
    with ``sparse_divide``, and reduced to their top ``n_terms`` indices with
    ``get_top_index``. The matching ``vocab`` terms are written, largest
    weight first, to ``<cwd>/<output_path>/<k>/out.txt1``.

    Args:
        clustersTable: DataFrame with ``prediction`` and ``features`` columns.
        vocab: Sequence mapping a feature index to its term.
        output_path: Output directory, relative to the working directory.
        n_terms: Maximum number of terms to list per cluster.
        k: Cluster count, used only to pick the output sub-directory.

    Returns:
        None when ``clustersTable``, ``vocab`` or ``output_path`` is None,
        otherwise the collected list of ``(prediction, top_indices)``.
    """
    if clustersTable is None or vocab is None or output_path is None:
        return None
    out_path = os.getcwd() + "/" + output_path + "/" + str(k)
    # Create the directory the file is actually written to; creating only
    # `output_path` left the <k> sub-directory missing.
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    clusters = clustersTable.select("prediction", "features")
    # (prediction, features) rows act as key/value pairs: carry a row count
    # next to the running sum so the mean can be taken at the end.
    clusters_array = clusters.rdd \
        .mapValues(lambda v: (v, 1)) \
        .reduceByKey(lambda a, b: (sparse_add(a[0], b[0]), a[1] + b[1])) \
        .mapValues(lambda v: sparse_divide(v)) \
        .mapValues(lambda v: get_top_index(v, n_terms)).collect()

    # Open the file after the Spark job so a failed job leaves no empty
    # file; `with` closes the handle even if a write fails.
    with open(os.path.join(out_path, "out.txt1"), "w") as out_file:
        for row in clusters_array:
            out_file.write("cluster {}".format(row[0]))
            out_file.write("\n")
            out_file.write(",".join([vocab[t] for t in row[1]]))
            out_file.write("\n")
    return clusters_array

In [15]:
def run_clustering(sc, input_path, output_path, num_clusters_list, max_iter=20, n_terms=20):
    """Run the whole pipeline once for every cluster count requested.

    The input is read and TF-IDF transformed once. Then, for each ``k`` in
    ``num_clusters_list``, k-means is fitted, the assignments are written
    with ``write_results`` and the top keywords with
    ``get_top_keywords_test``.

    Args:
        sc: Active SparkContext.
        input_path: Path of the text file to cluster.
        output_path: Output directory; each ``k`` gets its own sub-directory.
        num_clusters_list: Iterable of cluster counts to try.
        max_iter: Iteration cap handed to k-means.
        n_terms: Keywords to list per cluster. This used to be taken from
            ``max_iter`` by mistake; the default of 20 keeps the output that
            the default ``max_iter`` produced.

    Returns:
        None when ``sc``, ``input_path``, ``output_path`` or
        ``num_clusters_list`` is None. Otherwise
        ``(clustersTable, clusterCenters, documents)`` for the last ``k``
        processed; the first two are None when ``num_clusters_list`` is empty.
    """
    if sc is None or input_path is None or num_clusters_list is None or output_path is None:
        return None
    documents = read_data(sc, input_path)
    vocab, documents = tf_idf_transform(documents)
    # Pre-bind the results so an empty list of cluster counts returns
    # cleanly instead of raising UnboundLocalError.
    clustersTable, clusterCenters = None, None
    for k in num_clusters_list:
        clustersTable, clusterCenters = clustering(documents, output_path, k, max_iter)
        write_results(sc, clustersTable, output_path, k)
        # Dense alternative: get_top_keywords(clustersTable, vocab, output_path, n_terms, k)
        get_top_keywords_test(clustersTable, vocab, output_path, n_terms, k)
    return clustersTable, clusterCenters, documents

In [16]:
def write_results(sc, clustersTable, output_path, k):
    """Save each activity with its assigned cluster as tab-separated CSV.

    The ``activity`` and ``prediction`` columns are written, with a header
    row, to ``<output_path>/<k>/results``; a directory already at that
    location is removed first. Always returns None, and does nothing when
    ``sc``, ``clustersTable`` or ``output_path`` is None.
    """
    if any(arg is None for arg in (sc, clustersTable, output_path)):
        return None

    results_dir = output_path + '/' + str(k) + '/results'
    # Clear any earlier run before saving into the same location.
    if os.path.exists(results_dir):
        shutil.rmtree(results_dir)

    assignments = clustersTable.select('activity', 'prediction')
    writer = assignments.write.format('csv')
    writer = writer.option('delimiter', '\t')
    writer = writer.option('header', 'true')
    writer.save(results_dir)
    return None

In [18]:
# getOrCreate reuses the live context, so re-running this cell does not fail
# with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName("tf-idf_clustering_on_spark"))
clustersTable, clusterCenters, documents = run_clustering(sc, 'test_data_bck', 'output', [3])

# NOTE(review): `dist` was not defined anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. It is rebuilt here as the Euclidean
# distance between a row's feature vector and the centre of the cluster it
# was assigned to — confirm this matches the intent of the missing cell.
dist = udf(
    lambda features, prediction: float(
        np.linalg.norm(features.toArray() - clusterCenters[prediction])),
    DoubleType())
clustersTable = clustersTable.withColumn('dist', dist(clustersTable.features, clustersTable.prediction))
clustersTable.show(1)

