In [1]:
import os
import requests
import codecs
import re
import nltk
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType
from tqdm import tqdm
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.types import StructField, StructType

In [2]:
#Create SparkConf
sparkConf =SparkConf().setAppName('SentenceVectorExplorationV2').setMaster('local[*]')
#Create SparkContext
sc=SparkContext(conf=sparkConf)

In [3]:
spark = SparkSession(sc)

In [4]:
df = spark.read.option("header", "true").csv("./data/clean/train/*", sep= "\t")

In [5]:
df.show(5)

+----------+--------------------+--------------------+
|     genre|           sentence1|           sentence2|
+----------+--------------------+--------------------+
| telephone|yeah pay fee amer...|american express ...|
|government|now will embark r...|people reviewing ...|
| telephone|umhum okay think ...|oak tree leaves p...|
|government|fiscal year 2002 ...|beginning fiscal ...|
| telephone|well argument one...|statistically con...|
+----------+--------------------+--------------------+
only showing top 5 rows



In [6]:
temp_rdd = df.rdd.map(lambda x: (x.genre, "{} {}".format(x.sentence1, x.sentence2))).collect()

In [7]:
df_agg = spark.createDataFrame(temp_rdd, schema= StructType([
    StructField("genre", StringType()),
    StructField("sentence", StringType())
]))

In [8]:
def partition_sentence2vector(partition):
    embed = hub.Module("./data/pretrained/")
    #rev_text_list = sentence.split(" ")
    generes = []
    sentences = []
    for item in partition:
        generes.append(item[0])
        sentences.append(item[1])
    #print(generes)
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(sentences))
    # return  [(generes[i], message_embeddings[i]) for i in range(message_embeddings.shape[0])]
    return [(genre,vector) for genre,vector in zip(generes,message_embeddings)]

In [9]:
temp_rdd = df_agg.rdd.map(lambda x: (x.genre,x.sentence)).mapPartitions(partition_sentence2vector).map(lambda x: (x[0], Vectors.dense(x[1])))

In [10]:
df_new = temp_rdd.toDF( schema= StructType([
    StructField("genre", StringType()),
    StructField("sentenceVec", VectorUDT())
]))

In [11]:
kmeans = KMeans(featuresCol="sentenceVec", predictionCol="predCluster", k= 5, seed= 1024)

In [13]:
model = kmeans.fit(df_new)

In [14]:
df_result = model.transform(df_new)

In [15]:
df_result.show(5)

+----------+--------------------+-----------+
|     genre|         sentenceVec|predCluster|
+----------+--------------------+-----------+
| telephone|[0.04928012937307...|          2|
|government|[-0.0333102568984...|          2|
| telephone|[0.03496768698096...|          0|
|government|[0.01194196287542...|          2|
| telephone|[0.03472994640469...|          2|
+----------+--------------------+-----------+
only showing top 5 rows



In [16]:
df_temp = df_result.groupBy(["predCluster", "genre"]).count().groupBy(["predCluster"]).agg(F.max("count").alias("count"))
df_temp2 = df_result.groupBy(["predCluster", "genre"]).count().drop("predCluster")
df_join = df_temp.join(df_temp2, on="count", how="inner")
df_cluster2genre = df_join.select(["predCluster", "genre"]).sort("predCluster")
df_cluster2genre.show()

+-----------+----------+
|predCluster|     genre|
+-----------+----------+
|          0|    travel|
|          1| telephone|
|          2|government|
|          3|     slate|
|          4|   fiction|
+-----------+----------+



* ~~由上面的第二个表可以看出, 在考虑每个簇被分为哪类的时候, 按照0,2,1,3,4的顺序进行考虑~~
* 由上面的表可以看出:
    * cluster0 认为是`travel`
    * cluster1 认为是`telephone`
    * cluster2 认为是`government`
    * cluster3 认为是`slate`
    * cluster4 认为是`fiction`

In [17]:
cluster2genre = {}
for cluster_id, genre_name in df_cluster2genre.select(["predCluster", "genre"]).rdd.map(lambda x: (x.predCluster, x.genre)).take(5):
    cluster2genre[cluster_id] = genre_name

In [18]:
def map_cluster2genre(x):
    return cluster2genre[x]

In [19]:
udf_map_cluster2genre = F.udf(map_cluster2genre, StringType())

In [20]:
df_result_2 = df_result.withColumn("predGenre", udf_map_cluster2genre(df_result.predCluster))

In [21]:
true_labels = []
pred_labels = []
for row in df_result_2.select(["genre", "predGenre"]).take(df_result_2.count()):
    true_labels.append(row.genre)
    pred_labels.append(row.predGenre)

In [22]:
df_result_3 = df_result_2.groupBy(["genre", "predGenre"]).count().sort(["genre", "predGenre"])

In [23]:
df.groupBy("genre").count().show()

+----------+-----+
|     genre|count|
+----------+-----+
|    travel|77350|
|     slate|77305|
|   fiction|77272|
|government|77350|
| telephone|83345|
+----------+-----+



In [36]:
genre2total = {
    "travel": 77350,
    "slate": 77305,
    "fiction":77272,
 "government":77350,
 "telephone": 83345
}
def get_percentage(genre, count):
#     return "{}".format(count)
    return "{:.2f}%".format((float(count) / genre2total[genre] * 100))

In [37]:
udf_get_ratio = F.udf(get_percentage, StringType())

In [38]:
confusion_table = df_result_3.withColumn("percent", udf_get_ratio(df_result_3["genre"], df_result_3["count"]))
confusion_table.show()

+----------+----------+-----+-------+
|     genre| predGenre|count|percent|
+----------+----------+-----+-------+
|   fiction|   fiction|48396| 62.63%|
|   fiction|government| 1719|  2.22%|
|   fiction|     slate| 7047|  9.12%|
|   fiction| telephone|16068| 20.79%|
|   fiction|    travel| 4042|  5.23%|
|government|   fiction| 2356|  3.05%|
|government|government|65317| 84.44%|
|government|     slate| 7481|  9.67%|
|government| telephone| 1059|  1.37%|
|government|    travel| 1137|  1.47%|
|     slate|   fiction|17865| 23.11%|
|     slate|government|12128| 15.69%|
|     slate|     slate|38691| 50.05%|
|     slate| telephone| 5015|  6.49%|
|     slate|    travel| 3606|  4.66%|
| telephone|   fiction|43330| 51.99%|
| telephone|government| 8641| 10.37%|
| telephone|     slate|10986| 13.18%|
| telephone| telephone|16629| 19.95%|
| telephone|    travel| 3759|  4.51%|
+----------+----------+-----+-------+
only showing top 20 rows



In [39]:
result = confusion_table.select(["genre", "predGenre", "percent"]).take(confusion_table.count())

In [40]:
result_data = {}
for row in result:
    temp = result_data.get(row.genre, {})
    temp[row.predGenre] = row.percent
    result_data[row.genre] = temp

In [41]:
genres = list(genre2total.keys())
pd_data = [{} for i in range(len(genres))]
for genre1 in genres:
    for index, genre2 in enumerate(genres):
        pd_data[index][genre1] = result_data[genre1].get(genre2, "0.00%")

In [42]:
print(pd.DataFrame(pd_data, index= genres))

            travel   slate fiction government telephone
travel      80.23%   4.66%   5.23%      1.47%     4.51%
slate        9.91%  50.05%   9.12%      9.67%    13.18%
fiction      7.16%  23.11%  62.63%      3.05%    51.99%
government   2.20%  15.69%   2.22%     84.44%    10.37%
telephone    0.51%   6.49%  20.79%      1.37%    19.95%
