In [1]:
import os
import requests
import codecs
import re
import nltk
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, IntegerType
from tqdm import tqdm
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.clustering import KMeans
from pyspark.ml.pipeline import Pipeline
import numpy as np
import pandas as pd

In [2]:
from pyspark.sql.types import StructField, StructType

In [3]:
# Spark configuration: application name plus a 'local[*]' master.
sparkConf = SparkConf().setAppName('SentenceVectorExploration').setMaster('local[*]')
# Reuse the active SparkContext if one already exists. Calling the
# SparkContext constructor a second time (e.g. when this cell is re-run in
# the same kernel) raises "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=sparkConf)

In [4]:
# Wrap the SparkContext created above in a SparkSession (DataFrame entry point).
spark = SparkSession(sparkContext=sc)

In [5]:
# Load every file under the training directory as tab-separated text,
# taking column names from the header line.
df = spark.read.csv("./data/clean/train/*", sep="\t", header=True)

In [6]:
# Preview the first five rows of the raw training data.
df.show(n=5)

+----------+--------------------+--------------------+
|     genre|           sentence1|           sentence2|
+----------+--------------------+--------------------+
| telephone|yeah pay fee amer...|american express ...|
|government|now will embark r...|people reviewing ...|
| telephone|umhum okay think ...|oak tree leaves p...|
|government|fiscal year 2002 ...|beginning fiscal ...|
| telephone|well argument one...|statistically con...|
+----------+--------------------+--------------------+
only showing top 5 rows



In [7]:
# Pair each row's genre with its two sentences joined by a single space.
# The result is kept as a distributed RDD (no .collect()): the following
# spark.createDataFrame(temp_rdd, schema=...) call accepts an RDD directly,
# so the whole training set no longer has to be materialised on the driver
# and shipped back to the executors.
temp_rdd = df.rdd.map(lambda x: (x.genre, "{} {}".format(x.sentence1, x.sentence2)))

In [8]:
# Two string columns: the genre label and the combined sentence text.
agg_schema = StructType([
    StructField("genre", StringType()),
    StructField("sentence", StringType()),
])
df_agg = spark.createDataFrame(temp_rdd, schema=agg_schema)

In [9]:
# Stage 1: tokenize the `sentence` column into a `words` column.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [10]:
# Stage 2: hash the tokens into a 512-dimensional term-frequency vector.
hashingTf = HashingTF(
    inputCol="words",
    outputCol="tf",
    numFeatures=512,
)

In [11]:
# Stage 3: rescale the term frequencies in `tf` into TF-IDF weights in `tfidf`.
idf = IDF(
    inputCol="tf",
    outputCol="tfidf",
    minDocFreq=1,
)

In [18]:
# Stage 4: cluster the TF-IDF vectors into k=5 groups; the fixed seed keeps
# the clustering reproducible between runs.
kmeans = KMeans(
    featuresCol="tfidf",
    predictionCol="predCluster",
    k=5,
    seed=1024,
)

In [19]:
# Chain the four stages: tokenize -> hashed TF -> IDF -> k-means.
pipeline_stages = [tokenizer, hashingTf, idf, kmeans]
pipeline = Pipeline(stages=pipeline_stages)

In [20]:
# Fit every pipeline stage on the combined-sentence DataFrame.
model = pipeline.fit(dataset=df_agg)

In [21]:
# Apply the fitted pipeline to the same data it was trained on; this adds
# the `words`, `tf`, `tfidf` and `predCluster` columns.
df_result = model.transform(dataset=df_agg)

In [22]:
# Preview the first five rows of the clustered output.
df_result.show(n=5)

+----------+--------------------+--------------------+--------------------+--------------------+-----------+
|     genre|            sentence|               words|                  tf|               tfidf|predCluster|
+----------+--------------------+--------------------+--------------------+--------------------+-----------+
| telephone|yeah pay fee amer...|[yeah, pay, fee, ...|(512,[9,14,84,138...|(512,[9,14,84,138...|          0|
|government|now will embark r...|[now, will, embar...|(512,[24,73,108,1...|(512,[24,73,108,1...|          1|
| telephone|umhum okay think ...|[umhum, okay, thi...|(512,[9,39,52,81,...|(512,[9,39,52,81,...|          2|
|government|fiscal year 2002 ...|[fiscal, year, 20...|(512,[1,9,31,32,3...|(512,[1,9,31,32,3...|          1|
| telephone|well argument one...|[well, argument, ...|(512,[31,39,45,11...|(512,[31,39,45,11...|          0|
+----------+--------------------+--------------------+--------------------+--------------------+-----------+
only showing top 5 

In [23]:
# Map every cluster to the genre that occurs most often inside it.
#
# Row count for each (cluster, genre) pair; computed once and used on both
# sides of the join below.
df_cluster_genre_count = df_result.groupBy(["predCluster", "genre"]).count()
# Largest per-genre count within each cluster.
df_temp = df_cluster_genre_count.groupBy(["predCluster"]).agg(F.max("count").alias("count"))
df_temp2 = df_cluster_genre_count
# Join on BOTH the cluster id and the count. Joining on `count` alone would
# also match rows from other clusters that happen to have the same count and
# assign a cluster a genre that is not its majority genre.
df_join = df_temp.join(df_temp2, on=["predCluster", "count"], how="inner")
# If two genres tie for the maximum inside one cluster, keep the
# alphabetically first so that each cluster yields exactly one row.
df_cluster2genre = (
    df_join
    .groupBy("predCluster")
    .agg(F.min("genre").alias("genre"))
    .sort("predCluster")
)
df_cluster2genre.show()

+-----------+----------+
|predCluster|     genre|
+-----------+----------+
|          0|   fiction|
|          1|government|
|          2| telephone|
|          3|    travel|
|          4|    travel|
+-----------+----------+



* ~~由上面的第二个表可以看出, 在考虑每个簇被分为哪类的时候, 按照0,2,1,3,4的顺序进行考虑~~
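* 由上面的表可以看出, 每个簇被映射为该簇中样本数最多的 genre:
    * cluster0 认为是`fiction`
    * cluster1 认为是`government`
    * cluster2 认为是`telephone`
    * cluster3 认为是`travel`
    * cluster4 认为是`travel`
* 注意: cluster3 和 cluster4 都被映射为`travel`, 没有任何簇被映射为`slate`, 因此后面的结果中不会出现预测为`slate`的样本。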

In [24]:
# Driver-side lookup table: cluster id -> genre name.
# collect() reads every row of the small mapping table, so nothing is
# silently dropped if the number of clusters differs from a hard-coded limit.
cluster2genre = {
    row.predCluster: row.genre
    for row in df_cluster2genre.select(["predCluster", "genre"]).collect()
}

In [25]:
def map_cluster2genre(x):
    """Return the genre name assigned to cluster id ``x``.

    Looks ``x`` up in the module-level ``cluster2genre`` dict; an id that is
    not a key of that dict raises ``KeyError``.
    """
    genre_name = cluster2genre[x]
    return genre_name

In [26]:
# Expose the cluster -> genre lookup as a Spark UDF returning a string.
udf_map_cluster2genre = F.udf(f=map_cluster2genre, returnType=StringType())

In [27]:
# Add `predGenre`: the genre assigned to each row's predicted cluster.
pred_genre_col = udf_map_cluster2genre(df_result["predCluster"])
df_result_2 = df_result.withColumn("predGenre", pred_genre_col)

In [28]:
# Bring the true and predicted genre of every row back to the driver.
# A single collect() replaces take(count()), which ran the whole pipeline
# (including the Python UDF) twice: once to count and once to fetch.
label_rows = df_result_2.select(["genre", "predGenre"]).collect()
true_labels = [row.genre for row in label_rows]
pred_labels = [row.predGenre for row in label_rows]

In [30]:
# Count rows for every (true genre, predicted genre) pair, ordered by both keys.
df_result_3 = (
    df_result_2
    .groupBy("genre", "predGenre")
    .count()
    .orderBy("genre", "predGenre")
)

In [31]:
# Total number of rows per genre in the raw training data.
genre_counts = df.groupBy("genre").count()
genre_counts.show()

+----------+-----+
|     genre|count|
+----------+-----+
|    travel|77350|
|     slate|77305|
|   fiction|77272|
|government|77350|
| telephone|83345|
+----------+-----+



In [32]:
# Rows per genre, matching the per-genre counts displayed for the training
# DataFrame. The key order is reused later as the row/column order of the
# final table.
genre2total = {
    "travel": 77350,
    "slate": 77305,
    "fiction": 77272,
    "government": 77350,
    "telephone": 83345,
}


def get_percentage(genre, count):
    """Format ``count`` as a percentage of the total rows for ``genre``.

    Args:
        genre: Genre name; must be a key of ``genre2total``.
        count: Number of rows to express as a share of that genre's total.

    Returns:
        The share as a string with two decimals and a trailing ``%``,
        e.g. ``"90.85%"``.

    Raises:
        KeyError: If ``genre`` is not a key of ``genre2total``.
    """
    share = float(count) / genre2total[genre] * 100
    return "%.2f%%" % share

In [33]:
# Expose the percentage formatter as a Spark UDF returning a string.
udf_get_ratio = F.udf(f=get_percentage, returnType=StringType())

In [34]:
# Add `percent`: each pair's count as a share of its true genre's total rows.
percent_col = udf_get_ratio(F.col("genre"), F.col("count"))
confusion_table = df_result_3.withColumn("percent", percent_col)
confusion_table.show()

+----------+----------+-----+-------+
|     genre| predGenre|count|percent|
+----------+----------+-----+-------+
|   fiction|   fiction|70199| 90.85%|
|   fiction|government| 4867|  6.30%|
|   fiction| telephone|  390|  0.50%|
|   fiction|    travel| 1816|  2.35%|
|government|   fiction|25722| 33.25%|
|government|government|47276| 61.12%|
|government| telephone|  980|  1.27%|
|government|    travel| 3372|  4.36%|
|     slate|   fiction|47886| 61.94%|
|     slate|government|25653| 33.18%|
|     slate| telephone|  591|  0.76%|
|     slate|    travel| 3175|  4.11%|
| telephone|   fiction|53424| 64.10%|
| telephone|government| 3450|  4.14%|
| telephone| telephone|24933| 29.92%|
| telephone|    travel| 1538|  1.85%|
|    travel|   fiction|33397| 43.18%|
|    travel|government|38924| 50.32%|
|    travel| telephone|  838|  1.08%|
|    travel|    travel| 4191|  5.42%|
+----------+----------+-----+-------+



In [35]:
# Fetch the (small) confusion table to the driver. collect() returns every
# row in one job; take(count()) evaluated the whole plan twice.
result = confusion_table.select(["genre", "predGenre", "percent"]).collect()

In [36]:
# Nested lookup: result_data[true genre][predicted genre] -> percent string.
result_data = {}
for row in result:
    result_data.setdefault(row.genre, {})[row.predGenre] = row.percent

In [37]:
# Build one record per PREDICTED genre (the rows of the final table); each
# record maps a TRUE genre (the columns) to its percent string. Pairs that
# never occurred are shown as "0.00%".
genres = list(genre2total.keys())
pd_data = [
    {
        true_genre: result_data[true_genre].get(pred_genre, "0.00%")
        for true_genre in genres
    }
    for pred_genre in genres
]

In [38]:
# Rows are predicted genres, columns are true genres.
confusion_matrix = pd.DataFrame(data=pd_data, index=genres)
print(confusion_matrix)

            travel   slate fiction government telephone
travel       5.42%   4.11%   2.35%      4.36%     1.85%
slate        0.00%   0.00%   0.00%      0.00%     0.00%
fiction     43.18%  61.94%  90.85%     33.25%    64.10%
government  50.32%  33.18%   6.30%     61.12%     4.14%
telephone    1.08%   0.76%   0.50%      1.27%    29.92%
