In [1]:
import os
import requests
import codecs
import re
import nltk
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from tqdm import tqdm
from pyspark.sql.types import StructField, StructType
from pyspark import StorageLevel

In [2]:
#Create SparkConf
sparkConf =SparkConf().setAppName('VocabularyExploration').setMaster('local[*]')
#Create SparkContext
sc=SparkContext(conf=sparkConf)

In [3]:
spark = SparkSession(sc)

In [4]:
df_dev_matched = spark.read.option("header", "true").csv("./data/clean/dev_matched/*", sep="\t")
df_dev_mismatched = spark.read.option("header", "true").csv("./data/clean/dev_mismatched/*", sep="\t")
df_test_matched = spark.read.option("header", "true").csv("./data/clean/test_matched/*", sep="\t")
df_test_mismatched = spark.read.option("header", "true").csv("./data/clean/test_mismatched/*", sep="\t")

In [5]:
matched_rdd = df_dev_matched.select(["sentence1", "sentence2"])\
                        .rdd.map(lambda x: "{} {}".format(x.sentence1,x.sentence2)).union(\
df_test_matched.select(["sentence1", "sentence2"]).\
                        rdd.map(lambda x: "{} {}".format(x.sentence1,x.sentence2))\
    ).flatMap(lambda x: x.split(" ")).\
map(lambda x: (x,1)).reduceByKey(lambda x,y: 1).map(lambda x: x[0])

In [6]:
mismatched_rdd = df_dev_mismatched.select(["sentence1", "sentence2"])\
                        .rdd.map(lambda x: "{} {}".format(x.sentence1,x.sentence2)).union(\
df_test_mismatched.select(["sentence1", "sentence2"]).\
                        rdd.map(lambda x: "{} {}".format(x.sentence1,x.sentence2))\
    ).flatMap(lambda x: x.split(" ")).\
map(lambda x: (x,1)).reduceByKey(lambda x,y: 1).map(lambda x: x[0])

## first workload

1. the number of common words between matched and mismatched sets

In [7]:
matched_rdd.union(mismatched_rdd).map(lambda x: (1,1)).reduceByKey(lambda x,y: x+y).map(lambda x: x[1]).collect()

[37630]

2. the number of words unique to the matched sets

In [8]:
matched_rdd.subtract(mismatched_rdd).map(lambda x: (1,1)).reduceByKey(lambda x,y: x+y).map(lambda x: x[1]).collect()

[10740]

3. the number of words unique to the mismatched sets

In [9]:
mismatched_rdd.subtract(matched_rdd).map(lambda x: (1,1)).reduceByKey(lambda x,y: x+y).map(lambda x: x[1]).collect()

[8160]

## second workload

### consider the corpus without filter by stopwords

In [10]:
df_train = spark.read.option("header", "true").csv("./data/clean/nofilter_train/*", sep="\t")

In [11]:
rdd_train = df_train.rdd.map(lambda x: (x.genre,"{} {}".format(x.sentence1,x.sentence2)))

In [12]:
def one2many(x):
    temp = x[1].split(" ")
    return [(word, x[0]) for word in temp]

In [13]:
rdd_word2count = rdd_train.reduceByKey(lambda x,y: "{} {}".format(x, y)).flatMap(lambda x: one2many(x)).\
    reduceByKey(lambda x,y: "{} {}".format(x,y)).map(lambda x: (x[0], set(x[1].split(" ")))).\
        map(lambda x: (x[0], len(x[1])))

In [14]:
total = rdd_word2count.count()
rdd_word2count.map(lambda x: (x[1], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], x[1]/total)).collect()

[(1, 0.6396532227666792),
 (2, 0.13291985059795086),
 (3, 0.07808198837222584),
 (4, 0.06094872585637757),
 (5, 0.0883962124067665)]

### consider the corpus filter by stopwords

In [15]:
df_train = spark.read.option("header", "true").csv("./data/clean/train/*", sep="\t")

In [16]:
rdd_train = df_train.rdd.map(lambda x: (x.genre,"{} {}".format(x.sentence1,x.sentence2)))

In [17]:
rdd_word2count = rdd_train.reduceByKey(lambda x,y: "{} {}".format(x, y)).flatMap(lambda x: one2many(x)).\
    reduceByKey(lambda x,y: "{} {}".format(x,y)).map(lambda x: (x[0], set(x[1].split(" ")))).\
        map(lambda x: (x[0], len(x[1])))

In [18]:
total = rdd_word2count.count()
rdd_word2count.map(lambda x: (x[1], 1)).reduceByKey(lambda x,y: x+y).map(lambda x: (x[0], x[1]/total)).collect()

[(1, 0.6408088067011489),
 (2, 0.13315329335835585),
 (3, 0.07819151370897606),
 (4, 0.06100379914862453),
 (5, 0.08684258708289468)]