In [14]:
import pathlib

from pyspark import SparkContext, SparkConf
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, IDF, StringIndexer, ChiSqSelector, ChiSqSelectorModel, CountVectorizer, CountVectorizerModel
from pyspark.sql import SparkSession

# Source directory of the exercise, given relative to the current working
# directory; the stopword and output files are resolved against its parent.
base_path = pathlib.Path("Exercise_2/sparkly-svm/src")

In [15]:
def create_spark_session():
    """
    Build (or reuse) the SparkSession used by this notebook.

    The application is named "svm" and uses YARN as its master. It requests
    5 executors with 4 cores and 4 GB of memory each, gives the driver 4 GB
    of memory, sets ``spark.driver.maxResultSize`` to 2 GB and the default
    parallelism to 20.

    Returns:
        The session obtained from ``SparkSession.builder.getOrCreate()``.
    """
    # Resource settings, applied in the order listed.
    settings = {
        "spark.executor.memory": "4g",
        "spark.driver.memory": "4g",
        "spark.driver.maxResultSize": "2g",
        "spark.executor.instances": "5",
        "spark.executor.cores": "4",
        "spark.default.parallelism": "20",
    }

    conf = SparkConf().setAppName("svm").setMaster("yarn")
    for key, value in settings.items():
        conf.set(key, value)

    return SparkSession.builder.config(conf=conf).getOrCreate()

In [16]:
# Input reviews on HDFS, the local stopword list, and the local file that
# receives the selected terms.
DATA_PATH = "hdfs:///user/dic24_shared/amazon-reviews/full/reviews_devset.json"
STOPWORD_PATH = base_path.parent/"data/stopwords.txt"
OUTPUT_PATH = base_path.parent/"data/output_ds.txt"

spark = create_spark_session()

# One stopword per line. Read with an explicit encoding so the result does not
# depend on the machine's locale, and strip each entry: a stopword carrying
# stray whitespace can never equal a token, and blank lines are not stopwords.
with open(STOPWORD_PATH, "r", encoding="utf-8") as file:
    stopwords = [
        word
        for word in (line.strip() for line in file.read().splitlines())
        if word
    ]

# Only the review text and its category are used by the pipeline.
df = spark.read.json(DATA_PATH).select("reviewText", "category")

# Character class of delimiters (whitespace, digits, brackets, punctuation and
# currency/special symbols) passed to the tokenizer as its pattern.
regex = r'[ \t\d()\[\]{}.!?,;:+=\-_"\'~#@&*%€$§\/]+'

In [17]:
# Feature pipeline: tokenize -> drop stopwords -> term counts -> IDF weighting
# -> index the category as a numeric label -> chi-squared feature selection.
# NOTE: keep the stage order; the fitted stages are looked up by position
# (indices 2 and 5) when the selected terms are extracted.
stages = [
    # 0: tokenize the review text using the delimiter pattern
    RegexTokenizer(
        pattern=regex,
        inputCol="reviewText",
        outputCol="rawTerms",
    ),
    # 1: remove the stopwords loaded from STOPWORD_PATH
    StopWordsRemover(
        stopWords=stopwords,
        inputCol="rawTerms",
        outputCol="terms",
    ),
    # 2: term counts over the learned vocabulary
    CountVectorizer(
        inputCol="terms",
        outputCol="rawFeatures",
    ),
    # 3: IDF weighting of the raw counts
    IDF(
        inputCol="rawFeatures",
        outputCol="features",
    ),
    # 4: category string -> numeric label
    StringIndexer(
        inputCol="category",
        outputCol="label",
    ),
    # 5: keep the 2000 top features according to the chi-squared selector
    ChiSqSelector(
        numTopFeatures=2000,
        featuresCol="features",
        labelCol="label",
        outputCol="selectedFeatures",
    ),
]

pipeline = Pipeline(stages=stages)

# Fit on the loaded reviews, then transform the same DataFrame and preview it.
model = pipeline.fit(df)
result = model.transform(df)
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|          reviewText|            category|            rawTerms|               terms|         rawFeatures|            features|label|    selectedFeatures|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+
|This was a gift f...|Patio_Lawn_and_Garde|[this, was, a, gi...|[gift, husband, m...|(96505,[2,3,7,8,3...|(96505,[2,3,7,8,3...| 18.0|(2000,[2,3,7,8,35...|
|This is a very ni...|Patio_Lawn_and_Garde|[this, is, a, ver...|[nice, spreader, ...|(96505,[0,1,3,21,...|(96505,[0,1,3,21,...| 18.0|(2000,[0,1,3,21,3...|
|The metal base wi...|Patio_Lawn_and_Garde|[the, metal, base...|[metal, base, hos...|(96505,[4,10,29,1...|(96505,[4,10,29,1...| 18.0|(2000,[4,10,174,3...|
|For the most part...|Patio_Lawn_and_Garde|[for, the, most, ...|[part,

In [18]:
# Map the feature indices chosen by the chi-squared selector back to the terms
# of the CountVectorizer vocabulary.
# Stages 2 and 5 of the fitted pipeline are the fitted CountVectorizer and the
# fitted ChiSqSelector (a ChiSqSelectorModel, not the ChiSqSelector estimator).
cv_model: CountVectorizerModel = model.stages[2] # type: ignore
chisq_model: ChiSqSelectorModel = model.stages[5] # type: ignore

cv_vocab: list[str] = cv_model.vocabulary
chisq_idx: list[int] = chisq_model.selectedFeatures

# De-duplicate and sort the selected terms.
selection: list[str] = sorted({cv_vocab[i] for i in chisq_idx})

print(f"{len(selection)=}, {len(cv_vocab)=}, {len(chisq_idx)=}")

# Write one term per line. The encoding is explicit because review vocabulary
# may contain non-ASCII tokens, which a non-UTF-8 default locale cannot encode.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(selection))

# Release the cluster resources held by the session.
spark.stop()

len(selection)=2000, len(cv_vocab)=96505, len(chisq_idx)=2000
