# In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id, split
from pyspark.sql.types import IntegerType

# Word-count preprocessing pipeline:
#   1. read the reducer output (text lines) from HDFS,
#   2. parse each line into a (word, count) pair,
#   3. sum the counts per word,
#   4. wrap the per-word total into an ML feature vector.

# Location of the reducer output on HDFS. Each line is parsed below as a
# tab-separated "word<TAB>count" record.
data_path = ""
if not data_path:
    # Fail fast with a clear message instead of the opaque path error that
    # spark.read.text("") raises once the session has already been started.
    raise ValueError(
        "data_path is empty; set it to the HDFS path of the reducer output"
    )

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("PySparkPreprocessing") \
    .getOrCreate()

try:
    # Read the reducer output: one row per input line, in a single 'value' column.
    df = spark.read.text(data_path)

    # Parse the reducer output: field 0 is the word, field 1 is its integer count.
    split_col = split(df['value'], '\t')
    df_parsed = df.withColumn('word', split_col.getItem(0)) \
                  .withColumn('count', split_col.getItem(1).cast(IntegerType())) \
                  .drop('value')

    # Under non-ANSI SQL semantics a missing or non-numeric count field parses
    # to null. Drop those records: a word whose counts are all null would get a
    # null total, which VectorAssembler rejects with its default
    # handleInvalid='error'.
    df_parsed = df_parsed.filter(col('count').isNotNull())

    # Aggregate to find the total count per word. groupBy().sum() names its
    # output column 'sum(count)', so rename it to something friendlier.
    df_aggregated = df_parsed.groupBy('word').sum('count') \
                             .withColumnRenamed('sum(count)', 'total_count')

    df_aggregated.show()

    # A simplified scenario where we only use the word counts as features.

    # Attach a row identifier so each feature vector can be traced back to its
    # word. VectorAssembler itself does not need it. The generated ids are
    # unique and increasing but not necessarily consecutive.
    df_indexed = df_aggregated.withColumn("id", monotonically_increasing_id())

    # Assemble the 'total_count' into a (one-element) feature vector
    vectorAssembler = VectorAssembler(inputCols=['total_count'], outputCol='features')
    df_vectorized = vectorAssembler.transform(df_indexed)

    df_vectorized.select('id', 'word', 'features').show()
finally:
    # Always release the Spark session, even if a step above fails.
    spark.stop()
