In [65]:
import findspark
findspark.init()

import pyspark

from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler
from pyspark.sql.functions import udf, concat, lit, col, avg, max, min, stddev, count
from pyspark.sql.types import IntegerType

from pyspark.ml.clustering import KMeans

In [2]:
# Build (or reuse) a local SparkSession. Note: this step was left out of the screencast.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

### Read dataset

In [3]:
# Path (relative to the working directory) of the JSON file loaded in the next step.
stack_overflow_data = "data/Train_onetag_small.json"

In [4]:
# Load the JSON dataset into a Spark DataFrame.
df = spark.read.format("json").load(stack_overflow_data)

### Build Description Length Features

In [5]:
# "Desc" is the question title followed by a single space and the question body.
title_and_body = concat(col("Title"), lit(" "), col("Body"))
df = df.withColumn("Desc", title_and_body)

In [6]:
# Tokenize the "Desc" text into a "words" array column, using non-word characters
# (regex \W) as the pattern.
regexTokenizer = RegexTokenizer(inputCol="Desc", outputCol="words", pattern="\\W")

# Drop any "words" column left over from a previous run of this cell: the transformer
# refuses to write to an output column that already exists, so without the drop the cell
# fails when re-executed. `drop` is a no-op when the column is absent.
df = regexTokenizer.transform(df.drop("words"))

In [7]:
def _token_count(tokens):
    """Return the number of elements in the given token list."""
    return len(tokens)


# Spark UDF wrapping the counter above; the result is stored as an integer column.
body_length = udf(_token_count, IntegerType())
df = df.withColumn("DescLength", body_length(df["words"]))

In [8]:
# Wrap the scalar "DescLength" into a one-element feature vector ("DescVec"),
# the input format expected by the Spark ML estimators used below.
assembler = VectorAssembler(inputCols=["DescLength"], outputCol="DescVec")

# Drop a "DescVec" column left over from a previous run of this cell: the assembler
# refuses to write to an output column that already exists, so without the drop the cell
# fails when re-executed. `drop` is a no-op when the column is absent.
df = assembler.transform(df.drop("DescVec"))

In [9]:
def _count_tags(tags):
    """Return the number of whitespace-separated tags in `tags`.

    A missing (None) or blank value counts as 0 tags. Splitting on arbitrary
    whitespace (rather than on a single space) keeps leading, trailing or doubled
    spaces from being counted as extra, empty tags.
    """
    if not tags:
        return 0
    return len(tags.split())


# Spark UDF producing the integer "NumTags" column from the "Tags" string column.
number_of_tags = udf(_count_tags, IntegerType())
df = df.withColumn("NumTags", number_of_tags(df.Tags))

### Question 1

In [32]:
# How many times greater is the Description Length of the longest question than the Description Length
# of the shortest question (rounded to the nearest whole number)?

# Tip: Don't forget to import Spark SQL's aggregate functions that can operate on DataFrame columns.

# NOTE: `max` and `min` below are the Spark SQL aggregate functions imported at the top
# of the notebook (they shadow the Python builtins of the same name).
maxLengDesc = df.select([max("DescLength")])

minLengDesc = df.select([min("DescLength")])

# Fetch each aggregate once and name the scalar values.
longest_desc_length = maxLengDesc.first()["max(DescLength)"]
shortest_desc_length = minLengDesc.first()["min(DescLength)"]

# The question asks for the ratio rounded to the NEAREST whole number; floor division
# (`//`) would always round down. Raises ZeroDivisionError if the shortest length is 0.
desc_length_ratio = round(longest_desc_length / shortest_desc_length)

print(f'valor maximo de descripcion es \
      {desc_length_ratio} \
      veces mayor que valor minimo')

In [48]:
# Ratio of the longest to the shortest description length, rounded to the nearest whole
# number as the question requires (floor division `//` would always round down).
print(f'valor maximo de descripcion es \
      {round(maxLengDesc.collect()[0]["max(DescLength)"] / minLengDesc.collect()[0]["min(DescLength)"])} \
      veces mayor que valor minimo')

valor maximo de descripcion es       753       veces mayor que valor minimo


In [56]:
# What is the mean and standard deviation of the Description length?

# One-row DataFrames holding the average and the sample standard deviation.
meanLenDesc = df.agg(avg("DescLength"))
stdvLenDesc = df.agg(stddev("DescLength"))

# Pull the single result row out of each aggregate before printing.
mean_row = meanLenDesc.first()
stddev_row = stdvLenDesc.first()

print(f'media: { mean_row["avg(DescLength)"] }')
print(f'dev std: { stddev_row["stddev_samp(DescLength)"] }')

media: 180.28187
dev std: 192.10819533505023


In [62]:
# Let's use K-means to create 5 clusters of Description Lengths. 
# Set the random seed to 42 and fit a 5-class K-means model on the Description 
# Length column (you can use KMeans().setParams(...) ).
# What length is the center of the cluster representing the longest questions?

kmeans = KMeans().setParams(featuresCol="DescVec", predictionCol="DescGroup", k=5, seed=42)

# Fit and predict on a frame without "DescGroup". Because the result is assigned back
# to `df`, a second run of this cell would otherwise see the prediction column from the
# first run and fail, since Spark ML will not append a column that already exists.
# `drop` is a no-op when the column is absent.
unclustered_df = df.drop("DescGroup")
model = kmeans.fit(unclustered_df)

# Adds the integer cluster id of every row as "DescGroup".
# The fitted centers themselves are available through `model.clusterCenters()`.
df = model.transform(unclustered_df)

In [66]:
# Per-cluster summary: mean description length, mean number of tags and row count,
# listed from the shortest-question cluster to the longest.
cluster_summary = df.groupBy("DescGroup").agg(
    avg("DescLength"),
    avg("NumTags"),
    count("DescLength"),
)
cluster_summary.orderBy("avg(DescLength)").show()

+---------+------------------+------------------+-----------------+
|DescGroup|   avg(DescLength)|      avg(NumTags)|count(DescLength)|
+---------+------------------+------------------+-----------------+
|        0| 96.02297592997812|2.7428884026258205|            63066|
|        4|238.22969197457567|3.0864357058042886|            28634|
|        1|    492.6833982403|3.2330881292369824|             6933|
|        3|1062.4118629908103|3.2957393483709274|             1197|
|        2|2726.1882352941175|3.4235294117647057|              170|
+---------+------------------+------------------+-----------------+

