In [1]:
import findspark
findspark.init()

import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, concat, lit, col, avg, desc, explode, min, max, split
from pyspark.sql.types import IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer

import re

In [2]:
# Build the SparkSession (or reuse one that is already running) against a local master.
# Note: this step was left out of the screencast.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

## Read in the dataset

In [3]:
# Relative path of the JSON dataset that the next cell loads.
stack_overflow_data = "data/Train_onetag_small.json"

In [4]:
# Load the JSON file into a Spark DataFrame.
df = spark.read.json(path=stack_overflow_data)
# Mark the DataFrame for caching; persist() returns the DataFrame itself, so as the
# last expression of the cell it also echoes the schema.
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

### Dividir en train y test

In [5]:
# Hold out roughly 20% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)

# Disabled alternative: three-way split (60% train / 20% test / 20% validation).
# train, rest = df.randomSplit([0.6, 0.4], seed=42)
# test, validation = rest.randomSplit([0.5, 0.5], seed=42)

### Tubería

In [6]:
# Label stage: turn each `oneTag` string into a numeric `label` index.
indexer = StringIndexer(inputCol="oneTag", outputCol="label")

# Text-feature stages: tokenize `Body` with the \W pattern, count terms over a
# 10,000-word vocabulary, then apply IDF weighting to produce `features`.
regexTokenizer = RegexTokenizer(
    inputCol="Body",
    outputCol="words",
    pattern="\\W",
)
cv = CountVectorizer(
    inputCol="words",
    outputCol="TF",
    vocabSize=10000,
)
idf = IDF(inputCol="TF", outputCol="features")

# Classifier: logistic regression, at most 10 iterations, regularization turned off.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.0,
    elasticNetParam=0,
)

# Stage order matters: each stage reads a column produced by an earlier one.
pipeline = Pipeline(stages=[regexTokenizer, cv, idf, indexer, lr])


In [None]:
# Fit all pipeline stages on the training split and keep the fitted model.
pmodel = pipeline.fit(dataset=train)


In [None]:
# Score the held-out test split with the fitted pipeline.
results = pmodel.transform(test)
# Peek at the first scored row (must use `results`, the name assigned above).
results.head()

In [None]:
# Number of test rows whose predicted label equals the true label.
print(results.filter(results.label == results.prediction).count())
# Total number of test rows.
print(results.count())

# Divide the first number by the second to get the accuracy; it should be 0.38522...

### CrossValidation

In [12]:
# Hyper-parameter grid: 2 vocabulary sizes x 2 regularization strengths = 4 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(cv.vocabSize, [1000, 5000])
    .addGrid(lr.regParam, [0.0, 0.1])
    .build()
)

# 3-fold cross-validation over the whole pipeline.
# NOTE(review): the evaluator is left at its default metric (F1 per the PySpark docs,
# not accuracy) -- confirm this is the metric intended for model selection.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=3,
)

### Evaluación de resultados

In [13]:
# Run the cross-validated grid search on the training split.
cvModel = crossval.fit(dataset=train)

# One averaged evaluator score per parameter combination in the grid.
cvModel.avgMetrics

[0.3041985815210171,
 0.23266051199487747,
 0.3624639927038321,
 0.2820422997543126]

In [14]:
# Score the test split with the cross-validated model.
result2 = cvModel.transform(dataset=test)

# Correct predictions first, then the total row count, for the test split.
print(result2.filter(col("label") == col("prediction")).count())
print(result2.count())

7802
20099
