In [0]:
import pyspark

In [0]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [0]:
# Obtain the active SparkSession, creating one if none exists yet. Some managed
# environments pre-define `spark`, but a plain Jupyter kernel does not, and the
# createDataFrame call below would then fail with NameError on a fresh run.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Three sample sentences keyed by id; the last row separates its words with
# commas instead of spaces.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    ["id", "sentence"],
)

In [0]:
# Preview the raw sentences (show() defaults written out explicitly).
sentenceDataFrame.show(n=20, truncate=True)

In [0]:
# Tokenizer stage: reads the 'sentence' column and writes its output to 'words'.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [0]:
# Regex-based tokenizer over the same input/output columns, configured with
# the pattern \W (any non-word character).
# NOTE(review): with the default `gaps` setting the pattern presumably acts as
# the delimiter rather than the token matcher - confirm in the RegexTokenizer docs.
regex_tokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

In [0]:
# UDF returning the number of elements in an array column as an integer.
# A SQL NULL reaches a Python UDF as None, and len(None) raises TypeError,
# so null input is mapped to a null result instead of failing the whole job.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

In [0]:
# Apply the plain tokenizer; the result carries the additional 'words' column.
tokenized = tokenizer.transform(sentenceDataFrame)

In [0]:
# Inspect the tokenizer output (show() defaults written out explicitly).
tokenized.show(n=20, truncate=True)

In [0]:
# Append a 'tokens' column with the per-row token count and print the rows
# without truncating cell contents.
(
    tokenized
    .withColumn("tokens", count_tokens(col("words")))
    .show(n=20, truncate=False)
)

In [0]:
# Apply the regex tokenizer to the same sentences for comparison.
regindexed = regex_tokenizer.transform(sentenceDataFrame)

In [0]:
# Inspect the regex tokenizer output (show() defaults written out explicitly).
regindexed.show(n=20, truncate=True)

In [0]:
# Same token-count view as for the plain tokenizer, on the regex output.
(
    regindexed
    .withColumn("tokens", count_tokens(col("words")))
    .show(n=20, truncate=False)
)

In [0]:
from pyspark.ml.feature import StopWordsRemover

In [0]:
# Pre-tokenized sample rows for stop-word removal: an id plus a list of raw words.
sentenceData = spark.createDataFrame(
    [
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)


In [0]:
# Stop-word filter: reads the 'raw' word lists and writes the result to 'filtered'.
# No stop-word list is passed, so the library default applies.
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

In [0]:
# Run the remover and print raw vs. filtered word lists without truncation.
(
    remover
    .transform(sentenceData)
    .show(n=20, truncate=False)
)

In [0]:
from pyspark.ml.feature import NGram

In [0]:
# Already-tokenized sentences used as input for n-gram generation.
wordDataFrame = spark.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)


In [0]:
# NGram stage with n=2: reads 'words' and writes its output to 'ngrams'.
ngram = NGram(
    n=2,
    inputCol="words",
    outputCol="ngrams",
)

In [0]:
# Generate the n-grams and print them without truncating the lists.
(
    ngram
    .transform(wordDataFrame)
    .show(n=20, truncate=False)
)

In [0]:
from pyspark.ml.feature import HashingTF,IDF,Tokenizer

In [0]:
# Labelled sentences for the TF-IDF example.
# NOTE(review): this rebinds `sentenceData`, which an earlier cell used for a
# frame with a different schema ('id', 'raw'); re-running that earlier
# StopWordsRemover cell after this one would pick up this frame instead.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

sentenceData.show(n=20, truncate=True)

In [0]:
# Tokenizer for the TF-IDF pipeline: 'sentence' in, 'words' out
# (same configuration as the tokenizer defined earlier in the notebook).
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [0]:
# Tokenize the labelled sentences; adds the 'words' column.
wordsData = tokenizer.transform(sentenceData)

In [0]:
# Print the tokenized rows without truncating cell contents.
wordsData.show(n=20, truncate=False)

In [0]:
# Hashing term-frequency stage: 'words' in, 'rawfeatures' out.
# numFeatures is not set, so the library default applies.
hashingTf = HashingTF(
    inputCol="words",
    outputCol="rawfeatures",
)

In [0]:
# Compute the term-frequency vectors for every tokenized sentence.
featurizedData = hashingTf.transform(wordsData)

In [0]:
# IDF estimator: rescales 'rawfeatures' into the 'features' column once fitted.
idf = IDF(
    inputCol="rawfeatures",
    outputCol="features",
)

In [0]:
# Fit the IDF estimator on the term-frequency data to obtain a model.
idfModel = idf.fit(featurizedData)

In [0]:
# Apply the fitted IDF model; adds the 'features' column.
rescaledData = idfModel.transform(featurizedData)

In [0]:
# Show only the label and the final feature vector for each row.
(
    rescaledData
    .select("label", "features")
    .show(n=20, truncate=True)
)

In [0]:
from pyspark.ml.feature import CountVectorizer

In [0]:
# Two tiny documents, each given as an id and a list of single-letter words.
df = spark.createDataFrame(
    [
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

In [0]:
# CountVectorizer estimator: 'words' in, 'features' out, with vocabSize=3 and
# minDF=2.0.
# NOTE(review): minDF is presumably a minimum document frequency for a term to
# enter the vocabulary - confirm against the CountVectorizer docs.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

In [0]:
# Fit the CountVectorizer on the sample documents to obtain a model.
model = cv.fit(df)

In [0]:
# Vectorize the documents with the fitted model; adds the 'features' column.
result = model.transform(df)

In [0]:
# Print the count vectors without truncating cell contents.
result.show(n=20, truncate=False)
