## Normal Pipeline
1. Quick Explore
2. Check NULL
## NLP Pipeline steps
1. Stop word removal
2. Casing removal
3. Tokenization
4. Stemming (or Lemmatization)
5. Embedding/Vectorization

In [47]:
#  Get CWD
import os
cwd = os.getcwd()
print(f'CWD: {cwd}')
import sys

hadoop_home_path = f'{cwd}/windows/winutils/hadoop-3.0.0'
os.environ['HADOOP_HOME'] = hadoop_home_path
sys.path.append(f'{hadoop_home_path}/bin')

print('HADOOP_HOME ENV var: ', os.environ['HADOOP_HOME'])

CWD: f:\Projects\sparkimental
HADOOP_HOME ENV var:  f:\Projects\sparkimental/windows/winutils/hadoop-3.0.0


In [48]:
# Imports
import findspark
from pyspark import SparkContext
from pyspark.sql.types import *
from pprint import pprint, pformat

In [49]:
# Find spark to make sure pyspark works (note: make sure only :))
findspark.init()
findspark.find()

'f:\\AppSSD\\WorkTools\\Anaconda\\envs\\sparkimental\\lib\\site-packages\\pyspark'

In [50]:
# Setup spark environment
sc = SparkContext.getOrCreate()
print('Spark version: ', sc.version)
print(f'Hadoop version = {sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion()}')
print('Spark Config:')
pprint(sc.getConf().getAll())


Spark version:  3.1.2
Hadoop version = 3.2.0
Spark Config:
[('spark.sql.warehouse.dir', 'file:/f:/Projects/sparkimental/spark-warehouse'),
 ('spark.driver.host', 'ProdudePredator.mshome.net'),
 ('spark.app.startTime', '1666862767841'),
 ('spark.app.id', 'local-1666862769943'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.port', '60313'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.app.name', 'pyspark-shell')]


In [51]:
# Extra import (move later)
from pyspark import SparkFiles
from pyspark import SQLContext

In [52]:
# add data file from local system to Spark's RDD
path = cwd.replace('\\', '/').replace('f:/','')
full_path = f'file:///{path}/data/animal-crossing.csv'
sc.addFile(full_path)

In [53]:
# Create sql context
sqlContext = SQLContext(sc)

In [54]:
# read file from hdfs/rdd
file_path = SparkFiles.get('animal-crossing.csv')
print('File path in Spark File system: ', file_path)
df = sqlContext.read.csv(path=file_path, header=True, inferSchema=True)

File path in Spark File system:  C:\Users\USER\AppData\Local\Temp\spark-31a43f65-7128-4b5a-8950-5fe3648df162\userFiles-e4460f48-1d09-4d43-8ce7-f9ea58c4cbbe\animal-crossing.csv


In [55]:
df.printSchema()

root
 |-- grade: integer (nullable = true)
 |-- publication: string (nullable = true)
 |-- text: string (nullable = true)
 |-- date: string (nullable = true)



In [56]:
# Type cast
df = df.withColumn('date', df.date.cast(DateType()))
df.printSchema()

root
 |-- grade: integer (nullable = true)
 |-- publication: string (nullable = true)
 |-- text: string (nullable = true)
 |-- date: date (nullable = true)



In [57]:
# Drop useless column
df = df.select(['date', 'text', 'grade'])
print(f'Data shape: {df.count(), len(df.columns)}')
# Show data samples
df.sample(0.1, seed=0).show(10)

Data shape: (107, 3)
+----------+--------------------+-----+
|      date|                text|grade|
+----------+--------------------+-----+
|2020-03-16|With a game this ...|  100|
|2020-03-16|Similar to how Br...|  100|
|2020-03-16|Animal Crossing: ...|   93|
|2020-03-16|New Horizons has ...|   90|
|2020-03-16|It’s a blissfully...|   80|
|2020-03-23|Animal Crossing N...|   80|
|2020-03-24|Animal Crossing: ...|   98|
|2020-03-26|Quotation forthco...|   90|
|2020-04-02|If Animal Crossin...|   90|
|2020-04-08|It's an exception...|   90|
+----------+--------------------+-----+



In [58]:
# Basic stats
df.describe().show()

+-------+--------------------+-----------------+
|summary|                text|            grade|
+-------+--------------------+-----------------+
|  count|                 107|              107|
|   mean|                null| 90.6355140186916|
| stddev|                null|6.114308185868841|
|    min|A beautiful, welc...|               70|
|    max|“New Horizons mak...|              100|
+-------+--------------------+-----------------+



In [59]:
# Nan check
from pyspark.sql.functions import isnan, when, count, col

print("Nan checking")
df.select(
    [
        count(
            when(
                isnan(c)
                | col(c).isNull()
                | (col(c) == "")
                | col(c).contains("None")
                | col(c).contains("Null"),
                c,
            )
        ).alias(c)
        for c in ["grade"]
    ]
).show()

df.select(
    [
        count(
            when(
                col(c).contains("None")
                | col(c).contains("NULL")
                | (col(c) == "")
                | col(c).isNull()
                | isnan(c),
                c,
            )
        ).alias(c)
        for c in ["text"]
    ]
).show()


Nan checking
+-----+
|grade|
+-----+
|    0|
+-----+

+----+
|text|
+----+
|   0|
+----+



# NLP

Refs:
1. https://github.com/SurajMalpani/NLP-using-Spark/blob/master/nlp-using-pyspark-ml.ipynb
2. https://stackoverflow.com/questions/53579444/efficient-text-preprocessing-using-pyspark-clean-tokenize-stopwords-stemming
3. 


```Build steps into pipeline after done```


In [60]:
from pyspark.ml.feature import StopWordsRemover, RegexTokenizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer

df = df.select('text')

In [61]:
regex_tokenizer = RegexTokenizer(inputCol='text', outputCol='words', toLowercase=True)
raw_words_df = regex_tokenizer.transform(df)
raw_words_df.show(5, truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                              text|                                             words|
+--------------------------------------------------+--------------------------------------------------+
|Animal Crossing; New Horizons, much like its pr...|[animal, crossing;, new, horizons,, much, like,...|
|Know that if you’re overwhelmed with the world,...|[know, that, if, you’re, overwhelmed, with, the...|
|With a game this broad and lengthy, there’s mor...|[with, a, game, this, broad, and, lengthy,, the...|
|Animal Crossing: New Horizons is everything I h...|[animal, crossing:, new, horizons, is, everythi...|
|Above all else, Animal Crossing: New Horizons i...|[above, all, else,, animal, crossing:, new, hor...|
+--------------------------------------------------+--------------------------------------------------+
only showing top 5 rows



In [62]:
stop_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filtered_df = stop_remover.transform(raw_words_df)
filtered_df.select(['words', 'filtered']).show(5, truncate=50)
# Note how 'a' is removed

+--------------------------------------------------+--------------------------------------------------+
|                                             words|                                          filtered|
+--------------------------------------------------+--------------------------------------------------+
|[animal, crossing;, new, horizons,, much, like,...|[animal, crossing;, new, horizons,, much, like,...|
|[know, that, if, you’re, overwhelmed, with, the...|[know, you’re, overwhelmed, world,, stuck, insi...|
|[with, a, game, this, broad, and, lengthy,, the...|[game, broad, lengthy,, there’s, discuss, fit, ...|
|[animal, crossing:, new, horizons, is, everythi...|[animal, crossing:, new, horizons, everything, ...|
|[above, all, else,, animal, crossing:, new, hor...|[else,, animal, crossing:, new, horizons, unbea...|
+--------------------------------------------------+--------------------------------------------------+
only showing top 5 rows



In [67]:
from pyspark.sql.functions import udf
from pyspark.ml import Transformer
from pyspark.sql import DataFrame

# Why custom class? https://stackoverflow.com/questions/51415784/how-to-add-my-own-function-as-a-custom-stage-in-a-ml-pyspark-pipeline
class StemmerTransformer(Transformer):
    def __init__(self, inputCol: str, outputCol: str, language: str):
        super(StemmerTransformer, self).__init__()
        self._stemmer = SnowballStemmer(language=language)
        self.udf = udf(
            lambda tokens: [
                self._stemmer.stem(token) for token in tokens
            ], ArrayType(StringType()))
        self.inputCol = inputCol
        self.outputCol = outputCol

    def _transform(self, df: DataFrame) -> DataFrame:
        df = df.withColumn(self.outputCol, self.udf(self.inputCol))
        return df

stemmer = StemmerTransformer(inputCol='filtered', outputCol='stemmed', language='english')
stemmed_df = stemmer.transform(filtered_df)
stemmed_df.select('filtered', 'stemmed').show(5, truncate=50)


+--------------------------------------------------+--------------------------------------------------+
|                                          filtered|                                           stemmed|
+--------------------------------------------------+--------------------------------------------------+
|[animal, crossing;, new, horizons,, much, like,...|[anim, crossing;, new, horizons,, much, like, p...|
|[know, you’re, overwhelmed, world,, stuck, insi...|[know, you'r, overwhelm, world,, stuck, inside,...|
|[game, broad, lengthy,, there’s, discuss, fit, ...|[game, broad, lengthy,, there, discuss, fit, on...|
|[animal, crossing:, new, horizons, everything, ...|[anim, crossing:, new, horizon, everyth, hope, ...|
|[else,, animal, crossing:, new, horizons, unbea...|[else,, anim, crossing:, new, horizon, unbeat, ...|
+--------------------------------------------------+--------------------------------------------------+
only showing top 5 rows



In [68]:
count_vectorizer = CountVectorizer(inputCol='stemmed', outputCol='embedded')
cv_model = count_vectorizer.fit(stemmed_df)

embedded_df = cv_model.transform(stemmed_df)
embedded_df.select('stemmed', 'embedded').show(5, truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                           stemmed|                                          embedded|
+--------------------------------------------------+--------------------------------------------------+
|[anim, crossing;, new, horizons,, much, like, p...|(1369,[0,1,9,17,23,28,58,73,137,194,283,330,639...|
|[know, you'r, overwhelm, world,, stuck, inside,...|(1369,[1,16,21,48,67,104,128,135,220,230,312,31...|
|[game, broad, lengthy,, there, discuss, fit, on...|(1369,[3,6,9,14,15,16,21,30,31,34,36,45,60,94,9...|
|[anim, crossing:, new, horizon, everyth, hope, ...|(1369,[0,1,2,4,5,6,29,32,57,64,70,92,95,126,175...|
|[else,, anim, crossing:, new, horizon, unbeat, ...|(1369,[0,1,2,3,4,6,7,14,16,18,31,33,35,44,59,72...|
+--------------------------------------------------+--------------------------------------------------+
only showing top 5 rows



### Build the pipeline

In [69]:
stages = [
    regex_tokenizer,
    stop_remover,
    stemmer,
    count_vectorizer
]

from pyspark.ml import Pipeline

preprocess_pipeline = Pipeline(stages=stages)
pipeModel = preprocess_pipeline.fit(df)
training = pipeModel.transform(df)
training.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|               words|            filtered|             stemmed|            embedded|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Animal Crossing; ...|[animal, crossing...|[animal, crossing...|[anim, crossing;,...|(1369,[0,1,9,17,2...|
|Know that if you’...|[know, that, if, ...|[know, you’re, ov...|[know, you'r, ove...|(1369,[1,16,21,48...|
|With a game this ...|[with, a, game, t...|[game, broad, len...|[game, broad, len...|(1369,[3,6,9,14,1...|
|Animal Crossing: ...|[animal, crossing...|[animal, crossing...|[anim, crossing:,...|(1369,[0,1,2,4,5,...|
|Above all else, A...|[above, all, else...|[else,, animal, c...|[else,, anim, cro...|(1369,[0,1,2,3,4,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows

