In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer,VectorAssembler,Normalizer,StandardScaler
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import re

In [2]:
# Start a SparkSession on the local master (an already-running session is reused)
spark = (
    SparkSession.builder
    .master('local')
    .appName('Word Count')
    .getOrCreate()
)

## Reading the dataset 

In [8]:
# Load the training sample from JSON, then display its column names
# together with the total number of rows.
df = spark.read.json('../data/Train_onetag_small.json')
(df.columns, df.count())

(['Body', 'Id', 'Tags', 'Title', 'oneTag'], 100000)

In [4]:
# Peek at the first three records to see what each column holds
df.show(n=3)

+--------------------+---+--------------------+--------------------+-------+
|                Body| Id|                Tags|               Title| oneTag|
+--------------------+---+--------------------+--------------------+-------+
|<p>I'd like to ch...|  1|php image-process...|How to check if a...|    php|
|<p>In my favorite...|  2|             firefox|How can I prevent...|firefox|
|<p>I am import ma...|  3|r matlab machine-...|R Error Invalid t...|      r|
+--------------------+---+--------------------+--------------------+-------+
only showing top 3 rows



## Tokenization
Tokenization splits strings into separate words. Spark has a [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer) class as well as RegexTokenizer, which allows for more control over the tokenization process.

In [10]:
# Break each Body into word tokens by splitting on non-word characters (\W).
# Body holds HTML, so tag names such as "p", "pre" and "code" end up as tokens too.
regexTokenizer = RegexTokenizer(
    inputCol='Body',
    outputCol='words',
    pattern='\\W',
)
df = regexTokenizer.transform(df)
df.show()

+--------------------+---+--------------------+--------------------+----------------+--------------------+
|                Body| Id|                Tags|               Title|          oneTag|               words|
+--------------------+---+--------------------+--------------------+----------------+--------------------+
|<p>I'd like to ch...|  1|php image-process...|How to check if a...|             php|[p, i, d, like, t...|
|<p>In my favorite...|  2|             firefox|How can I prevent...|         firefox|[p, in, my, favor...|
|<p>I am import ma...|  3|r matlab machine-...|R Error Invalid t...|               r|[p, i, am, import...|
|<p>This is probab...|  4|     c# url encoding|How do I replace ...|              c#|[p, this, is, pro...|
|<pre><code>functi...|  5|php api file-get-...|How to modify who...|             php|[pre, code, funct...|
|<p>I am using a m...|  6|proxy active-dire...|setting proxy in ...|active-directory|[p, i, am, using,...|
|<p>My image is ca...|  7|           

In [14]:
# UDFs counting the closing </p> tags (paragraphs) and closing </a> tags (links)
# in a post body. Spark hands a null Body to a Python UDF as None, and
# re.findall(None) raises TypeError, which would fail the whole job; a null
# body is therefore counted as 0 paragraphs / 0 links.
num_para = udf(
    lambda x: 0 if x is None else len(re.findall("</p>", x)),
    IntegerType(),
)
num_links = udf(
    lambda x: 0 if x is None else len(re.findall("</a>", x)),
    IntegerType(),
)

In [16]:
# Attach the paragraph and link counts computed from Body, then preview them
# next to the raw text and its tokens.
df = (
    df
    .withColumn('NumParas', num_para(df['Body']))
    .withColumn('NumLinks', num_links(df['Body']))
)
df.select('Body', 'words', 'NumParas', 'NumLinks').show()

+--------------------+--------------------+--------+--------+
|                Body|               words|NumParas|NumLinks|
+--------------------+--------------------+--------+--------+
|<p>I'd like to ch...|[p, i, d, like, t...|       2|       0|
|<p>In my favorite...|[p, in, my, favor...|       2|       0|
|<p>I am import ma...|[p, i, am, import...|       4|       0|
|<p>This is probab...|[p, this, is, pro...|       7|       1|
|<pre><code>functi...|[pre, code, funct...|       2|       0|
|<p>I am using a m...|[p, i, am, using,...|       1|       0|
|<p>My image is ca...|[p, my, image, is...|       9|       0|
|<p>I've decided t...|[p, i, ve, decide...|       4|       0|
|<p>Do you know of...|[p, do, you, know...|       4|       0|
|<p>I'm using SQL ...|[p, i, m, using, ...|       3|       0|
|<p>Some commercia...|[p, some, commerc...|       4|       1|
|<p>This may sound...|[p, this, may, so...|       4|       0|
|<p>how can I move...|[p, how, can, i, ...|       1|       0|
|<p>Few 

In [19]:
# BodyLength = number of tokens the tokenizer produced for each post


def _token_count(tokens):
    """Return the number of entries in the token list."""
    return len(tokens)


num_words = udf(_token_count, IntegerType())
df = df.withColumn("BodyLength", num_words(df["words"]))

In [20]:
# Preview the three engineered numeric columns beside the text they came from
df.select('Body', 'words', 'NumParas', 'NumLinks', 'BodyLength').show()

+--------------------+--------------------+--------+--------+----------+
|                Body|               words|NumParas|NumLinks|BodyLength|
+--------------------+--------------------+--------+--------+----------+
|<p>I'd like to ch...|[p, i, d, like, t...|       2|       0|        83|
|<p>In my favorite...|[p, in, my, favor...|       2|       0|        71|
|<p>I am import ma...|[p, i, am, import...|       4|       0|      3161|
|<p>This is probab...|[p, this, is, pro...|       7|       1|       115|
|<pre><code>functi...|[pre, code, funct...|       2|       0|       148|
|<p>I am using a m...|[p, i, am, using,...|       1|       0|        69|
|<p>My image is ca...|[p, my, image, is...|       9|       0|       112|
|<p>I've decided t...|[p, i, ve, decide...|       4|       0|       161|
|<p>Do you know of...|[p, do, you, know...|       4|       0|       102|
|<p>I'm using SQL ...|[p, i, m, using, ...|       3|       0|        67|
|<p>Some commercia...|[p, some, commerc...|       4

## Vector Assembler
Since NumParas, NumLinks and BodyLength may all be important features, we can combine them into a single feature vector with the help of VectorAssembler.

In [21]:
# Pack the three numeric columns into a single vector column, NumFeatures
assembler = VectorAssembler(
    inputCols=['NumParas', 'NumLinks', 'BodyLength'],
    outputCol='NumFeatures',
)
df = assembler.transform(df)

In [22]:
# Show the assembled vector next to the post body
df.select('Body', 'NumFeatures').show()

+--------------------+----------------+
|                Body|     NumFeatures|
+--------------------+----------------+
|<p>I'd like to ch...|  [2.0,0.0,83.0]|
|<p>In my favorite...|  [2.0,0.0,71.0]|
|<p>I am import ma...|[4.0,0.0,3161.0]|
|<p>This is probab...| [7.0,1.0,115.0]|
|<pre><code>functi...| [2.0,0.0,148.0]|
|<p>I am using a m...|  [1.0,0.0,69.0]|
|<p>My image is ca...| [9.0,0.0,112.0]|
|<p>I've decided t...| [4.0,0.0,161.0]|
|<p>Do you know of...| [4.0,0.0,102.0]|
|<p>I'm using SQL ...|  [3.0,0.0,67.0]|
|<p>Some commercia...| [4.0,1.0,134.0]|
|<p>This may sound...| [4.0,0.0,144.0]|
|<p>how can I move...|  [1.0,0.0,19.0]|
|<p>Few month ago ...|  [2.0,0.0,42.0]|
|<p>When you hit a...|  [1.0,0.0,37.0]|
|<p>A lot of frame...|[12.0,0.0,385.0]|
|<p>I'm running a ...| [4.0,0.0,373.0]|
|<p>Hello<br>
I'm ...| [3.0,0.0,209.0]|
|<p>Does anyone kn...| [4.0,1.0,116.0]|
|<p>=) I need your...| [8.0,0.0,390.0]|
+--------------------+----------------+
only showing top 20 rows



## Normalizing the Features
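It rescales each row's feature vector to unit norm, bringing the rows onto the same scale. With the default `p=2` used below, the Euclidean (L2) length of every row becomes 1 (for example, `[2, 0, 83]` becomes `[0.0241, 0.0, 0.9997]`); the elements of a row sum to 1 only if `p=1` is chosen instead.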

In [23]:
# Rescale every row's NumFeatures vector to unit length. No p is passed, so the
# default (L2 / Euclidean) norm is used, e.g. [2, 0, 83] -> [0.0241, 0.0, 0.9997].
scaler = Normalizer(
    inputCol='NumFeatures',
    outputCol='ScaledNumFeatures',
)
df = scaler.transform(df)

In [25]:
# Compare the raw and the row-normalized vectors for the first two posts
df.select('Body', 'NumFeatures', 'ScaledNumFeatures').head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", NumFeatures=DenseVector([2.0, 0.0, 83.0]), ScaledNumFeatures=DenseVector([0.0241, 0.0, 0.9997])),
 Row(Body='<p>In my favorite editor (vim), I regularly use ctrl-w to execute a certain action. Now, it quite often happens to me that firefox is the active window (on windows) while I still look at vim (thinking vim is the active window) and press ctrl-w which closes firefox. This is not what I want. Is there a way to stop ctrl-w from closing firefox?</p>\n\n<p>Rene</p>\n', NumFeatures=DenseVector([2.0, 0.0, 71.0]), ScaledNumFeatures=DenseVector([0.0282, 0.0, 0.9996]))]

## StandardScaler
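Another way to bring values onto the same scale is to standardize the data, so that each feature (column) has unit standard deviation. Note that Spark's `StandardScaler` defaults to `withStd=True` and `withMean=False`: as used below, each value is divided by its column's standard deviation but is not centered. Pass `withMean=True` to also subtract the column mean so that each feature has mean 0.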

In [26]:
# Fit per-feature statistics over the whole DataFrame, then scale every column
# of NumFeatures. With the defaults used here the values are divided by the
# column's standard deviation but not mean-centered (zeros stay zero).
scaler2 = StandardScaler(
    inputCol='NumFeatures',
    outputCol='Scaled2NumFeatures',
)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)

In [27]:
# Compare raw, row-normalized and standardized vectors for the first two posts
df.select('Body', 'NumFeatures', 'ScaledNumFeatures', 'Scaled2NumFeatures').head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", NumFeatures=DenseVector([2.0, 0.0, 83.0]), ScaledNumFeatures=DenseVector([0.0241, 0.0, 0.9997]), Scaled2NumFeatures=DenseVector([0.7037, 0.0, 0.4325])),
 Row(Body='<p>In my favorite editor (vim), I regularly use ctrl-w to execute a certain action. Now, it quite often happens to me that firefox is the active window (on windows) while I still look at vim (thinking vim is the active window) and press ctrl-w which closes firefox. This is not what I want. Is there a way to stop ctrl-w from closing firefox?</p>\n\n<p>Rene</p>\n', NumFeatures=DenseVector([2.0, 0.0, 71.0]), Sc