### Natural Language Processing using PySpark

In [1]:
#Importing pySpark and creating pyspark session
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NLP").getOrCreate()
spark

In [2]:
#Importing required libraries
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#For pipeline development
from pyspark.ml import Pipeline

#### Kickstarter Dataset

"Kickstarter is an American public-benefit corporation based in Brooklyn, New York, that maintains a global crowdfunding platform, focused on creativity and merchandising. The company's stated mission is to "help bring creative projects to life". Kickstarter, has reportedly received more than $1.9 billion in pledges from 9.4 million backers to fund 257,000 creative projects, such as films, music, stage shows, comics, journalism, video games, technology and food-related projects.

People who back Kickstarter projects are offered tangible rewards or experiences in exchange for their pledges. This model traces its roots to subscription model of arts patronage, where artists would go directly to their audiences to fund their work" ~ Wikipedia

The datastet contains the blurbs or short description of 215,513 projects runned along 2017, all written in english and all labeled with "successful" or "failed", if they get the money or not, respectively. From those texts you can train linguistics models for description, and even embeddings relative to the case.

**Source:** https://www.kaggle.com/oscarvilla/kickstarter-nlp

In [23]:
df = spark.read.csv('kickstarter.csv', inferSchema=True,header=True)

In [24]:
df.limit(4).toPandas()

Unnamed: 0,_c0,blurb,state
0,1,"Using their own character, users go on educati...",failed
1,2,"MicroFly is a quadcopter packed with WiFi, 6 s...",successful
2,3,"A small indie press, run as a collective for a...",failed
3,4,Zylor is a new baby cosplayer! Back this kicks...,failed


In [25]:
df.show(4, False)

+---+-----------------------------------------------------------------------------------------------------------------------------------+----------+
|_c0|blurb                                                                                                                              |state     |
+---+-----------------------------------------------------------------------------------------------------------------------------------+----------+
|1  |Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills (ie Physics). |failed    |
|2  |MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your hand.|successful|
|3  |A small indie press, run as a collective for authors who want to self-publish, and a sexy, smart , hilarious novel!                |failed    |
|4  |Zylor is a new baby cosplayer! Back this kickstarter to help fund new cosplay photoshoots to share hi

In [26]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- blurb: string (nullable = true)
 |-- state: string (nullable = true)



In [27]:
df.count()

223627

In [28]:
#Getting the null values present in the column
def null_value_calc(df):
    null_columns_count = []
    numRows = df.count()
    for k in df.columns:
        nullRows = df.where(col(k).isNull()).count()
        if(nullRows > 0):
            temp = k,nullRows,(nullRows/numRows)*100
            null_columns_count.append(temp)
    return(null_columns_count)

null_columns_calc_list = null_value_calc(df)
spark.createDataFrame(null_columns_calc_list, ['ColumnName', 'Null_Values_Count','Null_Value_Percent']).show()

+----------+-----------------+------------------+
|ColumnName|Null_Values_Count|Null_Value_Percent|
+----------+-----------------+------------------+
|     blurb|             1488|0.6653937136392296|
|     state|            13157| 5.883457722010312|
+----------+-----------------+------------------+



I will be needing both the values from the row in order to develop the model

In [29]:
#Just checking the count of the rows if we drop null value rows
df.na.drop().count()

210470

The above number shows that very insignificant number of rows will be dropped, I can drop these rows

In [30]:
df = df.dropna()

In [31]:
#Checking the values in state column
df.groupBy("state").count().show()

+--------------------+-----+
|               state|count|
+--------------------+-----+
| is the largest (...|    1|
| that's goal is t...|    1|
| will keep audien...|    1|
| 2011 from my ind...|    1|
| but to bring it ...|    1|
| Goodnight"" was ...|    1|
| a book about res...|    1|
| apart from her o...|    1|
|       a samurai cat|    1|
| travelling the w...|    1|
|  World Pride 2012."|    1|
|"" a horror film ...|    1|
| it costs quite a...|    1|
| WE can make it a...|    1|
| she needs your h...|    1|
|          unexpected|    1|
|"" is recorded! W...|    1|
|"" which is set t...|    1|
|              desire|    1|
| Dance Major and ...|    1|
+--------------------+-----+
only showing top 20 rows



In [32]:
df.groupBy("state").count().orderBy(col("count").desc()).show()

+--------------------+------+
|               state| count|
+--------------------+------+
|          successful|103582|
|              failed|102000|
| and get some col...|     8|
|     their childhood|     6|
|          ","failed"|     6|
|                love|     6|
| about a lonely f...|     5|
|            mastered|     4|
|              poetry|     4|
|             romance|     4|
| She Wrote"" but ...|     3|
|             Texas."|     3|
|                NY."|     3|
|              2014."|     3|
|               2014"|     3|
|                loss|     3|
| solid surface on...|     3|
| ""Tomorrow Comes...|     3|
|              2011."|     3|
|                  CD|     3|
+--------------------+------+
only showing top 20 rows



In [33]:
#Filtering the unnecessary column names from the dataframe
df = df.filter("state IN('successful', 'failed')")

In [34]:
df.groupBy("state").count().orderBy(col("count").desc()).show()

+----------+------+
|     state| count|
+----------+------+
|successful|103582|
|    failed|102000|
+----------+------+



### Text Cleaning

In [35]:
#Looking into the blurb column to see how the sentences are:
df.select("blurb").show(10,False)

+-----------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------+
|Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills (ie Physics). |
|MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your hand.|
|A small indie press, run as a collective for authors who want to self-publish, and a sexy, smart , hilarious novel!                |
|Zylor is a new baby cosplayer! Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world!    |
|Hatoful Boyfriend meet Skeletons! A comedy Dating Sim that pu

There is a need to pre process these sentences

In [36]:
#Removing the '/' '(' ')' from the sentences
df = df.withColumn('blurb', translate(col("blurb"), "/", " ")) \
       .withColumn('blurb', translate(col("blurb"), "(", " ")) \
       .withColumn('blurb', translate(col("blurb"), ")", " "))

In [37]:
df.select("blurb").show(10,False)

+-----------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------+
|Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills  ie Physics . |
|MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your hand.|
|A small indie press, run as a collective for authors who want to self-publish, and a sexy, smart , hilarious novel!                |
|Zylor is a new baby cosplayer! Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world!    |
|Hatoful Boyfriend meet Skeletons! A comedy Dating Sim that pu

In [38]:
#Removing the punctuations using regex replace (replacing everything not a letter)
df = df.withColumn("blurb", regexp_replace(col('blurb'), '[^A-Za-z ]+', ''))

In [39]:
df.select("blurb").show(10,False)

+-------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                          |
+-------------------------------------------------------------------------------------------------------------------------------+
|Using their own character users go on educational quests around a virtual world leveling up subjectoriented skills  ie Physics |
|MicroFly is a quadcopter packed with WiFi  sensors and  processors for ultimate stability  and fits in the palm of your hand   |
|A small indie press run as a collective for authors who want to selfpublish and a sexy smart  hilarious novel                  |
|Zylor is a new baby cosplayer Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world  |
|Hatoful Boyfriend meet Skeletons A comedy Dating Sim that puts you into a high school ful

In [40]:
#Removing the white spaces using a regex call
df = df.withColumn("blurb", regexp_replace(col('blurb'), ' +', ' '))

In [41]:
df.select("blurb").show(10,False)

+------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                         |
+------------------------------------------------------------------------------------------------------------------------------+
|Using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie Physics |
|MicroFly is a quadcopter packed with WiFi sensors and processors for ultimate stability and fits in the palm of your hand     |
|A small indie press run as a collective for authors who want to selfpublish and a sexy smart hilarious novel                  |
|Zylor is a new baby cosplayer Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world |
|Hatoful Boyfriend meet Skeletons A comedy Dating Sim that puts you into a high school full of Sk

In [42]:
#Lower casing all the words
df = df.withColumn("blurb", lower(col('blurb')))

In [43]:
df.select("blurb").show(10,False)

+------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                         |
+------------------------------------------------------------------------------------------------------------------------------+
|using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie physics |
|microfly is a quadcopter packed with wifi sensors and processors for ultimate stability and fits in the palm of your hand     |
|a small indie press run as a collective for authors who want to selfpublish and a sexy smart hilarious novel                  |
|zylor is a new baby cosplayer back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world |
|hatoful boyfriend meet skeletons a comedy dating sim that puts you into a high school full of sk

## Preparing data for NLP 

#### Tokeninzing words present in the sentences
This process splits the text into words

In [44]:
regex_tokenizer = RegexTokenizer(inputCol="blurb", outputCol="words", pattern="\\W") #Looking for anything that looks like a word
raw_words = regex_tokenizer.transform(df)
raw_words.show(2,False)

+---+------------------------------------------------------------------------------------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0|blurb                                                                                                                         |state     |words                                                                                                                                            |
+---+------------------------------------------------------------------------------------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|1  |using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie physics

In [45]:
raw_words.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- blurb: string (nullable = true)
 |-- state: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



Removing the stop words

In [46]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

In [47]:
stopwords = remover.getStopWords()
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [48]:
words_df = remover.transform(raw_words)

In [49]:
words_df.limit(4).toPandas()

Unnamed: 0,_c0,blurb,state,words,filtered
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que..."
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ..."
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors..."
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte..."


In [50]:
indexer = StringIndexer(inputCol="state", outputCol="label")
feature_data = indexer.fit(words_df).transform(words_df)
feature_data.show(5)
feature_data.printSchema()

+---+--------------------+----------+--------------------+--------------------+-----+
|_c0|               blurb|     state|               words|            filtered|label|
+---+--------------------+----------+--------------------+--------------------+-----+
|  1|using their own c...|    failed|[using, their, ow...|[using, character...|  1.0|
|  2|microfly is a qua...|successful|[microfly, is, a,...|[microfly, quadco...|  0.0|
|  3|a small indie pre...|    failed|[a, small, indie,...|[small, indie, pr...|  1.0|
|  4|zylor is a new ba...|    failed|[zylor, is, a, ne...|[zylor, new, baby...|  1.0|
|  5|hatoful boyfriend...|    failed|[hatoful, boyfrie...|[hatoful, boyfrie...|  1.0|
+---+--------------------+----------+--------------------+--------------------+-----+
only showing top 5 rows

root
 |-- _c0: string (nullable = true)
 |-- blurb: string (nullable = true)
 |-- state: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-

### Creating a ML Pipeline
We could also create an ML Pipeline to accomplish the previous three steps in a more streamlined fashion. Pipelines allow users to combine any transformer call(s) and ONE estimator call in their ML workflow. So a Pipeline can be a continuous set of transformer calls until you reach a point where you need to call ".fit()" which is an estimator call.

In [51]:
######################## BEFORE #############################
# Tokenize
regex_tokenizer = RegexTokenizer(inputCol="blurb", outputCol="words", pattern="\\W") # These also work as well: "\W", r"\W"
raw_words = regex_tokenizer.transform(df)

# Remove Stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
words_df = remover.transform(raw_words)

# Zero Index Label Column
indexer = StringIndexer(inputCol="state", outputCol="label")
feature_data = indexer.fit(words_df).transform(words_df)

feature_data.show(1,False)

+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+-----+
|_c0|blurb                                                                                                                         |state |words                                                                                                                                            |filtered                                                                                                                  |label|
+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------

In [52]:
################# AFTER ##################

# Tokenize
regex_tokenizer = RegexTokenizer(inputCol="blurb", outputCol="words", pattern="\\W")
# raw_words = regex_tokenizer.transform(df)

# Remove Stop words
remover = StopWordsRemover(inputCol=regex_tokenizer.getOutputCol(), outputCol="filtered")
# words_df = remover.transform(raw_words)

# Zero Index Label Column
indexer = StringIndexer(inputCol="state", outputCol="label")
# feature_data = indexer.fit(words_df).transform(words_df)

# Create the Pipeline
pipeline = Pipeline(stages=[regex_tokenizer,remover,indexer])
data_prep_pl = pipeline.fit(df)
# print(type(data_prep_pl))
# print(" ")
# Now call on the Pipeline to get our final df
feature_data = data_prep_pl.transform(df)
feature_data.show(1,False)

+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+-----+
|_c0|blurb                                                                                                                         |state |words                                                                                                                                            |filtered                                                                                                                  |label|
+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------

In [53]:
feature_data.limit(5).toPandas()

Unnamed: 0,_c0,blurb,state,words,filtered,label
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",1.0
1,2,microfly is a quadcopter packed with wifi sens...,successful,"[microfly, is, a, quadcopter, packed, with, wi...","[microfly, quadcopter, packed, wifi, sensors, ...",0.0
2,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",1.0
3,4,zylor is a new baby cosplayer back this kickst...,failed,"[zylor, is, a, new, baby, cosplayer, back, thi...","[zylor, new, baby, cosplayer, back, kickstarte...",1.0
4,5,hatoful boyfriend meet skeletons a comedy dati...,failed,"[hatoful, boyfriend, meet, skeletons, a, comed...","[hatoful, boyfriend, meet, skeletons, comedy, ...",1.0
