In [32]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
import nltk
from nltk.corpus import twitter_samples
import pandas as pd

from pyspark.sql.functions import split, explode, regexp_replace, lower, regexp_extract
from pyspark.sql.functions import row_number, monotonically_increasing_id as identity, col, length
from pyspark.sql.functions import lag, lead
from pyspark.sql.window import Window

# Creating Spark Session

In [2]:
spark = SparkSession.builder.appName('SparkNlp').getOrCreate()

# Downloading the Twitter Sample Data

In [3]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

# Extracting Twitter Positive and Negative Data

In [4]:
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [5]:
# positive and negative documents
pos_tw = [(t, 'pos') for t in twitter_samples.strings('positive_tweets.json')]
neg_tw = [(t, 'neg') for t in twitter_samples.strings('negative_tweets.json')]

# joining documents
document = [pos_tw] + [neg_tw]

# list to dataframe
df = pd.DataFrame(document[0]).append(pd.DataFrame(document[1])).rename(columns={0:'text', 1:'label'})

# Create Resilient Distributed Dataset

In [6]:
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
df_rdd = spark.createDataFrame(df)

In [7]:
df_rdd.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = true)



# Create Temp Table Reference

In [8]:
df_rdd.createOrReplaceTempView('SqlNlp')

# Exploring the Text Data

In [9]:
df_rdd.select('text').show(4, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                          |
+------------------------------------------------------------------------------------------------------------------------------+
|#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)               |
|@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!|
|@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!                   |
|@97sides CONGRATS :)                                                                                                          |
+------------------------------------------------------------------------------------------------

# Cleaning the Text Data

In [10]:
smilies = [':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3', ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(', '(', ')', 'via']

## Cleaning Invalid Characters

In [11]:
df_clean = df_rdd.withColumn('clean_text', lower(regexp_replace('text', '[^a-zA-Z ]', '')))
df_clean.show(5)

+--------------------+-----+--------------------+
|                text|label|          clean_text|
+--------------------+-----+--------------------+
|#FollowFriday @Fr...|  pos|followfriday fran...|
|@Lamb2ja Hey Jame...|  pos|lambja hey james ...|
|@DespiteOfficial ...|  pos|despiteofficial w...|
|@97sides CONGRATS :)|  pos|     sides congrats |
|yeaaaah yippppy!!...|  pos|yeaaaah yippppy  ...|
+--------------------+-----+--------------------+
only showing top 5 rows



## Splitting the Words

In [60]:
df_split = df_clean.select(split('clean_text', ' ').alias('words'), 'label')
df_split.show(4, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------+-----+
|words                                                                                                                               |label|
+------------------------------------------------------------------------------------------------------------------------------------+-----+
|[followfriday, franceinte, pkuchly, milipolparis, for, being, top, engaged, members, in, my, community, this, week, ]               |pos  |
|[lambja, hey, james, how, odd, , please, call, our, contact, centre, on, , and, we, will, be, able, to, assist, you, , many, thanks]|pos  |
|[despiteofficial, we, had, a, listen, last, night, , as, you, bleed, is, an, amazing, track, when, are, you, in, scotland]          |pos  |
|[sides, congrats, ]                                                                                                                 |pos  |
+------------

## Exploding the Words into Column

In [62]:
df_exp = df_split.select(explode('words').alias('word'), 'label')
df_exp.show()

+------------+-----+
|        word|label|
+------------+-----+
|followfriday|  pos|
|  franceinte|  pos|
|     pkuchly|  pos|
|milipolparis|  pos|
|         for|  pos|
|       being|  pos|
|         top|  pos|
|     engaged|  pos|
|     members|  pos|
|          in|  pos|
|          my|  pos|
|   community|  pos|
|        this|  pos|
|        week|  pos|
|            |  pos|
|      lambja|  pos|
|         hey|  pos|
|       james|  pos|
|         how|  pos|
|         odd|  pos|
+------------+-----+
only showing top 20 rows



## Filtering Blanks

In [63]:
df_exp = df_exp.filter(col('word')!='')
df_exp.show()

+------------+-----+
|        word|label|
+------------+-----+
|followfriday|  pos|
|  franceinte|  pos|
|     pkuchly|  pos|
|milipolparis|  pos|
|         for|  pos|
|       being|  pos|
|         top|  pos|
|     engaged|  pos|
|     members|  pos|
|          in|  pos|
|          my|  pos|
|   community|  pos|
|        this|  pos|
|        week|  pos|
|      lambja|  pos|
|         hey|  pos|
|       james|  pos|
|         how|  pos|
|         odd|  pos|
|      please|  pos|
+------------+-----+
only showing top 20 rows



## Removing Stopwords

In [64]:
nltk.download('stopwords')
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
df_exp = df_exp.filter(~col('word').isin(stopset))
df_exp.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+------------+-----+
|        word|label|
+------------+-----+
|followfriday|  pos|
|  franceinte|  pos|
|     pkuchly|  pos|
|milipolparis|  pos|
|         top|  pos|
|     engaged|  pos|
|     members|  pos|
|   community|  pos|
|        week|  pos|
|      lambja|  pos|
|         hey|  pos|
|       james|  pos|
|         odd|  pos|
|      please|  pos|
|        call|  pos|
|     contact|  pos|
|      centre|  pos|
|        able|  pos|
|      assist|  pos|
|        many|  pos|
+------------+-----+
only showing top 20 rows



## Creating Identity

In [65]:
df_exp = df_exp.withColumn('id', identity())
df_exp.show()

+------------+-----+---+
|        word|label| id|
+------------+-----+---+
|followfriday|  pos|  0|
|  franceinte|  pos|  1|
|     pkuchly|  pos|  2|
|milipolparis|  pos|  3|
|         top|  pos|  4|
|     engaged|  pos|  5|
|     members|  pos|  6|
|   community|  pos|  7|
|        week|  pos|  8|
|      lambja|  pos|  9|
|         hey|  pos| 10|
|       james|  pos| 11|
|         odd|  pos| 12|
|      please|  pos| 13|
|        call|  pos| 14|
|     contact|  pos| 15|
|      centre|  pos| 16|
|        able|  pos| 17|
|      assist|  pos| 18|
|        many|  pos| 19|
+------------+-----+---+
only showing top 20 rows



# Creating Sliding Window

In [66]:
df_exp.createOrReplaceTempView('WindowTutorial')

In [67]:
spark.sql("""
SELECT
    id,
    LAG(word, 1) OVER(ORDER BY id) AS w1,
    word,
    LEAD(word, 1) OVER(ORDER BY id) AS w2
FROM WindowTutorial
""").show()

+---+------------+------------+------------+
| id|          w1|        word|          w2|
+---+------------+------------+------------+
|  0|        null|followfriday|  franceinte|
|  1|followfriday|  franceinte|     pkuchly|
|  2|  franceinte|     pkuchly|milipolparis|
|  3|     pkuchly|milipolparis|         top|
|  4|milipolparis|         top|     engaged|
|  5|         top|     engaged|     members|
|  6|     engaged|     members|   community|
|  7|     members|   community|        week|
|  8|   community|        week|      lambja|
|  9|        week|      lambja|         hey|
| 10|      lambja|         hey|       james|
| 11|         hey|       james|         odd|
| 12|       james|         odd|      please|
| 13|         odd|      please|        call|
| 14|      please|        call|     contact|
| 15|        call|     contact|      centre|
| 16|     contact|      centre|        able|
| 17|      centre|        able|      assist|
| 18|        able|      assist|        many|
| 19|     

In [68]:
w = Window.orderBy('id')
df_exp.select(
    'id',
    lag('word', 1).over(w).alias('w1'),
    'word',
    lead('word', 1).over(w).alias('w2')
).show()

+---+------------+------------+------------+
| id|          w1|        word|          w2|
+---+------------+------------+------------+
|  0|        null|followfriday|  franceinte|
|  1|followfriday|  franceinte|     pkuchly|
|  2|  franceinte|     pkuchly|milipolparis|
|  3|     pkuchly|milipolparis|         top|
|  4|milipolparis|         top|     engaged|
|  5|         top|     engaged|     members|
|  6|     engaged|     members|   community|
|  7|     members|   community|        week|
|  8|   community|        week|      lambja|
|  9|        week|      lambja|         hey|
| 10|      lambja|         hey|       james|
| 11|         hey|       james|         odd|
| 12|       james|         odd|      please|
| 13|         odd|      please|        call|
| 14|      please|        call|     contact|
| 15|        call|     contact|      centre|
| 16|     contact|      centre|        able|
| 17|      centre|        able|      assist|
| 18|        able|      assist|        many|
| 19|     

# Repartition the RDD

In [69]:
df_exp.rdd.getNumPartitions()

4

In [70]:
df_repart = df_exp.repartition('id')

In [71]:
df_repart.rdd.getNumPartitions()

200

# Sliding Window as Subquery: Most common 3-tuples

In [78]:
spark.sql("""
SELECT label, w1, w2, w3, w4, COUNT(1) AS phrase_count 
FROM (
    SELECT
        label,
        word AS w1,
        LEAD(word, 1) OVER(ORDER BY id) AS w2,
        LEAD(word, 2) OVER(ORDER BY id) AS w3,
        LEAD(word, 3) OVER(ORDER BY id) AS w4
    FROM WindowTutorial
)
GROUP BY label, w1, w2, w3, w4
ORDER BY COUNT(1) DESC
""").show()

+-----+----------------+----------------+----------------+----------------+------------+
|label|              w1|              w2|              w3|              w4|phrase_count|
+-----+----------------+----------------+----------------+----------------+------------+
|  pos|             amp|httptcorcvcyyoiq|          follow|               u|          62|
|  pos|httptcorcvcyyoiq|          follow|               u|            back|          62|
|  pos|          follow|         jnlazts|             amp|httptcorcvcyyoiq|          62|
|  pos|         jnlazts|             amp|httptcorcvcyyoiq|          follow|          62|
|  neg|        followed|          thanks| andjustinbieber|          please|          51|
|  neg|          thanks| andjustinbieber|          please|        followed|          51|
|  pos|              hi|             bam|   barsandmelody|          follow|          44|
|  pos|           horan|           loves|             lot|             see|          44|
|  pos|             s

In [73]:
spark.sql("""
SELECT DISTINCT w1, w2, w3, w4
FROM (
    SELECT
        word AS w1,
        LEAD(word, 1) OVER(ORDER BY id) AS w2,
        LEAD(word, 2) OVER(ORDER BY id) AS w3,
        LEAD(word, 3) OVER(ORDER BY id) AS w4
    FROM WindowTutorial
)
ORDER BY w1 DESC, w2, w3, w4
""").show()

+------------+-------------+--------------------+--------+
|          w1|           w2|                  w3|      w4|
+------------+-------------+--------------------+--------+
|zzzzzzplease|         dont|                 let|     sun|
|        zzzz|       missed|                stop|    take|
|   zzzterror|        didnt|                read|donation|
|         zzz|      careful|            actually|   black|
|         zzz|           xx|       physiotherapy|  friday|
|          zz|      airport|            straight|    away|
|     zysuzyy|       thanks|                  zy|    lets|
|    zynovftw|  mrprowestie|         twoscotsmen|  simple|
|          zy|         lets|              friend|     yaa|
|     zxwlfxz|           hi|              adrian|  parcel|
|    zupiapre|unfortunately|                 yes|    eveh|
|       zumba|    somewhere|                else|  please|
|   zulbayarb|         masa|nowadayshttptcoif...| brainer|
|     zozeebo|       honest|                miss|   duba

In [93]:
spark.sql("""
WITH subquery_cte AS (
    SELECT label, w1, w2, w3, w4, COUNT(1) AS phrase_count 
    FROM (
        SELECT
            label,
            word AS w1,
            LEAD(word, 1) OVER(ORDER BY id) AS w2,
            LEAD(word, 2) OVER(ORDER BY id) AS w3,
            LEAD(word, 3) OVER(ORDER BY id) AS w4
        FROM WindowTutorial
    )
    GROUP BY label, w1, w2, w3, w4
)
SELECT label, w1, w2, w3, w4, phrase_count
FROM (
    SELECT
        label,
        ROW_NUMBER() OVER(PARTITION BY label ORDER BY phrase_count DESC) AS row,
        w1, w2, w3, w4, phrase_count
    FROM subquery_cte
)
WHERE row = 1
ORDER BY label ASC
""").show()

+-----+--------+-------+---------------+----------------+------------+
|label|      w1|     w2|             w3|              w4|phrase_count|
+-----+--------+-------+---------------+----------------+------------+
|  neg|followed| thanks|andjustinbieber|          please|          51|
|  pos|  follow|jnlazts|            amp|httptcorcvcyyoiq|          62|
+-----+--------+-------+---------------+----------------+------------+

