In [312]:
import findspark
findspark.init()

import webbrowser

from pyspark.sql import SparkSession
import nltk
from nltk.corpus import twitter_samples
import pandas as pd

from pyspark import StorageLevel
from pyspark.sql.functions import split, explode, regexp_replace, lower, regexp_extract
from pyspark.sql.functions import row_number, monotonically_increasing_id as identity, col, length
from pyspark.sql.functions import lag, lead, udf
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, StringType

# Creating Spark Session

In [2]:
spark = SparkSession.builder.appName('SparkNlp').getOrCreate()
webbrowser.open('http://localhost:4040')

True

# Downloading the Twitter Sample Data

In [3]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

# Extracting Twitter Positive and Negative Data

In [4]:
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [5]:
# positive and negative documents
pos_tw = [(t, 'pos') for t in twitter_samples.strings('positive_tweets.json')]
neg_tw = [(t, 'neg') for t in twitter_samples.strings('negative_tweets.json')]

# joining documents
document = [pos_tw] + [neg_tw]

# list to dataframe
df = pd.DataFrame(document[0]).append(pd.DataFrame(document[1])).rename(columns={0:'text', 1:'label'})

# Create Resilient Distributed Datastore

In [357]:
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
df_rdd = spark.createDataFrame(df)

In [358]:
df_rdd.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: string (nullable = true)



In [359]:
# creates a job
df_rdd.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+-----+
|text                                                                                                                          |label|
+------------------------------------------------------------------------------------------------------------------------------+-----+
|#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)               |pos  |
|@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!|pos  |
|@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!                   |pos  |
|@97sides CONGRATS :)                                                                                                          |pos  |
|yeaaaah yippppy!!!  my accnt verified rqst has succeed

# Create Temp Table Reference

In [360]:
df_rdd.createOrReplaceTempView('SqlNlp')

In [361]:
spark.catalog.listTables()

[Table(name='sqlnlp', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [362]:
spark.sql("SHOW TABLES").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   sqlnlp|       true|
+--------+---------+-----------+



# Exploring the Text Data

In [363]:
df_rdd.select('text').show(4, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                          |
+------------------------------------------------------------------------------------------------------------------------------+
|#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)               |
|@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!|
|@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!                   |
|@97sides CONGRATS :)                                                                                                          |
+------------------------------------------------------------------------------------------------

# Cleaning the Text Data

In [364]:
smilies = [':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3', ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(', '(', ')', 'via']

# Adding Sentence Identity

In [365]:
w = Window.partitionBy('label').orderBy('text')
df_rdd = df_rdd.withColumn('sentence_id', row_number().over(w))

## Cleaning Invalid Characters

In [366]:
df_clean = df_rdd.withColumn('clean_text', lower(regexp_replace('text', '[^a-zA-Z#@ ]', '')))
df_clean.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+-----+-----------+----------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                      |label|sentence_id|clean_text                                                                                                                  |
+------------------------------------------------------------------------------------------------------------------------------------------+-----+-----------+----------------------------------------------------------------------------------------------------------------------------+
|!! Quick notice regarding requests. Our DM is now open for people to request moments/ideas for tweets, thank you :) http://t.co/joEpeCsq29|pos  |1 

## Splitting the Words

In [368]:
df_split = df_clean.select(split('clean_text', ' ').alias('words'), 'label', 'sentence_id')
df_split.show(4, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----------+
|words                                                                                                                                             |label|sentence_id|
+--------------------------------------------------------------------------------------------------------------------------------------------------+-----+-----------+
|[, quick, notice, regarding, requests, our, dm, is, now, open, for, people, to, request, momentsideas, for, tweets, thank, you, , httptcojoepecsq]|pos  |1          |
|[@cassthetrainer, a, huge, amp, warm, welcome, to, @vodkablond, , finally, we, complete, the, triangle, , ]                                       |pos  |2          |
|[@cowokaddict, mama, is, the, only, reason, why, i, stand, stronger, up, to, now, ]                                                               |pos  |3          

# Removing Hash and User Tags

In [369]:
udf_cotains_hash_user = udf(lambda row: any([any([i in x for i in ['#', '@']]) for x in row]))
udf_contains_hash_only = udf(lambda row: any(['#' in x for x in row]))
udf_clear_hash = udf(lambda row: [x for x in row if '#' not in x], ArrayType(StringType(), True))
udf_clear_user = udf(lambda row: [x for x in row if '@' not in x], ArrayType(StringType(), True))

df_split = df_split\
    .withColumn('contain_tags', udf_cotains_hash_user('words'))\
    .withColumn('contain_hash_only', udf_contains_hash_only('words'))\
    .withColumn('words_clean', udf_clear_user(udf_clear_hash('words')))

In [370]:
df_split.printSchema()

root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- label: string (nullable = true)
 |-- sentence_id: integer (nullable = true)
 |-- contain_tags: string (nullable = true)
 |-- contain_hash_only: string (nullable = true)
 |-- words_clean: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [372]:
df_split.show()

+--------------------+-----+-----------+------------+-----------------+--------------------+
|               words|label|sentence_id|contain_tags|contain_hash_only|         words_clean|
+--------------------+-----+-----------+------------+-----------------+--------------------+
|[, quick, notice,...|  pos|          1|       false|            false|[, quick, notice,...|
|[@cassthetrainer,...|  pos|          2|        true|            false|[a, huge, amp, wa...|
|[@cowokaddict, ma...|  pos|          3|        true|            false|[mama, is, the, o...|
|[@katiiirocks, @m...|  pos|          4|        true|            false|[are, u, a, beaut...|
|[@manuellatchgn, ...|  pos|          5|        true|            false|[goodbye, twitter...|
|[@nyesekkinn, don...|  pos|          6|        true|            false|[dont, be, affara...|
|[@realliampayne, ...|  pos|          7|        true|            false|[yeah, thanks, fo...|
|[@southafrica, @c...|  pos|          8|        true|             true

## Exploding the Words into Column

In [390]:
df_exp = df_split.select(explode('words_clean').alias('word'), 'label', 'contain_tags', 'contain_hash_only', 'sentence_id')
df_exp.show()

+------------+-----+------------+-----------------+-----------+
|        word|label|contain_tags|contain_hash_only|sentence_id|
+------------+-----+------------+-----------------+-----------+
|            |  pos|       false|            false|          1|
|       quick|  pos|       false|            false|          1|
|      notice|  pos|       false|            false|          1|
|   regarding|  pos|       false|            false|          1|
|    requests|  pos|       false|            false|          1|
|         our|  pos|       false|            false|          1|
|          dm|  pos|       false|            false|          1|
|          is|  pos|       false|            false|          1|
|         now|  pos|       false|            false|          1|
|        open|  pos|       false|            false|          1|
|         for|  pos|       false|            false|          1|
|      people|  pos|       false|            false|          1|
|          to|  pos|       false|       

In [None]:
# cache in memory to avoid lazy evaluation later
df_exp.cache()

## Filtering Blanks

In [377]:
df_exp = df_exp.filter(col('word')!='')
df_exp.show()

+---------+-----+------------+-----------------+
|     word|label|contain_tags|contain_hash_only|
+---------+-----+------------+-----------------+
|      for|  pos|        true|             true|
|    being|  pos|        true|             true|
|      top|  pos|        true|             true|
|  engaged|  pos|        true|             true|
|  members|  pos|        true|             true|
|       in|  pos|        true|             true|
|       my|  pos|        true|             true|
|community|  pos|        true|             true|
|     this|  pos|        true|             true|
|     week|  pos|        true|             true|
|      hey|  pos|        true|            false|
|    james|  pos|        true|            false|
|      how|  pos|        true|            false|
|      odd|  pos|        true|            false|
|   please|  pos|        true|            false|
|     call|  pos|        true|            false|
|      our|  pos|        true|            false|
|  contact|  pos|   

## Creating Word Identity

In [378]:
df_exp = df_exp.withColumn('id', identity())
df_exp.show()

+---------------+-----+------------+-----------------+------------+
|           word|label|contain_tags|contain_hash_only|          id|
+---------------+-----+------------+-----------------+------------+
|          quick|  pos|       false|            false|300647710720|
|         notice|  pos|       false|            false|300647710721|
|      regarding|  pos|       false|            false|300647710722|
|       requests|  pos|       false|            false|300647710723|
|            our|  pos|       false|            false|300647710724|
|             dm|  pos|       false|            false|300647710725|
|             is|  pos|       false|            false|300647710726|
|            now|  pos|       false|            false|300647710727|
|           open|  pos|       false|            false|300647710728|
|            for|  pos|       false|            false|300647710729|
|         people|  pos|       false|            false|300647710730|
|             to|  pos|       false|            

# Creating Sliding Window

In [380]:
df_exp.createOrReplaceTempView('WindowTutorial')

In [381]:
spark.sql("""
SELECT
    id,
    LAG(word, 1) OVER(ORDER BY id) AS w1,
    word,
    LEAD(word, 1) OVER(ORDER BY id) AS w2
FROM WindowTutorial
""").show()

+---+---------+---------+---------+
| id|       w1|     word|       w2|
+---+---------+---------+---------+
|  0|     null|      for|    being|
|  1|      for|    being|      top|
|  2|    being|      top|  engaged|
|  3|      top|  engaged|  members|
|  4|  engaged|  members|       in|
|  5|  members|       in|       my|
|  6|       in|       my|community|
|  7|       my|community|     this|
|  8|community|     this|     week|
|  9|     this|     week|      hey|
| 10|     week|      hey|    james|
| 11|      hey|    james|      how|
| 12|    james|      how|      odd|
| 13|      how|      odd|   please|
| 14|      odd|   please|     call|
| 15|   please|     call|      our|
| 16|     call|      our|  contact|
| 17|      our|  contact|   centre|
| 18|  contact|   centre|       on|
| 19|   centre|       on|      and|
+---+---------+---------+---------+
only showing top 20 rows



In [382]:
w = Window.orderBy('id')
df_exp.select(
    'id',
    lag('word', 1).over(w).alias('w1'),
    'word',
    lead('word', 1).over(w).alias('w2')
).show()

+---+---------+---------+---------+
| id|       w1|     word|       w2|
+---+---------+---------+---------+
|  0|     null|      for|    being|
|  1|      for|    being|      top|
|  2|    being|      top|  engaged|
|  3|      top|  engaged|  members|
|  4|  engaged|  members|       in|
|  5|  members|       in|       my|
|  6|       in|       my|community|
|  7|       my|community|     this|
|  8|community|     this|     week|
|  9|     this|     week|      hey|
| 10|     week|      hey|    james|
| 11|      hey|    james|      how|
| 12|    james|      how|      odd|
| 13|      how|      odd|   please|
| 14|      odd|   please|     call|
| 15|   please|     call|      our|
| 16|     call|      our|  contact|
| 17|      our|  contact|   centre|
| 18|  contact|   centre|       on|
| 19|   centre|       on|      and|
+---+---------+---------+---------+
only showing top 20 rows



# Sliding Window as Subquery: Most common 3-tuples

In [383]:
spark.sql("""
SELECT label, w1, w2, w3, w4, COUNT(1) AS phrase_count 
FROM (
    SELECT
        label,
        word AS w1,
        LEAD(word, 1) OVER(ORDER BY id) AS w2,
        LEAD(word, 2) OVER(ORDER BY id) AS w3,
        LEAD(word, 3) OVER(ORDER BY id) AS w4
    FROM WindowTutorial
)
GROUP BY label, w1, w2, w3, w4
ORDER BY COUNT(1) DESC
""").show()

+-----+----------------+----------------+----------------+----------+------------+
|label|              w1|              w2|              w3|        w4|phrase_count|
+-----+----------------+----------------+----------------+----------+------------+
|  pos|httptcorcvcyyoiq|          follow|               u|      back|          62|
|  pos|             amp|httptcorcvcyyoiq|          follow|         u|          62|
|  pos|          follow|             amp|httptcorcvcyyoiq|    follow|          62|
|  pos|             and|              no|     unfollowers|       via|          60|
|  neg|        followed|              me|          thanks|    please|          51|
|  neg|              me|          thanks|          please|  followed|          51|
|  neg|          thanks|          please|        followed|        me|          51|
|  neg|          please|        followed|              me|       too|          51|
|  pos|             she|           loves|             you|         a|          44|
|  p

In [384]:
spark.sql("""
SELECT DISTINCT w1, w2, w3, w4
FROM (
    SELECT
        word AS w1,
        LEAD(word, 1) OVER(ORDER BY id) AS w2,
        LEAD(word, 2) OVER(ORDER BY id) AS w3,
        LEAD(word, 3) OVER(ORDER BY id) AS w4
    FROM WindowTutorial
)
ORDER BY w1 DESC, w2, w3, w4
""").show()

+------------+---------+-------------+-----------+
|          w1|       w2|           w3|         w4|
+------------+---------+-------------+-----------+
|zzzzzzplease|     dont|          let|        the|
|        zzzz|   missed|           my|       stop|
|         zzz|      how|      careful|         do|
|         zzz|       xx|physiotherapy|     friday|
|          zz|     from|      airport|   straight|
|          zy|     lets|           be|     friend|
|       zumba|somewhere|         else|     please|
|   zopiclone|      for|        sleep|         in|
|      zoomed|     away|       damnit|          i|
|        zoom|     into|         hers|       lmao|
|         zoo|      and|          its|       only|
|         zoo|     baby|           gt|  boyfriend|
|    zonzofox|      app|        using|       this|
|    zonzofox| clicking|         this|       link|
|        zone|      and|            i|      could|
|        zone|       my|     snapchat|leanneriner|
|       zokay|  russian|       

In [385]:
spark.sql("""
WITH subquery_cte AS (
    SELECT label, w1, w2, w3, w4, COUNT(1) AS phrase_count 
    FROM (
        SELECT
            label,
            word AS w1,
            LEAD(word, 1) OVER(ORDER BY id) AS w2,
            LEAD(word, 2) OVER(ORDER BY id) AS w3,
            LEAD(word, 3) OVER(ORDER BY id) AS w4
        FROM WindowTutorial
    )
    GROUP BY label, w1, w2, w3, w4
)
SELECT label, w1, w2, w3, w4, phrase_count
FROM (
    SELECT
        label,
        ROW_NUMBER() OVER(PARTITION BY label ORDER BY phrase_count DESC) AS row,
        w1, w2, w3, w4, phrase_count
    FROM subquery_cte
)
WHERE row = 1
ORDER BY label ASC
""").show()

+-----+--------+---+----------------+------+------------+
|label|      w1| w2|              w3|    w4|phrase_count|
+-----+--------+---+----------------+------+------------+
|  neg|followed| me|          thanks|please|          51|
|  pos|  follow|amp|httptcorcvcyyoiq|follow|          62|
+-----+--------+---+----------------+------+------------+



In [388]:
df_exp.cache()

DataFrame[word: string, label: string, contain_tags: string, contain_hash_only: string, id: bigint]

In [389]:
df_exp.filter(col('word')=='httptcorcvcyyoiq').show()

+----------------+-----+------------+-----------------+----+
|            word|label|contain_tags|contain_hash_only|  id|
+----------------+-----+------------+-----------------+----+
|httptcorcvcyyoiq|  pos|        true|            false| 161|
|httptcorcvcyyoiq|  pos|        true|            false| 393|
|httptcorcvcyyoiq|  pos|        true|            false| 649|
|httptcorcvcyyoiq|  pos|        true|            false| 927|
|httptcorcvcyyoiq|  pos|        true|            false|1177|
|httptcorcvcyyoiq|  pos|        true|            false|1391|
|httptcorcvcyyoiq|  pos|        true|            false|1549|
|httptcorcvcyyoiq|  pos|        true|            false|1764|
|httptcorcvcyyoiq|  pos|        true|            false|2017|
|httptcorcvcyyoiq|  pos|        true|            false|2406|
|httptcorcvcyyoiq|  pos|        true|            false|2618|
|httptcorcvcyyoiq|  pos|        true|            false|2837|
|httptcorcvcyyoiq|  pos|        true|            false|2983|
|httptcorcvcyyoiq|  pos|

## Removing Stopwords

In [64]:
nltk.download('stopwords')
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
df_exp = df_exp.filter(~col('word').isin(stopset))
df_exp.show()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sammy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


+------------+-----+
|        word|label|
+------------+-----+
|followfriday|  pos|
|  franceinte|  pos|
|     pkuchly|  pos|
|milipolparis|  pos|
|         top|  pos|
|     engaged|  pos|
|     members|  pos|
|   community|  pos|
|        week|  pos|
|      lambja|  pos|
|         hey|  pos|
|       james|  pos|
|         odd|  pos|
|      please|  pos|
|        call|  pos|
|     contact|  pos|
|      centre|  pos|
|        able|  pos|
|      assist|  pos|
|        many|  pos|
+------------+-----+
only showing top 20 rows

