# Pyspark exploration

## Import dataset and setup the environment 

In [52]:
from pyspark import SparkContext
import os 

os.environ['JAVA_HOME']="/Library/Java/JavaVirtualMachines/jdk1.8.0_202.jdk/Contents/Home/"

os.environ['PYSPARK_SUBMIT_ARGS'] = "--master local[2] pyspark-shell"

sc = SparkContext.getOrCreate()

In [53]:
data_file = "../data/labelled_dataset.csv.gz"
raw_data = sc.textFile(data_file).cache()

In [57]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("test").getOrCreate()

In [58]:
df = spark.read.csv(data_file,header=True,inferSchema=True)

## DataFrame operations

### columns name

In [59]:
df.columns

['label', 'txt']

### Select column

In [72]:
df.txt

Column<b'txt'>

### Print Schema

In [60]:
df.printSchema()

root
 |-- label: string (nullable = true)
 |-- txt: string (nullable = true)



### Head 5

In [61]:
# Didn't strictly need a for loop, could have just then head()
for row in df.head(5):
    print(row)
    print('\n')

Row(label='ham', txt='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')


Row(label='ham', txt='Ok lar... Joking wif u oni...')


Row(label='spam', txt="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's")


Row(label='ham', txt='U dun say so early hor... U c already then say...')


Row(label='ham', txt="Nah I don't think he goes to usf, he lives around here though")




In [62]:
df.describe().show()

+-------+--------------------+------------------+
|summary|               label|               txt|
+-------+--------------------+------------------+
|  count|             1970119|            840730|
|   mean|            Infinity|1023.1263393359594|
| stddev|                 NaN|  8450.10290912209|
|    min|                   !|                  |
|    max|⸪ Great Allowance...|        ”” said he|
+-------+--------------------+------------------+



### Count

In [64]:
df.filter("label == 'spam'").count()

877

In [66]:
df.filter("label == 'books'").count()

1009

In [102]:
# https://stackoverflow.com/questions/48927271/count-number-of-words-in-a-spark-dataframe
import pyspark.sql.functions as f
data = df

### word count

In [114]:
# word count for each row
df = df.withColumn('wordCount', f.size(f.split(f.col('txt'), ' ')))
df.show()

+-----+--------------------+---------+
|label|                 txt|wordCount|
+-----+--------------------+---------+
|  ham|Go until jurong p...|       20|
|  ham|Ok lar... Joking ...|        6|
| spam|Free entry in 2 a...|       28|
|  ham|U dun say so earl...|       11|
|  ham|Nah I don't think...|       13|
| spam|FreeMsg Hey there...|       32|
|  ham|Even my brother i...|       16|
|  ham|As per your reque...|       26|
| spam|WINNER!! As a val...|       26|
| spam|Had your mobile 1...|       29|
|  ham|I'm gonna be home...|       21|
| spam|SIX chances to wi...|       26|
| spam|URGENT! You have ...|       26|
|  ham|I've been searchi...|       37|
|  ham|I HAVE A DATE ON ...|        8|
| spam|XXXMobileMovieClu...|       19|
|  ham|Oh k...i'm watchi...|        4|
|  ham|Eh u remember how...|       19|
|  ham|Fine if thatåÕs t...|       13|
| spam|England v Macedon...|       24|
+-----+--------------------+---------+
only showing top 20 rows



In [115]:
# total word
df.select(f.sum('wordCount')).collect() 

[Row(sum(wordCount)=5584925)]

In [124]:
# see what type df is
type(df)

pyspark.sql.dataframe.DataFrame

In [128]:
# first 5 row
df.take(5)

[Row(label='ham', txt='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', wordCount=20),
 Row(label='ham', txt='Ok lar... Joking wif u oni...', wordCount=6),
 Row(label='spam', txt="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", wordCount=28),
 Row(label='ham', txt='U dun say so early hor... U c already then say...', wordCount=11),
 Row(label='ham', txt="Nah I don't think he goes to usf, he lives around here though", wordCount=13)]

In [154]:
df.show()

+-----+--------------------+---------+
|label|                 txt|wordCount|
+-----+--------------------+---------+
|  ham|Go until jurong p...|       20|
|  ham|Ok lar... Joking ...|        6|
| spam|Free entry in 2 a...|       28|
|  ham|U dun say so earl...|       11|
|  ham|Nah I don't think...|       13|
| spam|FreeMsg Hey there...|       32|
|  ham|Even my brother i...|       16|
|  ham|As per your reque...|       26|
| spam|WINNER!! As a val...|       26|
| spam|Had your mobile 1...|       29|
|  ham|I'm gonna be home...|       21|
| spam|SIX chances to wi...|       26|
| spam|URGENT! You have ...|       26|
|  ham|I've been searchi...|       37|
|  ham|I HAVE A DATE ON ...|        8|
| spam|XXXMobileMovieClu...|       19|
|  ham|Oh k...i'm watchi...|        4|
|  ham|Eh u remember how...|       19|
|  ham|Fine if thatåÕs t...|       13|
| spam|England v Macedon...|       24|
+-----+--------------------+---------+
only showing top 20 rows



### word count collection

In [155]:
df.withColumn('word', f.explode(f.split(f.col('txt'), ' ')))\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)\
    .show()

+----+------+
|word| count|
+----+------+
|    |784389|
| the|282718|
| and|194113|
|  of|163358|
|  to|145701|
|   a|103066|
|  in| 90602|
|   I| 88815|
|that| 72731|
|  he| 53735|
| his| 48351|
|  it| 46497|
|  as| 46428|
|with| 45537|
| was| 45149|
|  is| 43232|
| you| 42713|
| for| 41989|
|  my| 39440|
|  be| 37498|
+----+------+
only showing top 20 rows



## random try

In [135]:
df.withColumn('list', df['txt'].split(" ")).show()

TypeError: 'Column' object is not callable

In [141]:
split_col = f.split(df['txt'], ' ')

In [152]:
data = data.withColumn('NAME1', split_col.getItem(0))
data = data.withColumn('NAME2', split_col.getItem(1))



In [153]:
data.show()

+-----+--------------------+-------------------+-------+
|label|                 txt|              NAME1|  NAME2|
+-----+--------------------+-------------------+-------+
|  ham|Go until jurong p...|                 Go|  until|
|  ham|Ok lar... Joking ...|                 Ok| lar...|
| spam|Free entry in 2 a...|               Free|  entry|
|  ham|U dun say so earl...|                  U|    dun|
|  ham|Nah I don't think...|                Nah|      I|
| spam|FreeMsg Hey there...|            FreeMsg|    Hey|
|  ham|Even my brother i...|               Even|     my|
|  ham|As per your reque...|                 As|    per|
| spam|WINNER!! As a val...|           WINNER!!|     As|
| spam|Had your mobile 1...|                Had|   your|
|  ham|I'm gonna be home...|                I'm|  gonna|
| spam|SIX chances to wi...|                SIX|chances|
| spam|URGENT! You have ...|            URGENT!|    You|
|  ham|I've been searchi...|               I've|   been|
|  ham|I HAVE A DATE ON ...|   

In [116]:
df.withColumn('word', f.explode(f.split(f.col('txt'), ' ')))\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)\
    .show()

+----+------+
|word| count|
+----+------+
|    |784389|
| the|282718|
| and|194113|
|  of|163358|
|  to|145701|
|   a|103066|
|  in| 90602|
|   I| 88815|
|that| 72731|
|  he| 53735|
| his| 48351|
|  it| 46497|
|  as| 46428|
|with| 45537|
| was| 45149|
|  is| 43232|
| you| 42713|
| for| 41989|
|  my| 39440|
|  be| 37498|
+----+------+
only showing top 20 rows



In [117]:
from pyspark.ml.feature import StopWordsRemover


In [95]:
text_file = sc.textFile("../books/*.txt")


In [91]:
text_file.count()

6003218