In [1]:
# Point Spark at the winutils distribution bundled under the working directory.
import os
import sys  # no longer used in this cell; kept in case other cells rely on it

cwd = os.getcwd()
print(f'CWD: {cwd}')

hadoop_home_path = f'{cwd}/windows/winutils/hadoop-3.0.0'
hadoop_bin_path = f'{hadoop_home_path}/bin'
os.environ['HADOOP_HOME'] = hadoop_home_path
# NOTE(review): the bundled winutils is 3.0.0 while the recorded output of the
# Spark setup cell reports Hadoop 3.2.0 — confirm the two are compatible.

# The Hadoop native binaries (winutils.exe, hadoop.dll) are resolved through the
# process PATH, which child processes inherit from os.environ. sys.path only
# influences Python module imports, so appending the bin directory there had no
# effect on Hadoop. Prepend once so re-running the cell does not grow PATH.
path_entries = [entry for entry in os.environ.get('PATH', '').split(os.pathsep) if entry]
if hadoop_bin_path not in path_entries:
    os.environ['PATH'] = os.pathsep.join([hadoop_bin_path, *path_entries])

print('HADOOP_HOME ENV var: ', os.environ['HADOOP_HOME'])

CWD: f:\Projects\sparkimental
HADOOP_HOME ENV var:  f:\Projects\sparkimental/windows/winutils/hadoop-3.0.0


In [3]:
# findspark must be imported in this cell: the notebook's import cell comes
# later, so on a fresh kernel (Restart & Run All) the name would be undefined.
import findspark

# Make the pyspark installation importable, then display the location that was found.
findspark.init()
findspark.find()

'f:\\AppSSD\\WorkTools\\Anaconda\\envs\\sparkimental\\lib\\site-packages\\pyspark'

In [2]:
# Imports
import findspark
from pyspark import SparkContext
from pyspark.sql.types import *
from pprint import pprint, pformat

In [76]:
# Obtain the SparkContext (an existing one is reused, otherwise one is created).
sc = SparkContext.getOrCreate()

spark_version = sc.version
# The Hadoop version is read from Hadoop's VersionInfo class on the JVM side via sc._jvm.
hadoop_version = sc._jvm.org.apache.hadoop.util.VersionInfo.getVersion()

print('Spark version: ', spark_version)
print(f'Hadoop version: {hadoop_version}')

# Optional debugging aid — dump the full Spark configuration:
# print('Spark Config:')
# pprint(sc.getConf().getAll())


Spark version:  3.1.2
Hadoop version: 3.2.0


In [77]:
# Use the public SparkContext.uiWebUrl property rather than reaching through the
# private _jsc handle into the JVM SparkContext.
print('Spark web UI link: ', sc.uiWebUrl)


Spark web UI link:  http://ProdudePredator.mshome.net:4040


In [59]:
# Extra import (move later)
from pyspark import SparkFiles
from pyspark import SQLContext

In [60]:
# Register the local CSV with Spark so it can be looked up by name through
# SparkFiles.get() in a later cell.
#
# The file URI is derived from the working directory without assuming a
# particular drive: backslashes become forward slashes and any leading slash is
# dropped, so both 'f:\Projects\x' and '/home/user/x' produce a well-formed
# 'file:///...' URI. (The previous version stripped a hard-coded, case-sensitive
# 'f:/' prefix, which only matched when the notebook ran from a lowercase f: drive.)
path = cwd.replace('\\', '/').lstrip('/')
full_path = f'file:///{path}/data/animal-crossing.csv'
sc.addFile(full_path)

In [61]:
# Build the SQL entry point on top of the existing SparkContext.
# NOTE(review): SQLContext is deprecated in Spark 3.x in favour of SparkSession —
# consider migrating once every cell that uses `sqlContext` has been checked.
sqlContext = SQLContext(sparkContext=sc)

In [62]:
# Look up where Spark stored the file registered earlier with sc.addFile()
# (a local temp directory, per the printed path), then load it as a DataFrame.
file_path = SparkFiles.get('animal-crossing.csv')
print('File path in Spark File system: ', file_path)

# First row is the header; column types are inferred from the data.
df = (
    sqlContext.read
    .options(header=True, inferSchema=True)
    .csv(file_path)
)

File path in Spark File system:  C:\Users\USER\AppData\Local\Temp\spark-3d41376e-a71c-4198-a853-ce367ea325ef\userFiles-e7101d4a-d39d-4f4a-90c8-c4776a2a3b6e\animal-crossing.csv


In [63]:
# Inspect the inferred schema; in the recorded output `date` is still a string
# at this point and is cast to a real date type in the next cell.
df.printSchema()

root
 |-- grade: integer (nullable = true)
 |-- publication: string (nullable = true)
 |-- text: string (nullable = true)
 |-- date: string (nullable = true)



In [64]:
# Replace the string-typed `date` column with a DateType version of itself,
# then print the schema to confirm the new type.
date_as_date = df['date'].cast(DateType())
df = df.withColumn('date', date_as_date)
df.printSchema()

root
 |-- grade: integer (nullable = true)
 |-- publication: string (nullable = true)
 |-- text: string (nullable = true)
 |-- date: date (nullable = true)



In [65]:
# Keep only the columns used from here on (this drops `publication`).
df = df.select('date', 'text', 'grade')

# Report (rows, columns); count() triggers a Spark job.
n_rows, n_cols = df.count(), len(df.columns)
print(f'Data shape: {(n_rows, n_cols)}')

# Preview up to 10 rows from a seeded ~10% sample so the output is repeatable.
df.sample(fraction=0.1, seed=0).show(n=10)

Data shape: (107, 3)
+----------+--------------------+-----+
|      date|                text|grade|
+----------+--------------------+-----+
|2020-03-16|With a game this ...|  100|
|2020-03-16|Similar to how Br...|  100|
|2020-03-16|Animal Crossing: ...|   93|
|2020-03-16|New Horizons has ...|   90|
|2020-03-16|It’s a blissfully...|   80|
|2020-03-23|Animal Crossing N...|   80|
|2020-03-24|Animal Crossing: ...|   98|
|2020-03-26|Quotation forthco...|   90|
|2020-04-02|If Animal Crossin...|   90|
|2020-04-08|It's an exception...|   90|
+----------+--------------------+-----+



In [66]:
# Basic summary statistics (count, mean, stddev, min, max).
# In the recorded output only `text` and `grade` are summarised; the `date`
# column does not appear in the table.
df.describe().show()

+-------+--------------------+-----------------+
|summary|                text|            grade|
+-------+--------------------+-----------------+
|  count|                 107|              107|
|   mean|                null| 90.6355140186916|
| stddev|                null|6.114308185868841|
|    min|A beautiful, welc...|               70|
|    max|“New Horizons mak...|              100|
+-------+--------------------+-----------------+



In [67]:
# Missing-value check
from pyspark.sql.functions import col, count, isnan, lower, trim, when

# Textual placeholders that count as "missing". They are compared against the
# trimmed, lower-cased string form of the value, so 'None', 'NULL', ' null ' all match.
MISSING_MARKERS = ("", "none", "null")


def missing_value_condition(column_name):
    """Build a boolean Column that is true where `column_name` holds no usable value.

    A value is treated as missing when it is SQL NULL, NaN, or — after casting to
    string, trimming and lower-casing — equal to one of MISSING_MARKERS.

    The markers are matched on the whole value rather than as substrings, so
    ordinary text that merely contains a word like "None" is not flagged, and the
    same case-insensitive rule is applied to every column.

    Args:
        column_name: Name of the column to test.

    Returns:
        A pyspark Column expression usable inside `when(...)` / `filter(...)`.
    """
    column = col(column_name)
    normalized = lower(trim(column.cast("string")))
    return column.isNull() | isnan(column) | normalized.isin(*MISSING_MARKERS)


def count_missing(frame, column_names):
    """Return a one-row DataFrame with the number of missing values per column.

    Args:
        frame: DataFrame to inspect.
        column_names: Iterable of column names to check.

    Returns:
        A DataFrame with one column per entry in `column_names`, each holding
        the count of rows for which `missing_value_condition` is true.
    """
    return frame.select(
        [
            # when() yields a non-null literal only for missing rows; count()
            # ignores nulls, so it counts exactly those rows.
            count(when(missing_value_condition(name), name)).alias(name)
            for name in column_names
        ]
    )


print("Nan checking")
# One table per column, matching the layout of the original output.
for column_name in ("grade", "text"):
    count_missing(df, [column_name]).show()


Nan checking
+-----+
|grade|
+-----+
|    0|
+-----+

+----+
|text|
+----+
|   0|
+----+

