In [None]:
# Make sure you have run these commands to set everything up first
# conda activate py37
# conda install pyspark python-hdfs
# cd ~/projects
# git clone https://github.com/sstirlin/docker-spark.git
# cd docker-spark
# docker-compose up -d
# bash expose_hostnames.sh

In [1]:
from hdfs import InsecureClient

In [2]:
client = InsecureClient('http://namenode:50070', user='root')
client.makedirs('/Users')
client.makedirs('/Users/vagrant')
client.set_owner('/Users/vagrant', owner='vagrant', group='vagrant')

In [3]:
client = InsecureClient('http://namenode:50070', user='vagrant')
client.upload('/Users/vagrant/','../python_tutorial/Twitter-sentiment-self-drive-DFE.csv', )

'/Users/vagrant/Twitter-sentiment-self-drive-DFE.csv'

In [4]:
client.list('/Users/vagrant/')

['Twitter-sentiment-self-drive-DFE.csv']

In [1]:
from pyspark import SparkContext

sc = SparkContext('spark://spark-master:7077', 'Test App')

In [22]:
# create an RDD
rdd_tweets = sc.textFile('hdfs://namenode/Users/vagrant/Twitter-sentiment-self-drive-DFE.csv')

# no transformations right now

# perform action to bring results back to the driver
rdd_tweets.first()

'_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,sentiment,sentiment:confidence,our_id,sentiment_gold,sentiment_gold_reason,text'

In [7]:
# temperature conversion Celsius to Kelvin

# create RDD from driver list
temp_C = [10, 3, -5, 25, 1, 9, 29, -10, 5]
rdd_temp_C = sc.parallelize(temp_C)

# perform transformation
rdd_temp_K = rdd_temp_C.map(lambda x: x + 273.15)

# perform action
print(rdd_temp_K.take(3))

[283.15, 276.15, 268.15]


In [8]:
# we define a list of integers
numbers = [i for i in range(0, 10)]

# define the RDD with 1 partition
rdd_numbers=sc.parallelize(numbers, 1)

# Use reduce to combine numbers
rdd_numbers_reduced = rdd_numbers.reduce(lambda x,y: "(" + str(x) + ", " + str(y) + ")")
print(rdd_numbers_reduced)

(((((((((0, 1), 2), 3), 4), 5), 6), 7), 8), 9)


In [9]:
# NOTICE what happens when we partition differently.
# This is why we need our "reduce" operation to be associative (and commutative)!

# define the RDD with 1 partition
rdd_numbers=sc.parallelize(numbers, 2)

# Use reduce to combine numbers
rdd_numbers_reduced = rdd_numbers.reduce(lambda x,y: "(" + str(x) + ", " + str(y) + ")")
print(rdd_numbers_reduced)

(((((0, 1), 2), 3), 4), ((((5, 6), 7), 8), 9))


In [10]:
# functions passed to foreach are applied on the workers, NOT on the driver
# the following prints nothing here (the 'print' happens on the workers)
rdd_numbers.foreach(print)

In [19]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

df = sqlContext.read.csv('hdfs://namenode/Users/vagrant/Twitter-sentiment-self-drive-DFE.csv', encoding='ISO-8859-1', header=True)

In [20]:
df.take(5)

[Row(_unit_id='724227031', _golden='TRUE', _unit_state='golden', _trusted_judgments='236', _last_judgment_at=None, sentiment='5', sentiment:confidence='0.7579', our_id='10001', sentiment_gold='5', sentiment_gold_reason=None, text=None),
 Row(_unit_id='4"', _golden='Author is excited about the development of the technology.', _unit_state="Two places I'd invest all my money if I could: 3D printing and Self-driving cars!!!", _trusted_judgments=None, _last_judgment_at=None, sentiment=None, sentiment:confidence=None, our_id=None, sentiment_gold=None, sentiment_gold_reason=None, text=None),
 Row(_unit_id='724227032', _golden='TRUE', _unit_state='golden', _trusted_judgments='231', _last_judgment_at=None, sentiment='5', sentiment:confidence='0.8775', our_id='10002', sentiment_gold='5', sentiment_gold_reason=None, text=None),
 Row(_unit_id='4"', _golden='Author is excited that driverless cars will benefit the disabled.', _unit_state='Awesome! Google driverless cars will help the blind travel mo

In [21]:
df.columns

['_unit_id',
 '_golden',
 '_unit_state',
 '_trusted_judgments',
 '_last_judgment_at',
 'sentiment',
 'sentiment:confidence',
 'our_id',
 'sentiment_gold',
 'sentiment_gold_reason',
 'text']

In [28]:
# Do a little cleanup to free some memory
del rdd_tweets
del rdd_temp_C
del rdd_temp_K
del rdd_numbers
del rdd_numbers_reduced
sc.stop()
sc = SparkContext('spark://spark-master:7077', 'Test App')