In [105]:
from pyspark.sql import Row
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType


In [146]:
# Load the pipe-delimited deck-of-cards text file into an RDD of raw lines.
# NOTE(review): sourcePath is an absolute, machine-specific directory.
sourcePath = "/Users/pravinkumar/Documents/Spark/testData/cards/"
textRDD = sc.textFile(sourcePath + "deckofcards.txt")

# Preview the first ten lines, then print a 75-character separator.
for i in textRDD.take(10):
    print(i)
print("*" * 75)

def fun(x):
    """Tally how many items of ``x`` equal each of the four suit names.

    Args:
        x: An iterable of values; it is traversed exactly once.

    Returns:
        A dict with the keys "SPADE", "CLUB", "DIAMOND" and "HEART" (in that
        order), each mapped to the number of items equal to that key.
        Items that match none of the four names are ignored.
    """
    tally = dict.fromkeys(("SPADE", "CLUB", "DIAMOND", "HEART"), 0)
    for card in x:
        # Compare against the suit names in order and stop at the first match.
        for suit in tally:
            if card == suit:
                tally[suit] += 1
                break
    return tally

# Using GroupByKey: key each record by its first field (color), group the
# second fields (suits) under that key, and tally them with fun.
textRDDGroupBy = (
    textRDD
    .map(lambda line: line.split("|"))
    .map(lambda fields: (fields[0], fields[1]))
    .groupByKey()
    .mapValues(fun)
)

for i in textRDDGroupBy.take(10):
    print(i)
print("*" * 75)

# Using ReduceByKey: count the records for every (first field, second field) pair.
textRDDReduceBy = (
    textRDD
    .map(lambda line: line.split("|"))
    .map(lambda fields: ((fields[0], fields[1]), 1))
    .reduceByKey(lambda total, one: total + one)
)

for i in textRDDReduceBy.take(10):
    print(i)
print("*" * 75)

# Word count in the file: every pipe-separated token is counted as one word.
wordCountRDD = (
    textRDD
    .flatMap(lambda line: line.split("|"))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda total, one: total + one)
)

for i in wordCountRDD.take(10):
    print(i)
print("*" * 75)

BLACK|SPADE|2
BLACK|SPADE|3
BLACK|SPADE|4
BLACK|SPADE|5
BLACK|SPADE|6
BLACK|SPADE|7
BLACK|SPADE|8
BLACK|SPADE|9
BLACK|SPADE|10
BLACK|SPADE|J
***************************************************************************
('BLACK', {'CLUB': 13, 'DIAMOND': 0, 'SPADE': 13, 'HEART': 0})
('RED', {'CLUB': 0, 'DIAMOND': 13, 'SPADE': 0, 'HEART': 13})
***************************************************************************
(('BLACK', 'CLUB'), 13)
(('BLACK', 'SPADE'), 13)
(('RED', 'HEART'), 13)
(('RED', 'DIAMOND'), 13)
***************************************************************************
('10', 4)
('4', 4)
('CLUB', 13)
('SPADE', 13)
('J', 4)
('BLACK', 26)
('K', 4)
('RED', 26)
('8', 4)
('9', 4)
***************************************************************************


In [154]:
# Demonstrating saveAsSequenceFile and reading SequenceFile

# Save every line as a (None, line) pair in SequenceFile format.
# NOTE(review): presumably these saves fail when the target directory already
# exists — confirm before re-running the cell.
Destinationpath = "/Users/pravinkumar/Documents/Spark/Results/"
textRDD.map(lambda line: (None, line)).saveAsSequenceFile(
    Destinationpath + "deckofcardsSeq"
)

# Read the SequenceFile back and preview the first ten pairs.
SeqRDD = sc.sequenceFile(Destinationpath + "deckofcardsSeq")

for i in SeqRDD.take(10):
    print(i)
print("*" * 75)

# Demonstrating Saving and Reading SequenceFile Using saveAsNewAPIHadoopFile

# Same (None, line) pairs, written through the new-API SequenceFile output format.
textRDD.map(lambda line: (None, line)).saveAsNewAPIHadoopFile(
    Destinationpath + "deckofcardsSeqWithNewAPI",
    "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
    keyClass=None,
    valueClass="org.apache.hadoop.io.Text",
)

# Read the new-API output back with the regular SequenceFile reader.
largedeckSeqWithNewAPI = sc.sequenceFile(Destinationpath + "deckofcardsSeqWithNewAPI")

for i in largedeckSeqWithNewAPI.take(10):
    print(i)
print("*" * 75)

(None, 'BLACK|SPADE|2')
(None, 'BLACK|SPADE|3')
(None, 'BLACK|SPADE|4')
(None, 'BLACK|SPADE|5')
(None, 'BLACK|SPADE|6')
(None, 'BLACK|SPADE|7')
(None, 'BLACK|SPADE|8')
(None, 'BLACK|SPADE|9')
(None, 'BLACK|SPADE|10')
(None, 'BLACK|SPADE|J')
***************************************************************************
(None, 'BLACK|SPADE|2')
(None, 'BLACK|SPADE|3')
(None, 'BLACK|SPADE|4')
(None, 'BLACK|SPADE|5')
(None, 'BLACK|SPADE|6')
(None, 'BLACK|SPADE|7')
(None, 'BLACK|SPADE|8')
(None, 'BLACK|SPADE|9')
(None, 'BLACK|SPADE|10')
(None, 'BLACK|SPADE|J')
***************************************************************************


In [162]:
# Converting RDD to DF  
from pyspark.sql import Row
from pyspark.sql import types
#from pyspark.sql.types import StructField

# Method 1: apply an explicit three-column string schema to an RDD of
# [color, symbol, number] lists produced by splitting each line on "|".
schema = StructType([
    StructField("color", types.StringType(), True),
    StructField("symbol", types.StringType(), True),
    StructField("number", types.StringType(), True)
])
textRDDToDF01 = textRDD.map(lambda rec: rec.split("|"))
for i in textRDDToDF01.take(10): print(i)
print("*"*75)
textDF = sqlContext.createDataFrame(textRDDToDF01, schema)
for i in textDF.take(10): print(i)
print("*"*75)

# Method 2: build Row objects with named fields and let toDF() infer the schema.
# NOTE(review): the recorded run lists the inferred columns as color, number,
# symbol — do not rely on the keyword order used below.
textRDDToDF02 = (
    textRDD
    .map(lambda rec: rec.split("|"))
    .map(lambda rec: Row(color=rec[0], symbol=rec[1], number=rec[2]))
    .toDF()
)
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line after the schema).
textRDDToDF02.printSchema()
textRDDToDF02.show()
print("*"*75)
    


['BLACK', 'SPADE', '2']
['BLACK', 'SPADE', '3']
['BLACK', 'SPADE', '4']
['BLACK', 'SPADE', '5']
['BLACK', 'SPADE', '6']
['BLACK', 'SPADE', '7']
['BLACK', 'SPADE', '8']
['BLACK', 'SPADE', '9']
['BLACK', 'SPADE', '10']
['BLACK', 'SPADE', 'J']
***************************************************************************
Row(color='BLACK', symbol='SPADE', number='2')
Row(color='BLACK', symbol='SPADE', number='3')
Row(color='BLACK', symbol='SPADE', number='4')
Row(color='BLACK', symbol='SPADE', number='5')
Row(color='BLACK', symbol='SPADE', number='6')
Row(color='BLACK', symbol='SPADE', number='7')
Row(color='BLACK', symbol='SPADE', number='8')
Row(color='BLACK', symbol='SPADE', number='9')
Row(color='BLACK', symbol='SPADE', number='10')
Row(color='BLACK', symbol='SPADE', number='J')
***************************************************************************
root
 |-- color: string (nullable = true)
 |-- number: string (nullable = true)
 |-- symbol: string (nullable = true)

None
+-----+-----

In [163]:
# Writing the file to CSV, JSON, ORC, Parquet from DataFrame
#
# mode("overwrite") replaces output left behind by a previous run. With the
# default save mode the writer raises when the target path already exists,
# so this cell could only be executed once per results directory.
textDF.write.mode("overwrite").csv(Destinationpath + "deckofcardsCSV")
textDF.write.mode("overwrite").json(Destinationpath + "deckofcardsJSON")
textDF.write.mode("overwrite").orc(Destinationpath + "deckofcardsORC")
textDF.write.mode("overwrite").parquet(Destinationpath + "deckofcardsParquet")

In [193]:
# Results directory that holds the CSV/JSON/ORC/Parquet outputs read below.
Destinationpath = "/Users/pravinkumar/Documents/Spark/Results/"

# CSV: read back with no explicit schema, then show the schema and first rows.
# The recorded output shows the columns named _c0, _c1 and _c2.
largedeckCSVDF = sqlContext.read.csv(Destinationpath + "deckofcardsCSV")
largedeckCSVDF.printSchema()
largedeckCSVDF.show()
print("*" * 75)

#largedeckJSONDF = sqlContext.read.json(Destinationpath + "deckofcardsJSON")
#largedeckJSONDF.printSchema()
#largedeckJSONDF.show()
#print("*"*75)

# Converting DF to RDD -> list(data)
# Despite its name, largedeckJSONDF ends up holding an RDD of plain lists
# (one list per row), not a DataFrame.
largedeckJSONDF = (
    sqlContext.read.json(Destinationpath + "deckofcardsJSON")
    .rdd
    .map(list)
)
for i in largedeckJSONDF.take(10):
    print(i)
print("*" * 75)

#largedeckORCDF = sqlContext.read.orc(Destinationpath + "deckofcardsORC")
#largedeckORCDF.printSchema()
#largedeckORCDF.show()
#print("*"*75)

#largedeckParquetDF = sqlContext.read.parquet(Destinationpath + "deckofcardsParquet")
#largedeckParquetDF.printSchema()
#largedeckParquetDF.show()
print("*" * 75)

#sqlContext.read.format(Destinationpath +)
#sqlContext.read.load(Destinationpath +)


root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)

+-----+-----+---+
|  _c0|  _c1|_c2|
+-----+-----+---+
|BLACK|SPADE|  2|
|BLACK|SPADE|  3|
|BLACK|SPADE|  4|
|BLACK|SPADE|  5|
|BLACK|SPADE|  6|
|BLACK|SPADE|  7|
|BLACK|SPADE|  8|
|BLACK|SPADE|  9|
|BLACK|SPADE| 10|
|BLACK|SPADE|  J|
|BLACK|SPADE|  Q|
|BLACK|SPADE|  K|
|BLACK|SPADE|  A|
|BLACK| CLUB|  2|
|BLACK| CLUB|  3|
|BLACK| CLUB|  4|
|BLACK| CLUB|  5|
|BLACK| CLUB|  6|
|BLACK| CLUB|  7|
|BLACK| CLUB|  8|
+-----+-----+---+
only showing top 20 rows

***************************************************************************
['BLACK', '2', 'SPADE']
['BLACK', '3', 'SPADE']
['BLACK', '4', 'SPADE']
['BLACK', '5', 'SPADE']
['BLACK', '6', 'SPADE']
['BLACK', '7', 'SPADE']
['BLACK', '8', 'SPADE']
['BLACK', '9', 'SPADE']
['BLACK', '10', 'SPADE']
['BLACK', 'J', 'SPADE']
***************************************************************************
**************************************