In [14]:
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark=SparkSession.builder.appName('exer_spark').getOrCreate()

## Running the Average Friends by Age ##

In [32]:
f_data=sc.textFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\fakefriends.csv')
f_data.take(5)


['0,Will,33,385',
 '1,Jean-Luc,26,2',
 '2,Hugh,55,221',
 '3,Deanna,40,465',
 '4,Quark,68,21']

In [8]:
age_numf=f_data.map(lambda x:(x.split(',')[2],x.split(',')[3]))

In [31]:
age_num_map=age_numf.mapValues(lambda x: (x,1))
age_num_map.take(5)

[('33', ('385', 1)),
 ('26', ('2', 1)),
 ('55', ('221', 1)),
 ('40', ('465', 1)),
 ('68', ('21', 1))]

In [30]:
age_totalnum=age_num_map.reduceByKey(lambda x,y: (x[0] + y[0], x[1] + y[1]))
age_totalnum.take(5)

[('33', ('38574471275245356460294243463228410', 12)),
 ('26', ('228184282381145345293298492269254738312439184', 17)),
 ('40', ('4652544594071828438934940619817233567261286220', 17)),
 ('68', ('21264112490481217189206293423', 10)),
 ('54', ('30725375440744123536939746272442115', 13))]

In [24]:
average_age=age_totalnum.mapValues(lambda x:float(x[0])/x[1])

average_age.take(5)

[('33', 3.214539272937113e+33),
 ('26', 1.3422604845949725e+43),
 ('40', 2.736790937689311e+44),
 ('68', 2.1264112490481217e+27),
 ('54', 2.3634904185187785e+33)]

## wordcount of a text file with regex ##

In [296]:
lines=sc.textFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\Book.txt')
#words=lines.flatMap(lambda x:x.split())
#words.take(10) # there are many special characters and blanks (seen thru collect()),so we need to use regex to ensure only valid words are selected

        

['Self-Employment:',
 'Building',
 'an',
 'Internet',
 'Business',
 'of',
 'One',
 'Achieving',
 'Financial',
 'and']

In [309]:
import re

def regex_pr(line): # to filter special characters
    c=re.compile(r'\W+',re.UNICODE)
    return c.split(line.lower())  # after compiling regexp for detecting valid chars apply that to 
                                  # split line where capitals and smalls are treated equal. 
    #return re.split(c,line.lower())
    #re.split(<pattern,<text>) expects a text s argument.
    #re.compile(<pattern>,<flag>) and saving the resulting regular expression object for reuse is more efficient
    #when the expression will be used several times in a single program
    # re.UNICODE to specify the text has some unicode info . you can also write it as re.U.
 
   

In [310]:
words=lines.flatMap(regex_pr)  
words.take(10)

['self',
 'employment',
 'building',
 'an',
 'internet',
 'business',
 'of',
 'one',
 'achieving',
 'financial']

In [292]:
count_of_words=words.map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y)
count_of_words.take(5)


[('self', 111), ('an', 178), ('internet', 26), ('business', 383), ('of', 970)]

In [293]:
for c in count_of_words.take(5): # printing count of words
    print(c) 

('self', 111)
('an', 178)
('internet', 26)
('business', 383)
('of', 970)


In [102]:
sort_by_count=count_of_words.map(lambda x :(x[1],x[0])).sortByKey(False)
sort_by_count.first() # max count and related word

(1878, 'you')

In [294]:
for c in count_of_words.take(5): # formating result as word:count
    count = str(c[1])
    word = str(c[0])
    print(word+':'+count)


self:111
an:178
internet:26
business:383
of:970


## Find the Most Popular Superhero and coappearances in a Social Graph ##

In [157]:
def parseNames(line):
    fields = line.split('\"') #split on quotes to get key and name
    return (int(fields[0]), fields[1])

In [171]:
def countCoOccurences(line):
    #elements = line.strip().split()
    elements = line.split()
    return (int(elements[0]), len(elements) -1) # to subtract id (key) from list

In [163]:
names = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\marvel-names.txt")
names.take(5)

['1 "24-HOUR MAN/EMMANUEL"',
 '2 "3-D MAN/CHARLES CHAN"',
 '3 "4-D MAN/MERCURIO"',
 '4 "8-BALL/"',
 '5 "A"']

In [188]:
namesRdd = names.map(parseNames)
namesRdd.take(10) # contains key of superhero,name

[(1, '24-HOUR MAN/EMMANUEL'),
 (2, '3-D MAN/CHARLES CHAN'),
 (3, '4-D MAN/MERCURIO'),
 (4, '8-BALL/'),
 (5, 'A'),
 (6, "A'YIN"),
 (7, 'ABBOTT, JACK'),
 (8, 'ABCISSA'),
 (9, 'ABEL'),
 (10, 'ABOMINATION/EMIL BLO')]

In [164]:
lines = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\marvel-graph.txt")
lines.take(5)

['5988 748 1722 3752 4655 5743 1872 3413 5527 6368 6085 4319 4728 1636 2397 3364 4001 1614 1819 1585 732 2660 3952 2507 3891 2070 2239 2602 612 1352 5447 4548 1596 5488 1605 5517 11 479 2554 2043 17 865 4292 6312 473 534 1479 6375 4456 ',
 '5989 4080 4264 4446 3779 2430 2297 6169 3530 3272 4282 6432 2548 4140 185 105 3878 2429 1334 4595 2767 3956 3877 4776 4946 3407 128 269 5775 5121 481 5516 4758 4053 1044 1602 3889 1535 6038 533 3986 ',
 '5982 217 595 1194 3308 2940 1815 794 1503 5197 859 5096 6039 2664 651 2244 528 284 1449 1097 1172 1092 108 3405 5204 387 4607 4545 3705 4930 1805 4712 4404 247 4754 4427 1845 536 5795 5978 533 3984 6056 ',
 '5983 1165 3836 4361 1282 716 4289 4646 6300 5084 2397 4454 1913 5861 5485 ',
 '5980 2731 3712 1587 6084 2472 2546 6313 875 859 323 2664 1469 522 2506 2919 2423 3624 5736 5046 1787 5776 3245 3840 2399 ']

In [172]:
pairings = lines.map(countCoOccurences)
pairings.take(10)

[(5988, 48),
 (5989, 40),
 (5982, 42),
 (5983, 14),
 (5980, 24),
 (5981, 17),
 (5986, 142),
 (5987, 81),
 (5984, 41),
 (5985, 19)]

In [177]:
totalFriendsByCharacter = pairings.reduceByKey(lambda x, y : x + y)


[(48, 5988),
 (42, 5982),
 (24, 5980),
 (142, 5986),
 (41, 5984),
 (13, 6294),
 (42, 270),
 (45, 272),
 (410, 274),
 (15, 276)]

In [313]:
mostpopular=totalFriendsByCharacter.sortBy(lambda x:x[1],False).first() #sort value for each key in descending order
mostpopular #key of superhero,occurences

(859, 1933)

In [214]:
# alternate way of finding max value of key is using max()

flipped = totalFriendsByCharacter.map(lambda x : (x[1],x[0]))
flipped.take(10)
mostpopular1=flipped.max() # max number of coappearances ,max() operate on key of k,v pair or max(<key function>) where key can be changed
mostpopular1 #contains count of friends,key of superhero

(1933, 859)

In [218]:
# to get matching name from id (key

#for name in namesRdd.collect():
    #if mostpopular1[1]==name[0]: # mostpopular[0]==name[0]:
       #print(name[1])
        
#alternate way 

print('popular name is '+namesRdd.lookup(mostpopular[0])[0]+' with coappearances='+ str(mostpopular[1])) # lookup(<value>),to lookup for a value in rdd. Return list of matches if value found
                                                                                                         #[0] after lookup() - to display value of list

#print('popular name is '+namesRdd.lookup(mostpopular1[1])[0]+' with coappearances='+ str(mostpopular1[0]))




popular name is CAPTAIN AMERICA with coappearances=1933


## Total-Amount-By-Customer##

In [219]:
cust = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\customer-orders.csv")

In [226]:
cust.take(5)


['44,8602,37.19',
 '35,5368,65.89',
 '2,3391,40.64',
 '47,6694,14.98',
 '29,680,13.08']

In [241]:
def parseLine(line):
    fields = line.split(',')
    customerId = int(fields[0])
    itemId = fields[1]
    itemAmt = float(fields[2])
    return (customerId, itemAmt)

In [257]:
#orders = cust.map(parseLine)
id_amt=cust.map(lambda line: (int(line.split(',')[0]),float(line.split(',')[2])))
totals = id_amt.reduceByKey(lambda x, y: x + y)
totals.take(5)

[(44, 4756.890000000001),
 (2, 5994.59),
 (70, 5368.249999999999),
 (14, 4735.030000000001),
 (42, 5696.840000000002)]

In [278]:
for result in totals.take(5):
    print("{:.2f}".format(result[1]),"\t{:.2f}".format(result[0])) #:.2f for formatting in placeholder and then using format() to display result

4756.89 	44.00
5994.59 	2.00
5368.25 	70.00
4735.03 	14.00
5696.84 	42.00


## Min/Max -Temperatures per station ID## <same logic formax,below is only for min>

In [316]:
lines = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\1800.csv")
lines.take(5)

['ITE00100554,18000101,TMAX,-75,,,E,',
 'ITE00100554,18000101,TMIN,-148,,,E,',
 'GM000010962,18000101,PRCP,0,,,E,',
 'EZE00100082,18000101,TMAX,-86,,,E,',
 'EZE00100082,18000101,TMIN,-135,,,E,']

In [333]:
# To take off required fields and assigning them to variable,create function where split line on a pattern and assign variables to reqd fields.

def parseLine(line):
    fields = line.split(',')
    stationID = fields[0]
    entryType = fields[2]
    temperature = float(fields[3]) * 0.1 * (9.0 / 5.0) + 32.0
    return (stationID, entryType, temperature)


In [335]:
parsedLines=lines.map(parseLine) # use the function to actual line element of rdd
parsedLines.take(5)

[('ITE00100554', 'TMAX', 18.5),
 ('ITE00100554', 'TMIN', 5.359999999999999),
 ('GM000010962', 'PRCP', 32.0),
 ('EZE00100082', 'TMAX', 16.52),
 ('EZE00100082', 'TMIN', 7.699999999999999)]

In [359]:
minTemps = parsedLines.filter(lambda x: "TMIN" in x[1]) # check if TMIN as a 1st pos value in each element of rdd,hence use filter() to filter out such elements of rdd
minTemps.take(5)

[('ITE00100554', 'TMIN', 5.359999999999999),
 ('EZE00100082', 'TMIN', 7.699999999999999),
 ('ITE00100554', 'TMIN', 9.5),
 ('EZE00100082', 'TMIN', 8.599999999999998),
 ('ITE00100554', 'TMIN', 23.72)]

In [360]:
id_temp=minTemps.map(lambda x:(x[0],x[2]))
id_temp.take(5)

[('ITE00100554', 5.359999999999999),
 ('EZE00100082', 7.699999999999999),
 ('ITE00100554', 9.5),
 ('EZE00100082', 8.599999999999998),
 ('ITE00100554', 23.72)]

In [361]:
minTemps = id_temp.reduceByKey(lambda x,y: x if x<y else y) # to calculate min of values (temps) per key
minTemps.collect()

[('ITE00100554', 5.359999999999999), ('EZE00100082', 7.699999999999999)]

In [376]:
for m in minTemps.collect(): #display min temperature per station
    print('station Id '+m[0]+' has min temp of {:.2f}F'.format(m[1]))
    
#to get lowest temperature and related station ID and of all stations.

m=minTemps.min(lambda x:x[1]) # to get min temperature pair

print('\nstation Id '+m[0]+' has lowest temp of {:.2f}F '.format(m[1])+'of all stations')

station Id ITE00100554 has min temp of 5.36F
station Id EZE00100082 has min temp of 7.70F

station Id ITE00100554 has lowest temp of 5.36F of all stations


## some more spark opertions on a csv file ##

In [391]:
autoData = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\auto-data.csv")
autoData.cache() #action (to uncache use autoData.unpersist())
autoData.is_cached #to check id rdd is cached #action

True

In [398]:
autoData.take(5) #action

['MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,two,hatchback,fwd,four,69,4900,31,36,5118',
 'chevrolet,gas,std,two,hatchback,fwd,three,48,5100,47,53,5151',
 'mazda,gas,std,two,hatchback,fwd,four,68,5000,30,31,5195',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348']

In [403]:
autoData.count() #action

198

In [402]:
autoData.first() #action

'MAKE,FUELTYPE,ASPIRE,DOORS,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE'

In [413]:
#autoData.saveAsTextFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\auto-data-saved') # this will create a folder (auto-data-saved) auto-data-saved with multiple part text files

autoData.coalesce(1).saveAsTextFile('file:///D:\\spark-2.3.2-bin-hadoop2.7\\auto-data-saved.csv') # saveAsTextFile(<ouputfolder>)

#to avoid multiple part test files being created instead 1 part text file in auto-data-saved folder,
#use coalesce(<num of partitions>) to combine parttions together in specified num of partitions.
# Do not repartition() as that will cause shuffle hence expensive



In [422]:
# saving rdd to file without creating folder and then file

f=open("D:\\auto-data-saved.csv","w") # create new empty file in write mode

f.write("\n".join(autoData.collect())) # <str1>.join(<iterable>) will join str1 with each element of iterable.In this case \n to move each element of list to new row
#autoDatacollect() will collect rdd to master and cerate list of elements and then join it with empty file 

f.close() 

In [423]:
tsvData=autoData.map(lambda x : x.replace(",","\t")) #replace() in map()
tsvData.take(5)

['MAKE\tFUELTYPE\tASPIRE\tDOORS\tBODY\tDRIVE\tCYLINDERS\tHP\tRPM\tMPG-CITY\tMPG-HWY\tPRICE',
 'subaru\tgas\tstd\ttwo\thatchback\tfwd\tfour\t69\t4900\t31\t36\t5118',
 'chevrolet\tgas\tstd\ttwo\thatchback\tfwd\tthree\t48\t5100\t47\t53\t5151',
 'mazda\tgas\tstd\ttwo\thatchback\tfwd\tfour\t68\t5000\t30\t31\t5195',
 'toyota\tgas\tstd\ttwo\thatchback\tfwd\tfour\t62\t4800\t35\t39\t5348']

In [425]:
toyotaData=autoData.filter(lambda x: "toyota" in x) # filter()
toyotaData.count()

32

In [427]:
toyotaData.take(5)

['toyota,gas,std,two,hatchback,fwd,four,62,4800,35,39,5348',
 'toyota,gas,std,two,hatchback,fwd,four,62,4800,31,38,6338',
 'toyota,gas,std,four,hatchback,fwd,four,62,4800,31,38,6488',
 'toyota,gas,std,four,wagon,fwd,four,62,4800,31,37,6918',
 'toyota,gas,std,four,sedan,fwd,four,70,4800,30,37,6938']

In [426]:
words=toyotaData.flatMap(lambda line: line.split(",")) #flatMap()
#words.count() #only one action at a time
words.take(20)

['toyota',
 'gas',
 'std',
 'two',
 'hatchback',
 'fwd',
 'four',
 '62',
 '4800',
 '35',
 '39',
 '5348',
 'toyota',
 'gas',
 'std',
 'two',
 'hatchback',
 'fwd',
 'four',
 '62']

In [432]:
collData = sc.parallelize([4,3,8,5,8])

for numbData in collData.distinct().take(5): # print distinct data ,keep one out of duplicates
    print(numbData)

4
8
5
3


In [437]:
collData.reduce(lambda x,y: x+y)

28

In [438]:
autoData.reduce(lambda x,y: x if len(x) < len(y) else y) # find shortest line

'bmw,gas,std,two,sedan,rwd,six,182,5400,16,22,41315'

In [442]:
cylData = autoData.map(lambda x: (x.split(",")[0], x.split(",")[7]))
cylData.take(5)
cylData.keys().take(5) # action #using take() to view result of action

['MAKE', 'subaru', 'chevrolet', 'mazda', 'toyota']

In [444]:
##Remove header row

header = cylData.first()
cylHPData= cylData.filter(lambda line: line != header)
cylHPData.first() # check if first line is header or not

('subaru', '69')

## Find average by Brand of vehicle##

In [455]:
comb_bykey = cylHPData.combineByKey((lambda x: (x,1)),(lambda x,value:(x[0]+value,x[1]+1)),(lambda x,y:(x[0]+y[0],x[1]+y[1])))
comb_bykey.take(5)


[('chevrolet', ('487070', 3)),
 ('mazda', ('6868686868848484841018410110113512072', 16)),
 ('mitsubishi', ('686868881028888116116116145145145', 13)),
 ('nissan', ('696969556969696969699797152152152160160200', 18)),
 ('dodge', ('686868686810288145', 8))]

In [458]:
avg=comb_bykey.mapValues(lambda x:float(x[0])/float(x[1]))
avg.take(5)

[('chevrolet', 162356.66666666666),
 ('mazda', 4.292929293030303e+35),
 ('mitsubishi', 5.283606777145293e+31),
 ('nissan', 3.872053094276094e+40),
 ('dodge', 8.585858585128602e+16)]

In [None]:
#get average using reduce function and a user generated function

def getMPG(autoStr) :
    if isinstance(autoStr, int) : # check if autoStr is integer type ,using isinstance()
        return autoStr
    attList=autoStr.split(",")
    if attList[9].isdigit() : # check if attList[9] is [0-9] ,using isdigit()
        return int(attList[9])
    else:
        return 0

#find average MPG-City for all cars

autoData.reduce(lambda x,y : getMPG(x) + getMPG(y)) / (autoData.count()-1.0)  # suv=btact 1 to not account header

In [463]:
#Using functions for transformation
#cleanse and transform an RDD

def cleanseRDD(autoStr) :
    if isinstance(autoStr, int) :
        return autoStr
    attList=autoStr.split(",")
    
    #convert doors to a number str
    if attList[3] == "two" :
         attList[3]="2"
    else :
         attList[3]="4"
    
    #Convert Drive to uppercase    
    attList[5] = attList[5].upper()
    #return attList # when this func will be caleed in map() then every line will be a list with above transformations 
    
    # to display entire rdd as it will be before applying map() i.e when loaded from textfile with above transformations applied,
    #then use join()
    
    return ",".join(attList)
    
cleanedData=autoData.map(cleanseRDD)
cleanedData.take(5)

['MAKE,FUELTYPE,ASPIRE,4,BODY,DRIVE,CYLINDERS,HP,RPM,MPG-CITY,MPG-HWY,PRICE',
 'subaru,gas,std,2,hatchback,FWD,four,69,4900,31,36,5118',
 'chevrolet,gas,std,2,hatchback,FWD,three,48,5100,47,53,5151',
 'mazda,gas,std,2,hatchback,FWD,four,68,5000,30,31,5195',
 'toyota,gas,std,2,hatchback,FWD,four,62,4800,35,39,5348']

In [436]:
#Set operations on rdd

words1 = sc.parallelize(["hello","war","peace","world"])
words2 = sc.parallelize(["war","peace","universe"])

for u in words1.union(words2).distinct().collect(): #remove duplicates after union()
    print(u)

print("\n")

for i in words1.intersection(words2).collect(): #intersection() results in common elements
    print(i)

hello
universe
peace
world
war


peace
war


## function that splits the line as well as counts sedans and hatchbacks - using accumulator and broadcast##


In [468]:
##Speed optimization##

#Initialize accumulator
sedanCount = sc.accumulator(0)  #sc.accumulator(<initialvalue>)
hatchbackCount =sc.accumulator(0) 

#Set Broadcast variable

sedanText=sc.broadcast("sedan") #sc.broadcast(<broadcastvalue>)
hatchbackText=sc.broadcast("hatchback")


In [469]:
def splitLines(line) :

    global sedanCount # declare accumulator var as global
    global hatchbackCount # declare accumulator var as global

    # check if broadcast var.value i.e 'sedan' is in line which is passed as argument ,
    #if found then increase accu var.value by 1 else it will remain 0
    
    if sedanText.value in line: # if 'sedan' in line: (if broadcast and accumulator concept not used)
        sedanCount +=1 # to get count of 'sedan' in each line passd as argument
        
    if hatchbackText.value in line:
        hatchbackCount +=1 # to get count of 'hatchback' in each line passd as argument
        
    return line.split(",")

In [470]:
#do the map
splitData=autoData.map(splitLines)

splitData.count() # count elements of rdd

print(sedanCount, hatchbackCount) # print accu and broad var set using func splitLines()

92 67


## total amnt per customer id and display smalledt amnt on top##

In [472]:
cust_ord = sc.textFile("file:///D:\\spark-2.3.2-bin-hadoop2.7\\customer-orders.csv")
cust_ord.take(5)

['44,8602,37.19',
 '35,5368,65.89',
 '2,3391,40.64',
 '47,6694,14.98',
 '29,680,13.08']

In [475]:
def parseLine(line):
    fields = line.split(",")
    return (int(fields[0]), int(float(fields[2]) * 100))


In [476]:
c = cust_ord.map(parseLine)
c.take(5)

[(44, 3719), (35, 6589), (2, 4064), (47, 1498), (29, 1308)]

In [477]:
customerOrdersSum = c.reduceByKey(lambda x, y: x + y) # get total amnt per cust id

In [481]:
customerOrdersSorted = customerOrdersSum.map(lambda x: (x[1], x[0])).sortByKey() #sort amnt
customerOrdersSorted.take(5)

[(330937, 45), (379053, 79), (392417, 96), (404260, 23), (417222, 99)]

In [484]:
for i in customerOrdersSorted.take(5):
    print('customer id: '+ str(i[1])+'with amnt: '+str(i[0]/100))

customer id: 45with amnt: 3309.37
customer id: 79with amnt: 3790.53
customer id: 96with amnt: 3924.17
customer id: 23with amnt: 4042.6
customer id: 99with amnt: 4172.22


In [None]:
# RDD can be created from list containing dict,or list or tuple

## Making a Simple DataFrame from a Tuple List##

In [2]:
a_list = [('a', 1), ('b', 2), ('c', 3)]

In [3]:
df = spark.createDataFrame(a_list) # schema not given
df.show()

+---+---+
| _1| _2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+



In [6]:
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)



In [4]:
df1=spark.createDataFrame(a_list,['let','num']) # schema given (infer schema)
df1.show()

+---+---+
|let|num|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+



In [5]:
df1.printSchema()

root
 |-- let: string (nullable = true)
 |-- num: long (nullable = true)



## Making a Simple DataFrame from a Dictionary##

In [8]:
a_dict = [{'letters': 'a', 'numbers': 1},
          {'letters': 'b', 'numbers': 2},
          {'letters': 'c', 'numbers': 3}]

In [10]:
df2=spark.createDataFrame(a_dict) # no schema given infer schema) .Warning but still dataframe will be created
df2.show() # by default 20 rows

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



## Making a Simple DataFrame Using a StructType Schema + RDD##

In [15]:
schema=StructType([StructField('letters', StringType(), True),StructField('numbers', IntegerType(), True)])

In [17]:
rdd1=sc.parallelize(a_list)
df1=spark.createDataFrame(rdd1,schema) # schema programatically gieven

In [18]:
df1.show()

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



## Simple Inspection Functions:##

In [19]:
df1.columns

['letters', 'numbers']

In [20]:
df1.dtypes

[('letters', 'string'), ('numbers', 'int')]

In [21]:
df1.schema

StructType(List(StructField(letters,StringType,true),StructField(numbers,IntegerType,true)))

In [22]:
df1.first()

Row(letters='a', numbers=1)

In [25]:
df1.head(5) #default 1 row

[Row(letters='a', numbers=1),
 Row(letters='b', numbers=2),
 Row(letters='c', numbers=3)]

In [26]:
df1.take(5)

[Row(letters='a', numbers=1),
 Row(letters='b', numbers=2),
 Row(letters='c', numbers=3)]

In [27]:
df1.describe().show()

+-------+-------+-------+
|summary|letters|numbers|
+-------+-------+-------+
|  count|      3|      3|
|   mean|   null|    2.0|
| stddev|   null|    1.0|
|    min|      a|      1|
|    max|      c|      3|
+-------+-------+-------+



In [29]:
df1.explain()

== Physical Plan ==
Scan ExistingRDD[letters#40,numbers#41]


In [30]:
display(df1) # displays type of object

DataFrame[letters: string, numbers: int]

In [44]:
df1['letters','numbers'].orderBy(col('letters').desc()).show() # A way to sort column in dataframe without using select()

+-------+-------+
|letters|numbers|
+-------+-------+
|      c|      3|
|      b|      2|
|      a|      1|
+-------+-------+



## Let's use these functions:##

##unionAll()/union(): combine two DataFrames together ##
##orderBy()/sort(): perform sorting of DataFrame columns ##
##select(): select which DataFrame columns to retain ##
##drop(): select a single DataFrame column to remove ##
##filter(): retain DataFrame rows that match a condition ##

In [46]:
df1.unionAll(df1).show() # unionAll() produce duplicates

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [52]:
#df1.select('letters','numbers').show()
#df1.select(col('letters'),col('numbers')).show()
#df1.select(df1['letters'],df1['numbers']).show()
df1.select(['letters','numbers']).show()  # selecting muliple columns

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [59]:
df1.show()

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [55]:
df1.union(df1).show() # union() also produce duplicates

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [94]:
df1.union(df1).distinct().show() # remove duplicates

+-------+-------+
|letters|numbers|
+-------+-------+
|      b|      2|
|      a|      1|
|      c|      3|
+-------+-------+



In [95]:
df1.union(df1).dropDuplicates().show() # remove duplicates

+-------+-------+
|letters|numbers|
+-------+-------+
|      b|      2|
|      a|      1|
|      c|      3|
+-------+-------+



In [68]:
df1.orderBy('numbers').show() 

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [64]:
df1.orderBy(col('numbers').desc()).show() # using col() & desc()

+-------+-------+
|letters|numbers|
+-------+-------+
|      c|      3|
|      b|      2|
|      a|      1|
+-------+-------+



In [73]:
df1.orderBy('numbers',ascending=False).show()# imp to write 'ascending=' instead of just False

+-------+-------+
|letters|numbers|
+-------+-------+
|      c|      3|
|      b|      2|
|      a|      1|
+-------+-------+



In [70]:
df1.sort(col('numbers')).show() # using sort()

df1.sort(col('numbers').desc()).show() # descending using sort()

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
|      c|      3|
+-------+-------+



In [87]:
df1.drop('letters').show()

+-------+
|numbers|
+-------+
|      1|
|      2|
|      3|
+-------+



In [88]:
# Here is some numeric filtering with comparison operators
# (>, <, >=, <=, ==, != all work)

df1.filter(df1.numbers>1).show()

+-------+-------+
|letters|numbers|
+-------+-------+
|      b|      2|
|      c|      3|
+-------+-------+



In [86]:
df1.filter((df1.numbers>1) & (df1.numbers<3)).show() # multiple conditions in individual brackets

# use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions

+-------+-------+
|letters|numbers|
+-------+-------+
|      b|      2|
+-------+-------+



In [91]:
df1.filter(col('letters').isin(['a', 'b'])).show() # alternate way of filtering data using isin() 

+-------+-------+
|letters|numbers|
+-------+-------+
|      a|      1|
|      b|      2|
+-------+-------+



In [92]:
df1.filter(col('letters').isin(['a', 'b'])==False).show() # to implement is not in.

+-------+-------+
|letters|numbers|
+-------+-------+
|      c|      3|
+-------+-------+



## Using groupBy(): ##
    
#count(): counts the number of records for each group#
#sum(): compute the sum for each numeric column for each group#
#min(): computes the minimum value for each numeric column for each group#
#max(): computes the maximum value for each numeric column for each group#
#avg() or mean(): computes average values for each numeric columns for each group#
#pivot(): pivots a column of the current DataFrame and perform the specified aggregation#

In [116]:
nycflights_schema = StructType([
  StructField('year', IntegerType(), True),
  StructField('month', IntegerType(), True),
  StructField('day', IntegerType(), True),
  StructField('dep_time', StringType(), True),
  StructField('dep_delay', IntegerType(), True),
  StructField('arr_time', StringType(), True),
  StructField('arr_delay', IntegerType(), True),
  StructField('carrier', StringType(), True),
  StructField('tailnum', StringType(), True),
  StructField('flight', StringType(), True),  
  StructField('origin', StringType(), True),
  StructField('dest', StringType(), True),
  StructField('air_time', IntegerType(), True),
  StructField('distance', IntegerType(), True),
  StructField('hour', IntegerType(), True),
  StructField('minute', IntegerType(), True)
  ])

nycflights = \
(spark
 .read
 .format('csv')
 .options(header = True,inferSchema=True) # if inferSchema-=True is not specified then spark will read all column as string type column
 .load('file:///D:\\spark-2.3.2-bin-hadoop2.7\\nycflights13.csv'))

In [117]:
nycflights.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2013|    1|  1|     517|        2|     830|       11|     UA| N14228|  1545|   EWR| IAH|     227|    1400|   5|    17|
|2013|    1|  1|     533|        4|     850|       20|     UA| N24211|  1714|   LGA| IAH|     227|    1416|   5|    33|
|2013|    1|  1|     542|        2|     923|       33|     AA| N619AA|  1141|   JFK| MIA|     160|    1089|   5|    42|
|2013|    1|  1|     544|       -1|    1004|      -18|     B6| N804JB|   725|   JFK| BQN|     183|    1576|   5|    44|
|2013|    1|  1|     554|       -6|     812|      -25|     DL| N668DN|   461|   LGA| ATL|     116|     762|   5|    54|
|2013|    1|  1|     554|       -4|     

In [118]:
nycflights.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- dep_time: integer (nullable = true)
 |-- dep_delay: integer (nullable = true)
 |-- arr_time: integer (nullable = true)
 |-- arr_delay: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: integer (nullable = true)
 |-- distance: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- minute: integer (nullable = true)



In [119]:
display(nycflights)

DataFrame[year: int, month: int, day: int, dep_time: int, dep_delay: int, arr_time: int, arr_delay: int, carrier: string, tailnum: string, flight: int, origin: string, dest: string, air_time: int, distance: int, hour: int, minute: int]

In [121]:
nycflights.groupby('month').count().show() # creates a new column with aggregate `count` values and groupBy Column


+-----+-----+
|month|count|
+-----+-----+
|   12|28135|
|    1|27004|
|    6|28243|
|    3|28834|
|    5|28796|
|    9|27574|
|    4|28330|
|    8|29327|
|    7|29425|
|   10|28889|
|   11|27268|
|    2|24951|
+-----+-----+



In [124]:
nycflights.groupby('month').agg({'dep_delay': 'avg', 'arr_delay': 'max'}).show() # multiple aggreagtion func on diff columns

+-----+--------------+------------------+
|month|max(arr_delay)|    avg(dep_delay)|
+-----+--------------+------------------+
|   12|           878|16.576687569162672|
|    1|          1272|10.036665030396858|
|    6|          1127|20.846331791143424|
|    3|           915|13.227076109105209|
|    5|           875|12.986859348988771|
|    9|          1007|6.7224762185679525|
|    4|           931|13.938037741305763|
|    8|           490|12.611039839117922|
|    7|           989|21.727786554326837|
|   10|           688| 6.243988413080655|
|   11|           796|  5.43536156833734|
|    2|           834|10.816842549598986|
+-----+--------------+------------------+



In [131]:
nycflights.groupby('month', 'origin', 'dest').count().orderBy('month', 'count',ascending = [1, 0]).show(10)

#nycflights.groupby(['month', 'origin', 'dest']).count().orderBy(['month', 'count'],ascending = [1, 0]).show(10)

# group by on multiple columns                          
# perform a 'count' aggregation on the groups
#orderBY on multiple col with diff sorting order for each col
#ascending=[1,0] means ascending is true for 'month' col and false(i.e descending) for 'count' col.

           

+-----+------+----+-----+
|month|origin|dest|count|
+-----+------+----+-----+
|    1|   JFK| LAX|  937|
|    1|   LGA| ATL|  878|
|    1|   JFK| SFO|  671|
|    1|   LGA| ORD|  583|
|    1|   EWR| ORD|  502|
|    1|   JFK| BOS|  486|
|    1|   JFK| MCO|  456|
|    1|   LGA| MIA|  451|
|    1|   JFK| FLL|  439|
|    1|   LGA| DFW|  437|
+-----+------+----+-----+
only showing top 10 rows



In [132]:
display(
  nycflights
  .groupBy('month')
  .count()
)

DataFrame[month: int, count: bigint]

In [136]:
x=nycflights.groupBy('carrier').pivot('origin').avg('dep_delay') # groupBy() with pivot() and aggregation.

x.show()

# pivot(col name)- will produce pivot col as oe col grouped values as one column and pivot unique column values 
# null will be the value fof pivot col if aggregation can't be done
#as different columns having calculate aggregation accordingly

+-------+------------------+------------------+------------------+
|carrier|               EWR|               JFK|               LGA|
+-------+------------------+------------------+------------------+
|     UA| 12.52286865854727|               7.9|12.087916294500447|
|     AA|10.035419126328216|10.302155109221522| 6.705769103100312|
|     EV| 20.16493117893477|18.520361990950228| 19.12549969715324|
|     B6|13.100262224278882|12.757453126122458|14.805738396624472|
|     DL|12.084592145015106| 8.333187709334497|  9.57299733123332|
|     OO|20.833333333333332|              null|10.434782608695652|
|     F9|              null|              null|20.215542521994134|
|     YV|              null|              null|18.996330275229358|
|     US| 3.735103926096998| 5.866958571909734|3.3065054875139177|
|     MQ|17.467267552182165|13.199970870958346| 8.528568781271234|
|     HA|              null| 4.900584795321637|              null|
|     AS| 5.804775280898877|              null|              n

## Column Operations ##

format_number(): apply formatting to a number, rounded to d decimal places, and return the result as a string
when() & otherwise(): when() evaluates a list of conditions and returns one of multiple possible result expressions; if otherwise() is not invoked, None is returned for unmatched conditions
concat_ws(): concatenates multiple input string columns together into a single string column, using the given separator
to_utc_timestamp(): assumes the given timestamp is in given timezone and converts to UTC
year(): extracts the year of a given date as integer
month(): extracts the month of a given date as integer
dayofmonth(): extracts the day of the month of a given date as integer
hour(): extract the hour of a given date as integer
minute(): extract the minute of a given date as integer