In [1]:
import findspark

# Let findspark locate the Spark installation and expose pyspark to this
# kernel, so the `pyspark` imports in the following cells can resolve.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.accumulators import AccumulatorParam

In [4]:
class VectorAccumulatorParam(AccumulatorParam):
    """Accumulator helper that sums list-like vectors element by element."""

    def zero(self, value):
        """Return a list of ``0.0`` entries with the same length as ``value``."""
        return [0.0 for _ in range(len(value))]

    def addInPlace(self, v1, v2):
        """Add ``v2`` into ``v1`` index by index and return the mutated ``v1``.

        Only the first ``len(v1)`` positions of ``v2`` are read, so extra
        trailing entries in a longer ``v2`` are ignored, while a shorter
        ``v2`` fails on the first missing index (``IndexError`` for lists).
        """
        size = len(v1)
        position = 0
        while position < size:
            v1[position] += v2[position]
            position += 1
        return v1

In [9]:
from pyspark.sql.types import Row
from datetime import datetime

In [10]:
import pyspark

from pyspark.sql import SparkSession

# Reuse the active SparkSession when there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Smoke test: a one-row, one-column query confirms the session can run SQL.
df = spark.sql("select 'spark' as hello ")
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [11]:
from pyspark.sql import SparkSession
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists (an earlier cell creates one); in that case master/appName below do
# not restart or rename the underlying SparkContext.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExample.com")
    .getOrCreate()
)

# Three (id, address, Name) records used to build a small demo DataFrame.
address = [
    (1, "15851 Jeffrey Rd", "Alice"),
    (2, "Jeffrey Rd", "Bob"),
    (3, "1311 Ave", "CA"),
]
df = spark.createDataFrame(address, schema=["id", "address", "Name"])
df.show()

+---+----------------+-----+
| id|         address| Name|
+---+----------------+-----+
|  1|15851 Jeffrey Rd|Alice|
|  2|      Jeffrey Rd|  Bob|
|  3|        1311 Ave|   CA|
+---+----------------+-----+



In [12]:
import os

# NOTE(review): wildcard import kept so that any name other cells rely on stays
# defined; prefer explicit imports once the required names are known.
from pyspark import *
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as f

# Location of the demo CSV. Set the DEMO_CSV_PATH environment variable to run
# the notebook on another machine; the default is the original author's file.
DEMO_CSV_PATH = os.environ.get("DEMO_CSV_PATH", r"C:\Users\sanedunu\Desktop\demo.csv")

spark = SparkSession.builder.appName('pyspark - example read csv').getOrCreate()
sc = spark.sparkContext

# header=True takes the column names from the first line of the file. No schema
# is supplied or inferred, so every column (including `id`) is read as a string.
df = spark.read.option("header", True).csv(DEMO_CSV_PATH)
df.show()

+---+-------+------+
| id|   name|source|
+---+-------+------+
|  1|  Alice|  ngdb|
|  2|    bod|   jhg|
|  3|    sgf|   kjy|
| 45|   drfv|   jgf|
|  5|    hsr|   gfd|
|  6|    hfb|    fd|
|  7| hdfbgc|     h|
|  8|    jdg|   the|
|  9|jjegfdb|    hy|
| 10|   jdcg|   hfd|
+---+-------+------+



In [13]:
# Display the SparkContext obtained from the session (sc = spark.sparkContext).
sc

In [14]:
# Accumulator seeded with a three-element vector; element-wise merging is
# delegated to VectorAccumulatorParam.
vector_accum = sc.accumulator(
    [10.0, 20.0, 30.0],
    VectorAccumulatorParam(),
)
vector_accum.value

[10.0, 20.0, 30.0]

In [15]:
# Add a vector to the accumulator (same operation the `+=` operator performs).
# Not idempotent: re-running this cell adds the vector again.
vector_accum.add([1, 2, 3])
vector_accum.value

[11.0, 22.0, 33.0]

In [17]:
# Working on joins: left-hand table of (name, salary) rows.
valuesA = [
    ('John', 100000),
    ('James', 150000),
    ('Emily', 65000),
    ('Nina', 200000),
]
tableA = spark.createDataFrame(valuesA, schema=['name', 'salary'])

In [18]:
# Print the (name, salary) rows of tableA to check the data before joining.
tableA.show()

+-----+------+
| name|salary|
+-----+------+
| John|100000|
|James|150000|
|Emily| 65000|
| Nina|200000|
+-----+------+



In [19]:
# Right-hand table of (name, employee_id) rows; only some names also appear
# in tableA, which is what makes the join types below differ.
valuesB = [
    ('James', 2),
    ('Emily', 3),
    ('Darth Vader', 5),
    ('Princess Leia', 6),
]
tableB = spark.createDataFrame(valuesB, schema=['name', 'employee_id'])

In [20]:
# Print the (name, employee_id) rows of tableB to check the data before joining.
tableB.show()

+-------------+-----------+
|         name|employee_id|
+-------------+-----------+
|        James|          2|
|        Emily|          3|
|  Darth Vader|          5|
|Princess Leia|          6|
+-------------+-----------+



In [22]:
# Inner join: keep only the names present in both tables. Joining on a column
# expression (rather than a column name) keeps both `name` columns in the result.
inner_join = tableA.join(
    tableB,
    on=tableA.name == tableB.name,
    how="inner",
)
inner_join.show()

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|Emily| 65000|Emily|          3|
|James|150000|James|          2|
+-----+------+-----+-----------+



In [26]:
# Left join: every tableA row is kept; rows without a match in tableB get
# null in the tableB columns.
left_join = tableA.join(
    tableB,
    on=tableA.name == tableB.name,
    how="left",
)
left_join.show()

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|Emily| 65000|Emily|          3|
|James|150000|James|          2|
| John|100000| null|       null|
| Nina|200000| null|       null|
+-----+------+-----+-----------+



In [27]:
# Right join: every tableB row is kept; rows without a match in tableA get
# null in the tableA columns.
right_join = tableA.join(
    tableB,
    on=tableA.name == tableB.name,
    how="right",
)
right_join.show()

+-----+------+-------------+-----------+
| name|salary|         name|employee_id|
+-----+------+-------------+-----------+
| null|  null|  Darth Vader|          5|
|Emily| 65000|        Emily|          3|
|James|150000|        James|          2|
| null|  null|Princess Leia|          6|
+-----+------+-------------+-----------+



In [28]:
# Full outer join: rows from both tables are kept; whichever side has no
# match is filled with null.
full_outer_join = tableA.join(
    tableB,
    on=tableA.name == tableB.name,
    how="full",
)
full_outer_join.show()

+-----+------+-------------+-----------+
| name|salary|         name|employee_id|
+-----+------+-------------+-----------+
| null|  null|  Darth Vader|          5|
|Emily| 65000|        Emily|          3|
|James|150000|        James|          2|
| John|100000|         null|       null|
| Nina|200000|         null|       null|
| null|  null|Princess Leia|          6|
+-----+------+-------------+-----------+

