In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the SparkSession for this notebook, using the "local[1]"
# master and the application name below.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

In [3]:
# Sample words (with deliberate repeats) used to demonstrate RDD map().
data = [
    "Project", "Gutenberg’s", "Alice’s", "Adventures", "in", "Wonderland",
    "Project", "Gutenberg’s", "Adventures", "in", "Wonderland",
    "Project", "Gutenberg’s",
]

In [4]:
# Turn the Python word list into an RDD.
rdd = spark.sparkContext.parallelize(data)

In [5]:
# Pair every word with the count 1, then print each (word, 1) tuple.
rdd2 = rdd.map(lambda word: (word, 1))
for element in rdd2.collect():
    print(element)

('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)


In [6]:
# Sample rows: (firstname, lastname, gender, salary).
data = [
    ('James', 'Smith', 'M', 30),
    ('Anna', 'Rose', 'F', 41),
    ('Robert', 'Williams', 'M', 62),
]


In [7]:
# Build a DataFrame from the sample rows, naming the columns explicitly.
columns = ["firstname", "lastname", "gender", "salary"]
df = spark.createDataFrame(data=data, schema=columns)
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|    30|
|     Anna|    Rose|     F|    41|
|   Robert|Williams|     M|    62|
+---------+--------+------+------+



In [8]:
# Transform each row through the RDD API: join first and last name with a
# comma, keep gender unchanged, and double the salary. Fields are read by
# position (0=firstname, 1=lastname, 2=gender, 3=salary).
rdd2 = df.rdd.map(
    lambda row: (row[0] + "," + row[1], row[2], row[3] * 2)
)
df2 = rdd2.toDF(["name", "gender", "new_salary"])
df2.show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James,Smith|     M|        60|
|      Anna,Rose|     F|        82|
|Robert,Williams|     M|       124|
+---------------+------+----------+



In [9]:
# Same transformation as a named function, reading fields by attribute name.
def func1(x):
    """Convert one employee row into a (name, gender, salary) tuple.

    Args:
        x: A row-like object exposing ``firstname``, ``lastname``,
            ``gender`` and ``salary`` attributes.

    Returns:
        A 3-tuple of "firstname,lastname", the lower-cased gender, and the
        salary multiplied by two.
    """
    full_name = x.firstname + "," + x.lastname
    return (full_name, x.gender.lower(), x.salary * 2)

In [10]:
# Run func1 on every row of the DataFrame's RDD, wrapped in a lambda.
rdd2 = df.rdd.map(lambda row: func1(row))

In [11]:
# Alternative: pass func1 to map() directly. The lambda wrapper is not needed
# because func1 already takes a single row argument.
rdd2 = df.rdd.map(func1)

In [12]:
# Collect the transformed rows and print each
# (name, gender, salary) tuple on its own line.
for element in rdd2.collect():
    print(element)

('James,Smith', 'm', 60)
('Anna,Rose', 'f', 82)
('Robert,Williams', 'm', 124)


In [13]:
# Raw rows and column names for the Employees and Departments DataFrames.
# The None entries are deliberate: they show how joins treat NULL keys.
data_employees = [
    (1, "John", 1),
    (2, "Emma", 2),
    (3, "Raj", None),
    (4, "Nina", 4),
]
data_departments = [
    (1, "HR"),
    (2, "Tech"),
    (3, "Marketing"),
    (None, "Temp"),
]

columns_employees = ["emp_id", "emp_name", "dept_id"]
columns_departments = ["dept_id", "dept_name"]

# Materialize both sample datasets as DataFrames.
df_employees = spark.createDataFrame(data=data_employees, schema=columns_employees)
df_departments = spark.createDataFrame(data=data_departments, schema=columns_departments)

In [14]:
# Inner join employees to departments on matching dept_id.
# No join type is passed because `inner` is the default.
df_joined = df_employees.join(
    df_departments,
    on=df_employees["dept_id"] == df_departments["dept_id"],
)

In [15]:
# Display the inner-join result. Only employees whose dept_id equals a
# department's dept_id appear; NULL or unmatched keys are dropped.
df_joined.show()

+------+--------+-------+-------+---------+
|emp_id|emp_name|dept_id|dept_id|dept_name|
+------+--------+-------+-------+---------+
|     1|    John|      1|      1|       HR|
|     2|    Emma|      2|      2|     Tech|
+------+--------+-------+-------+---------+



In [16]:
# Cross join: pair every employee row with every department row, with no
# join condition (4 employees x 4 departments = 16 rows).
df_cross_joined = df_employees.crossJoin(df_departments)
df_cross_joined.show()

+------+--------+-------+-------+---------+
|emp_id|emp_name|dept_id|dept_id|dept_name|
+------+--------+-------+-------+---------+
|     1|    John|      1|      1|       HR|
|     1|    John|      1|      2|     Tech|
|     1|    John|      1|      3|Marketing|
|     1|    John|      1|   NULL|     Temp|
|     2|    Emma|      2|      1|       HR|
|     2|    Emma|      2|      2|     Tech|
|     2|    Emma|      2|      3|Marketing|
|     2|    Emma|      2|   NULL|     Temp|
|     3|     Raj|   NULL|      1|       HR|
|     3|     Raj|   NULL|      2|     Tech|
|     3|     Raj|   NULL|      3|Marketing|
|     3|     Raj|   NULL|   NULL|     Temp|
|     4|    Nina|      4|      1|       HR|
|     4|    Nina|      4|      2|     Tech|
|     4|    Nina|      4|      3|Marketing|
|     4|    Nina|      4|   NULL|     Temp|
+------+--------+-------+-------+---------+



In [17]:
# Left join: keep every employee and attach department columns where the
# dept_id values match.
df_leftjoined = df_employees.join(
    df_departments,
    on=df_employees["dept_id"] == df_departments["dept_id"],
    how="left",
)


In [18]:
# Display the left-join result. All four employees are kept; Raj (NULL
# dept_id) and Nina (dept_id 4, no such department) get NULL department columns.
df_leftjoined.show()

+------+--------+-------+-------+---------+
|emp_id|emp_name|dept_id|dept_id|dept_name|
+------+--------+-------+-------+---------+
|     3|     Raj|   NULL|   NULL|     NULL|
|     1|    John|      1|      1|       HR|
|     2|    Emma|      2|      2|     Tech|
|     4|    Nina|      4|   NULL|     NULL|
+------+--------+-------+-------+---------+



In [19]:
# Right join: keep every department and attach employee columns where the
# dept_id values match.
df_rightjoined = df_employees.join(
    df_departments,
    on=df_employees["dept_id"] == df_departments["dept_id"],
    how="right",
)


In [20]:
# Display the right-join result. All four departments are kept; Marketing and
# Temp have no matching employee, so their employee columns are NULL.
df_rightjoined.show()

+------+--------+-------+-------+---------+
|emp_id|emp_name|dept_id|dept_id|dept_name|
+------+--------+-------+-------+---------+
|  NULL|    NULL|   NULL|   NULL|     Temp|
|     1|    John|      1|      1|       HR|
|  NULL|    NULL|   NULL|      3|Marketing|
|     2|    Emma|      2|      2|     Tech|
+------+--------+-------+-------+---------+



In [21]:
# Raw rows and column names for the Users and Purchases DataFrames.
# Purchase user_id 5 has no matching user; users 3 and 4 have no purchases.
data_users = [
    (1, "Alice"),
    (2, "Bob"),
    (3, "Charlie"),
    (4, "David"),
]
data_purchases = [
    (1, "Book"),
    (2, "Pen"),
    (5, "Notebook"),
]

columns_users = ["id", "name"]
columns_purchases = ["user_id", "item"]

# Materialize the users and purchases datasets as DataFrames.
df_users = spark.createDataFrame(data=data_users, schema=columns_users)
df_purchases = spark.createDataFrame(data=data_purchases, schema=columns_purchases)

In [22]:
# Left semi join: keep only the users that have at least one purchase.
# Only columns from df_users are returned.
df_purchasers = df_users.join(
    df_purchases,
    on=df_users["id"] == df_purchases["user_id"],
    how="left_semi",
)


In [23]:
# Display the left-semi-join result: users 1 and 2, the only ids that also
# appear as a purchase user_id, with just the id and name columns.
df_purchasers.show()

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



In [24]:
# Left anti join: keep only the users that have NO matching purchase.
# NOTE: this reuses the name df_purchasers, which from here on holds the
# users without purchases.
df_purchasers = df_users.join(
    df_purchases,
    on=df_users["id"] == df_purchases["user_id"],
    how="left_anti",
)


In [25]:
# Display the left-anti-join result: users 3 and 4, whose ids never appear
# as a purchase user_id.
df_purchasers.show()


+---+-------+
| id|   name|
+---+-------+
|  3|Charlie|
|  4|  David|
+---+-------+



In [None]:
# Load the zip-code CSV (first row is the header) and register it as the
# temporary view "Zipcodes" so it can be queried with spark.sql().
#
# createOrReplaceTempView() returns None, so it must not be chained into the
# assignment: doing so stored None and also overwrote the earlier `df`
# DataFrame. The loaded DataFrame is kept under its own name instead.
df_zipcodes = spark.read.option("header", True).csv("Zipcodes.csv")
df_zipcodes.createOrReplaceTempView("Zipcodes")


In [None]:
# Preview the first 5 rows of the location columns from the Zipcodes view.
(
    spark.sql("SELECT country, city, zipcode, state FROM ZIPCODES")
    .show(5)
)

In [None]:
# Filter the view to rows whose state is 'AZ' and show up to 5 of them.
(
    spark.sql(""" SELECT  country, city, zipcode, state FROM ZIPCODES 
          WHERE state = 'AZ' """)
    .show(5)
)

In [None]:
# Keep rows for the states PR, AZ and FL, sort by state, and show up to 10 rows.
(
    spark.sql(""" SELECT  country, city, zipcode, state FROM ZIPCODES 
          WHERE state in ('PR','AZ','FL') order by state """)
    .show(10)
)

In [None]:
# Count the rows per state in the Zipcodes view.
(
    spark.sql(""" SELECT state, count(*) as count FROM ZIPCODES 
          GROUP BY state""")
    .show()
)


In [None]:
from pyspark.sql import SparkSession

# Obtain a session for the e-commerce analysis. getOrCreate() hands back an
# already-running session if one exists.
spark = (
    SparkSession.builder
    .appName('ECommerceAnalysis')
    .getOrCreate()
)

# Read the sample e-commerce CSV, treating the first row as the header and
# letting Spark infer column types, then display it.
df = spark.read.csv(
    "/FileStore/sample_ecommerce_data.csv",
    header=True,
    inferSchema=True,
)
df.show()