In [1]:
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
# Run from the chapter folder so the relative "data" and "output" paths used
# further down resolve. The author's local folder remains the default; set the
# PYSPARK_CHAPTER_DIR environment variable to point at another copy instead of
# editing this cell.
os.chdir(os.environ.get(
    "PYSPARK_CHAPTER_DIR",
    "/Users/pepijnschouten/Desktop/Python_Scripts/Python_Scripts_Books"
    "/Distributed_ML_with_PySpark/Python_Own_Files/Chapter 1"))

"""
Creating dataframes
"""
# pandas: build the frame from a column-name -> values mapping
data = {
    "Country": ["United States", "Brazil", "Russia"],
    "River": ["Mississippi", "Amazon", "Volga"],
}
pandas_df = pd.DataFrame(data=data)
print(pandas_df)

# migrate the pandas frame to pyspark
spark = (
    SparkSession.builder
    .appName("BigRivers")
    .getOrCreate()
)
pyspark_df = spark.createDataFrame(pandas_df)
pyspark_df.show()

# from spark to pandas: build the Spark frame from row tuples plus a list of
# column names, then convert it with toPandas()
spark = SparkSession.builder.appName("BigRivers").getOrCreate()
data = [
    ("United States", "Mississippi"),
    ("Brazil", "Amazon"),
    ("Russia", "Volga"),
]
spark_df = spark.createDataFrame(data, schema=["Country", "River"])
pandas_df = spark_df.toPandas()
print(pandas_df)

"""
Loading data
"""
# the same CSV (relative to the working directory) is read by both libraries
data_path = os.path.join("data", "data.csv")

# pandas
pandas_df = pd.read_csv(data_path)
print(pandas_df)

# pyspark: treat the first line as the header and infer the column types
spark = SparkSession.builder.appName("BigRivers").getOrCreate()
pyspark_df = spark.read.csv(
    data_path,
    header=True,
    inferSchema=True,
)
pyspark_df.show()

"""
Selecting columns
"""
# pandas: one (president, year) record per row
pandas_df = pd.DataFrame(
    [
        ("George Washington", 1789),
        ("Abraham Lincoln", 1861),
        ("Franklin D. Roosevelt", 1933),
        ("John F. Kennedy", 1961),
        ("Barack Obama", 2009),
    ],
    columns=["President", "Year of Office"],
)

# two ways to pick columns by name: a list of labels, or filter(items=...)
print(pandas_df[["President", "Year of Office"]])
print(pandas_df.filter(items=["President", "Year of Office"]))

# pyspark: same records, column names passed as the schema
spark = SparkSession.builder.appName("Presidents").getOrCreate()
spark_df = spark.createDataFrame(
    [
        ("George Washington", 1789),
        ("Abraham Lincoln", 1861),
        ("Franklin D. Roosevelt", 1933),
        ("John F. Kennedy", 1961),
        ("Barack Obama", 2009),
    ],
    schema=["President", "Year of Office"],
)

# bracket indexing with a list of names and select() show the same columns
spark_df[["President", "Year of Office"]].show()
spark_df.select("President", "Year of Office").show()


"""
Aggregating data
"""
# pandas: one (movie, revenue) record per row, then the mean revenue per movie
pandas_df = pd.DataFrame(
    [
        ("Avengers", 90000000),
        ("Frozen", 70000000),
        ("Star Wars", 110000000),
        ("The Lion King", 80000000),
        ("Harry Potter", 100000000),
    ],
    columns=["movie", "revenue"],
)
print(pandas_df.groupby("movie").agg({"revenue": "mean"}))

# pyspark: same records; the dict maps a column name to the aggregate to apply
spark_df = spark.createDataFrame(
    [
        ("Avengers", 90000000),
        ("Frozen", 70000000),
        ("Star Wars", 110000000),
        ("The Lion King", 80000000),
        ("Harry Potter", 100000000),
    ],
    schema=["movie", "revenue"],
)
spark_df.groupby("movie").agg({"revenue": "mean"}).show()

"""
Filtering data
"""
# pandas: keep rows whose muscle_power exceeds 350, first with a boolean
# mask and then with a query string
pandas_df = pd.DataFrame(
    [
        ("Lion", 400),
        ("Elephant", 6000),
        ("Tiger", 350),
        ("Gorilla", 800),
    ],
    columns=["animal", "muscle_power"],
)
print(pandas_df[pandas_df["muscle_power"] > 350])
print(pandas_df.query("muscle_power > 350"))

# pyspark
spark = SparkSession.builder.appName("FilterByMusclePower").getOrCreate()
spark_df = spark.createDataFrame(
    [
        ("Lion", 400),
        ("Elephant", 6000),
        ("Tiger", 350),
        ("Gorilla", 800),
    ],
    schema=["animal", "muscle_power"],
)

# four spellings of the same "muscle_power > 350" filter
spark_df[spark_df["muscle_power"] > 350].show()     # bracket + Column condition
spark_df.filter("muscle_power > 350").show()        # SQL expression string
spark_df.where("muscle_power > 350").show()         # where() with the same string
spark_df.filter(F.col("muscle_power") > 350).show() # Column built with F.col


"""
Joining data
"""
# pandas: rivers and capitals share the "country" key; the inner merge keeps
# only the countries present in both frames
pandas_df = pd.DataFrame(
    [
        ("United States", "Mississippi"),
        ("Brazil", "Amazon"),
        ("Russia", "Volga"),
    ],
    columns=["country", "river"],
)
pandas_df2 = pd.DataFrame(
    [
        ("United States", "Washington"),
        ("Brazil", "Brasilia"),
        ("Netherlands", "Amsterdam"),
    ],
    columns=["country", "capital"],
)
print(pandas_df.merge(pandas_df2, on="country", how="inner"))

# pyspark: same two tables, inner-joined on the shared "country" column
spark_df = spark.createDataFrame(
    [
        ("United States", "Mississippi"),
        ("Brazil", "Amazon"),
        ("Russia", "Volga"),
    ],
    schema=["country", "river"],
)
spark_df2 = spark.createDataFrame(
    [
        ("United States", "Washington"),
        ("Brazil", "Brasilia"),
        ("Netherlands", "Amsterdam"),
    ],
    schema=["country", "capital"],
)
spark_df.join(spark_df2, on="country", how="inner").show()


"""
Saving data
"""
# pandas: to_csv does not create missing parent folders, so make sure the
# "output" folder exists before writing; index=False leaves the row index
# out of the file
os.makedirs("output", exist_ok=True)
pandas_df.to_csv(os.path.join("output", "pandas_output.csv"), index=False)

# pyspark: coalesce(1) first, then write CSV with a header row; mode
# "overwrite" replaces whatever is already at the target path
spark_df.coalesce(1).write.mode("overwrite").csv(
    os.path.join("output", "spark_output"), header=True)


         Country        River
0  United States  Mississippi
1         Brazil       Amazon
2         Russia        Volga


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/24 09:15:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+-------------+-----------+
|      Country|      River|
+-------------+-----------+
|United States|Mississippi|
|       Brazil|     Amazon|
|       Russia|      Volga|
+-------------+-----------+

         Country        River
0  United States  Mississippi
1         Brazil       Amazon
2         Russia        Volga
   Index        Country        River
0      0  United States  Mississippi
1      1         Brazil       Amazon
2      2         Russia        Volga
+-----+-------------+-----------+
|Index|      Country|      River|
+-----+-------------+-----------+
|    0|United States|Mississippi|
|    1|       Brazil|     Amazon|
|    2|       Russia|      Volga|
+-----+-------------+-----------+

               President  Year of Office
0      George Washington            1789
1        Abraham Lincoln            1861
2  Franklin D. Roosevelt            1933
3        John F. Kennedy            1961
4           Barack Obama            2009
               President  Year of Office
0      Ge

24/10/24 09:15:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+--------------------+--------------+
|           President|Year of Office|
+--------------------+--------------+
|   George Washington|          1789|
|     Abraham Lincoln|          1861|
|Franklin D. Roose...|          1933|
|     John F. Kennedy|          1961|
|        Barack Obama|          2009|
+--------------------+--------------+

+--------------------+--------------+
|           President|Year of Office|
+--------------------+--------------+
|   George Washington|          1789|
|     Abraham Lincoln|          1861|
|Franklin D. Roose...|          1933|
|     John F. Kennedy|          1961|
|        Barack Obama|          2009|
+--------------------+--------------+

                   revenue
movie                     
Avengers        90000000.0
Frozen          70000000.0
Harry Potter   100000000.0
Star Wars      110000000.0
The Lion King   80000000.0
+-------------+------------+
|        movie|avg(revenue)|
+-------------+------------+
|     Avengers|       9.0E7|
|       F

24/10/24 09:15:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+--------+------------+
|  animal|muscle_power|
+--------+------------+
|    Lion|         400|
|Elephant|        6000|
| Gorilla|         800|
+--------+------------+

+--------+------------+
|  animal|muscle_power|
+--------+------------+
|    Lion|         400|
|Elephant|        6000|
| Gorilla|         800|
+--------+------------+

+--------+------------+
|  animal|muscle_power|
+--------+------------+
|    Lion|         400|
|Elephant|        6000|
| Gorilla|         800|
+--------+------------+

+--------+------------+
|  animal|muscle_power|
+--------+------------+
|    Lion|         400|
|Elephant|        6000|
| Gorilla|         800|
+--------+------------+

         country        river     capital
0  United States  Mississippi  Washington
1         Brazil       Amazon    Brasilia
+-------------+-----------+----------+
|      country|      river|   capital|
+-------------+-----------+----------+
|       Brazil|     Amazon|  Brasilia|
|United States|Mississippi|Washington|
+--

24/10/24 11:19:05 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 966837 ms exceeds timeout 120000 ms
24/10/24 11:19:05 WARN SparkContext: Killing executors is not supported by current scheduler.
24/10/24 11:19:06 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$