In [18]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


In [2]:
spark = SparkSession.builder.master("local[*]").appName("Dataframe - withColumn").getOrCreate()

In [3]:
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f3198886c10>


In [4]:
spark

In [5]:
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("delimiter", ",")
    .option("infeSchema", "true")
    .load("csv/AnnualTicketSales.csv")
     )

In [6]:
df.show()

+----+--------------+----------------+-----------------------------------+--------------------+----+
|YEAR|  TICKETS SOLD|TOTAL BOX OFFICE|TOTAL INFLATION ADJUSTED BOX OFFICE|AVERAGE TICKET PRICE| _c5|
+----+--------------+----------------+-----------------------------------+--------------------+----+
|2021|  42,37,74,881|  $3,881,777,912|                     $3,881,777,912|               $9.16|null|
|2020|  22,36,38,958|  $2,048,534,616|                     $2,048,534,616|               $9.16|null|
|2019|1,22,85,41,629| $11,253,443,955|                    $11,253,444,050|               $9.16|null|
|2018|1,31,15,36,128| $11,948,096,650|                    $12,013,670,952|               $9.11|null|
|2017|1,22,56,39,761| $10,993,991,460|                    $11,226,860,216|               $8.97|null|
|2016|1,30,25,56,378| $11,267,115,924|                    $11,931,416,424|               $8.65|null|
|2015|1,32,33,56,776| $11,155,900,636|                    $12,121,948,075|               $8

22/01/07 19:16:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: YEAR, TICKETS SOLD, TOTAL BOX OFFICE, TOTAL INFLATION ADJUSTED BOX OFFICE, AVERAGE TICKET PRICE, 
 Schema: YEAR, TICKETS SOLD, TOTAL BOX OFFICE, TOTAL INFLATION ADJUSTED BOX OFFICE, AVERAGE TICKET PRICE, _c5
Expected: _c5 but found: 
CSV file: file:///home/ricardo/Documentos/Spark/csv/AnnualTicketSales.csv


In [7]:
df = df.drop("_c5")

In [8]:
df.show()

+----+--------------+----------------+-----------------------------------+--------------------+
|YEAR|  TICKETS SOLD|TOTAL BOX OFFICE|TOTAL INFLATION ADJUSTED BOX OFFICE|AVERAGE TICKET PRICE|
+----+--------------+----------------+-----------------------------------+--------------------+
|2021|  42,37,74,881|  $3,881,777,912|                     $3,881,777,912|               $9.16|
|2020|  22,36,38,958|  $2,048,534,616|                     $2,048,534,616|               $9.16|
|2019|1,22,85,41,629| $11,253,443,955|                    $11,253,444,050|               $9.16|
|2018|1,31,15,36,128| $11,948,096,650|                    $12,013,670,952|               $9.11|
|2017|1,22,56,39,761| $10,993,991,460|                    $11,226,860,216|               $8.97|
|2016|1,30,25,56,378| $11,267,115,924|                    $11,931,416,424|               $8.65|
|2015|1,32,33,56,776| $11,155,900,636|                    $12,121,948,075|               $8.43|
|2014|1,25,74,02,920| $10,272,985,008|  

In [9]:
df = df.withColumn("TICKETS SOLD", F.regexp_replace("TICKETS SOLD", "\,", ""))

In [10]:
df = df.withColumn("TOTAL BOX OFFICE", F.regexp_replace("TOTAL BOX OFFICE", "\$", ""))

In [11]:
df = df.withColumn("TOTAL BOX OFFICE", F.regexp_replace("TOTAL BOX OFFICE", "\,", ""))

In [12]:
df = df.withColumn("TOTAL INFLATION ADJUSTED BOX OFFICE", F.regexp_replace("TOTAL INFLATION ADJUSTED BOX OFFICE", "\$", ""))

In [13]:
df = df.withColumn("TOTAL INFLATION ADJUSTED BOX OFFICE", F.regexp_replace("TOTAL INFLATION ADJUSTED BOX OFFICE", "\,", ""))

In [14]:
df = df.withColumn("AVERAGE TICKET PRICE", F.regexp_replace("AVERAGE TICKET PRICE", "\$", ""))

In [15]:
df = df.select(df["YEAR"].alias("Ano"),
            df["TICKETS SOLD"].cast("float").alias("Tickets_Vendidos"),
            df["TOTAL BOX OFFICE"].cast("float").alias("Total_deCaixa"),
            df["TOTAL INFLATION ADJUSTED BOX OFFICE"].cast("float").alias("Total_deCaixa_Ajust_Inflacao"),
            df["AVERAGE TICKET PRICE"].cast("float").alias("Preco_Medio_Ticket") )
    

In [20]:
df.printSchema()

root
 |-- Ano: string (nullable = true)
 |-- Tickets_Vendidos: float (nullable = true)
 |-- Total_deCaixa: float (nullable = true)
 |-- Total_deCaixa_Ajust_Inflacao: float (nullable = true)
 |-- Preco_Medio_Ticket: float (nullable = true)



In [93]:
df.select(F.col("Total_deCaixa"),F.col("Total_deCaixa_Ajust_Inflacao")).filter(F.col("Total_deCaixa") > "1.E8").show(10)

+-------------+----------------------------+
|Total_deCaixa|Total_deCaixa_Ajust_Inflacao|
+-------------+----------------------------+
| 3.88177792E9|                3.88177792E9|
| 2.04853466E9|                2.04853466E9|
|1.12534436E10|               1.12534436E10|
|1.19480965E10|               1.20136714E10|
|1.09939917E10|               1.12268605E10|
| 1.1267116E10|               1.19314166E10|
|1.11559004E10|               1.21219482E10|
|1.02729851E10|               1.15178107E10|
|1.08874465E10|               1.22667878E10|
|1.09921413E10|               1.26492447E10|
+-------------+----------------------------+
only showing top 10 rows



In [104]:
df2 = df.withColumn("diferenca_inflacao", (F.col("Total_deCaixa_Ajust_Inflacao")) - (F.col("Total_deCaixa")))

In [105]:
df2.show()

+----+----------------+-------------+----------------------------+------------------+------------------+
| Ano|Tickets_Vendidos|Total_deCaixa|Total_deCaixa_Ajust_Inflacao|Preco_Medio_Ticket|diferenca_inflacao|
+----+----------------+-------------+----------------------------+------------------+------------------+
|2021|     4.2377488E8| 3.88177792E9|                3.88177792E9|              9.16|               0.0|
|2020|     2.2363896E8| 2.04853466E9|                2.04853466E9|              9.16|               0.0|
|2019|    1.22854157E9|1.12534436E10|               1.12534436E10|              9.16|               0.0|
|2018|    1.31153613E9|1.19480965E10|               1.20136714E10|              9.11|       6.5574912E7|
|2017|    1.22563981E9|1.09939917E10|               1.12268605E10|              8.97|      2.32868864E8|
|2016|    1.30255642E9| 1.1267116E10|               1.19314166E10|              8.65|       6.6430054E8|
|2015|     1.3233568E9|1.11559004E10|               1.2