In [59]:
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *

In [2]:
# Create (or reuse) the session for this notebook, with 4g configured for
# both the driver and the executors.
spark = (
    SparkSession.builder
    .appName("PySpark_Structured_API")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)


In [3]:
# Location of the fire-incidents CSV, relative to the working directory.
path = "./fire-incidents.csv"

In [4]:
# Load the CSV: the first row supplies column names and column types are
# inferred from the data.
df = spark.read.csv(
    path,
    header=True,
    inferSchema=True,
)

In [5]:
# Preview five rows of the first five columns without truncating cell text.
df.select(*df.columns[:5]).show(n=5, truncate=False)

+--------------+--------------+---------+------------------+-----------------------+
|IncidentNumber|ExposureNumber|ID       |Address           |IncidentDate           |
+--------------+--------------+---------+------------------+-----------------------+
|20104668      |0             |201046680|MARIPOSA STREET   |2020-09-11T00:00:00.000|
|20104708      |0             |201047080|355 27TH STREET   |2020-09-11T00:00:00.000|
|20104648      |0             |201046480|2048 POLK STREET  |2020-09-10T00:00:00.000|
|20104598      |0             |201045980|501 COLLEGE AVENUE|2020-09-10T00:00:00.000|
|20104575      |0             |201045750|289 9TH AVENUE    |2020-09-10T00:00:00.000|
+--------------+--------------+---------+------------------+-----------------------+
only showing top 5 rows



In [6]:
# Expose the DataFrame to Spark SQL under the name "TempView".
df.createOrReplaceTempView(name="TempView")

In [7]:
# Aggregate ExposureNumber per (ExposureNumber, IncidentNumber) pair.
# NOTE(review): ExposureNumber is both a grouping key and the summed column,
# so each group's SUM is just that key times the group's row count — confirm
# this is the intended aggregate.
df2 = spark.sql(
    "select IncidentNumber, sum(ExposureNumber) as SUM, ExposureNumber from TempView group by ExposureNumber, IncidentNumber"
)

In [8]:
# Display the first 25 aggregated rows.
df2.show(n=25)

+--------------+---+--------------+
|IncidentNumber|SUM|ExposureNumber|
+--------------+---+--------------+
|      20102931|0.0|             0|
|      20102077|0.0|             0|
|      20100748|0.0|             0|
|      20098068|0.0|             0|
|      20095954|0.0|             0|
|      20094633|0.0|             0|
|      20093944|0.0|             0|
|      20093817|0.0|             0|
|      20091571|0.0|             0|
|      20091656|0.0|             0|
|      20089559|0.0|             0|
|      20087340|0.0|             0|
|      20086284|0.0|             0|
|      20066138|0.0|             0|
|      20065892|0.0|             0|
|      20066047|0.0|             0|
|      20063273|0.0|             0|
|      20062905|0.0|             0|
|      20062468|0.0|             0|
|      20062304|0.0|             0|
|      20062035|0.0|             0|
|      20060331|0.0|             0|
|      20060197|0.0|             0|
|      20058625|0.0|             0|
|      20057532|0.0|        

In [9]:
from pyspark.sql.types import *

# Sample (Name, Id) rows. Some pairs repeat, which the ranking and
# dropDuplicates cells further down use to show ties.
data = [
    ('Ram', 1),
    ('Sai', 2),
    ('Karthik', 3),
    ('Ram', 1),
    ('Ram', 2),
    ('Sai', 2),
    ('Karthik', 2),
    ('Sai', 3),
]

# Explicit schema for the sample rows: a nullable string Name and a nullable
# integer Id.
dataTypes = StructType(
    [
        StructField(name="Name", dataType=StringType(), nullable=True),
        StructField(name="Id", dataType=IntegerType(), nullable=True),
    ]
)

In [10]:
# Build a DataFrame from the in-memory rows using the explicit schema.
sparkDf = spark.createDataFrame(data=data, schema=dataTypes)

In [11]:
# Print the column names, types and nullability of the sample DataFrame.
sparkDf.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Id: integer (nullable = true)



In [12]:
# Display the sample rows.
sparkDf.show()

+-------+---+
|   Name| Id|
+-------+---+
|    Ram|  1|
|    Sai|  2|
|Karthik|  3|
|    Ram|  1|
|    Ram|  2|
|    Sai|  2|
|Karthik|  2|
|    Sai|  3|
+-------+---+



In [13]:
# Expose the sample DataFrame to Spark SQL under the name "TemporaryView".
sparkDf.createOrReplaceTempView(name="TemporaryView")

In [14]:
# Rank every row by (Name, Id) over a single, unpartitioned window; tied rows
# share a rank and the following rank is skipped.
(
    spark
    .sql("select Name, Id, Rank() over(order by Name, Id) as RNK from TemporaryView")
    .show()
)

+-------+---+---+
|   Name| Id|RNK|
+-------+---+---+
|Karthik|  2|  1|
|Karthik|  3|  2|
|    Ram|  1|  3|
|    Ram|  1|  3|
|    Ram|  2|  5|
|    Sai|  2|  6|
|    Sai|  2|  6|
|    Sai|  3|  8|
+-------+---+---+



In [15]:
# Compare ROW_NUMBER, RANK and DENSE_RANK over the same (Name, Id) ordering.
# The SQL is assembled from adjacent string literals, which Python
# concatenates; each fragment carries its own trailing space so tokens stay
# separated. A backslash continuation placed inside a single literal would
# instead glue the neighbouring tokens together (e.g. "Rank()over(").
rank_functions_df = spark.sql(
    "select Name, Id, "
    "ROW_NUMBER() OVER(order by Name, Id) as RN, "
    "Rank() over(order by Name, Id) as RNK, "
    "DENSE_RANK() OVER(order by Name, Id) as DN "
    "from TemporaryView"
)

In [16]:
# Display the three ranking columns side by side.
rank_functions_df.show()

+-------+---+---+---+---+
|   Name| Id| RN|RNK| DN|
+-------+---+---+---+---+
|Karthik|  2|  1|  1|  1|
|Karthik|  3|  2|  2|  2|
|    Ram|  1|  3|  3|  3|
|    Ram|  1|  4|  3|  3|
|    Ram|  2|  5|  5|  4|
|    Sai|  2|  6|  6|  5|
|    Sai|  2|  7|  6|  5|
|    Sai|  3|  8|  8|  6|
+-------+---+---+---+---+



In [17]:
# Keep one row per (Name, Id) pair and display the result.
# NOTE(review): which of the duplicate rows survives (and hence its RN value)
# is presumably not guaranteed — confirm before relying on it.
rank_functions_df.dropDuplicates(subset=["Name", "Id"]).show()

+-------+---+---+---+---+
|   Name| Id| RN|RNK| DN|
+-------+---+---+---+---+
|Karthik|  2|  1|  1|  1|
|Karthik|  3|  2|  2|  2|
|    Ram|  1|  3|  3|  3|
|    Ram|  2|  5|  5|  4|
|    Sai|  2|  6|  6|  5|
|    Sai|  3|  8|  8|  6|
+-------+---+---+---+---+



In [18]:
from pyspark.sql.functions import expr

# Search and Filter DataFrames HW

In [19]:
from pyspark.sql.functions import *
from pyspark.sql import *

In [24]:
# Raw string literal: the Windows path is full of backslashes, and sequences
# such as "\L" and "\P" are invalid escapes in a normal literal (reported as a
# warning by recent Python versions). The resulting value is unchanged.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory.
filePath = r"C:\Learning\Python_Projects\PySpark\fifa19.csv"

In [25]:
# Load the FIFA 19 CSV: the first row supplies column names and column types
# are inferred from the data.
dataFrame = spark.read.csv(
    filePath,
    header=True,
    inferSchema=True,
)

In [26]:
# Print the inferred column names, types and nullability.
dataFrame.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Photo: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: integer (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: string (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- International Reputation: integer (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: string (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: integer (nullable = true)
 |-- Joined: string (nullable = true)
 |-- Loaned From: string (nu

In [27]:
# Display five rows with every column shown in full (no truncation).
dataFrame.show(n=5, truncate=False)

+---+------+-----------------+---+----------------------------------------------+-----------+-----------------------------------+-------+---------+-------------------+--------------------------------------------+-------+-----+-------+--------------+------------------------+---------+-----------+--------------+----------+---------+--------+-------------+------------+-----------+--------------------+------+------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+--------+---------+---------------+------------+-------+---------+-----+----------+-----------+-----------+------------+-----------+-------+---------+-------+---------+-------+-------+--------+---------+----------+-------------+-----------+------+---------+---------+-------+--------------+-------------+--------+----------+---------+-------------+----------+--------------+
|_c0|ID    |Name             |Age|Photo                                         

In [30]:
# Pull only five rows to the driver and render them as a pandas DataFrame.
(
    dataFrame
    .limit(num=5)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [35]:
# Sort by Name (sort() defaults to ascending) and show the first five rows in full.
(
    dataFrame
    .select("Name", "Position")
    .sort("Name")
    .show(n=5, truncate=False)
)

+-------------+--------+
|Name         |Position|
+-------------+--------+
|A. Abang     |ST      |
|A. Abdellaoui|LB      |
|A. Abdennour |CB      |
|A. Abdi      |CM      |
|A. Abdu Jaber|ST      |
+-------------+--------+
only showing top 5 rows



In [38]:
# Names in descending order via orderBy(); show the top five rows in full.
(
    dataFrame
    .select("Name", "Position")
    .orderBy(dataFrame["Name"].desc())
    .show(n=5, truncate=False)
)

+--------------+--------+
|Name          |Position|
+--------------+--------+
|Óscar Whalley |GK      |
|Óscar Valentín|CDM     |
|Óscar Plano   |LM      |
|Óscar Pinchi  |LM      |
|Óscar Gil     |RB      |
+--------------+--------+
only showing top 5 rows



In [39]:
# Names in descending order via sort() with a column expression; show the top
# five rows in full.
(
    dataFrame
    .select("Name", "Position")
    .sort(dataFrame["Name"].desc())
    .show(n=5, truncate=False)
)

+--------------+--------+
|Name          |Position|
+--------------+--------+
|Óscar Whalley |GK      |
|Óscar Valentín|CDM     |
|Óscar Plano   |LM      |
|Óscar Pinchi  |LM      |
|Óscar Gil     |RB      |
+--------------+--------+
only showing top 5 rows



In [40]:
# Names in explicit ascending order via orderBy(); show the first five rows in full.
(
    dataFrame
    .select("Name", "Position")
    .orderBy(dataFrame["Name"].asc())
    .show(n=5, truncate=False)
)

+-------------+--------+
|Name         |Position|
+-------------+--------+
|A. Abang     |ST      |
|A. Abdellaoui|LB      |
|A. Abdennour |CB      |
|A. Abdi      |CM      |
|A. Abdu Jaber|ST      |
+-------------+--------+
only showing top 5 rows



In [41]:
# Players whose Club value begins with the prefix "FC"; show five rows in full.
(
    dataFrame
    .select("Name", "Club")
    .where(dataFrame["Club"].startswith("FC"))
    .show(n=5, truncate=False)
)

+---------------+-----------------+
|Name           |Club             |
+---------------+-----------------+
|L. Messi       |FC Barcelona     |
|L. Suárez      |FC Barcelona     |
|R. Lewandowski |FC Bayern München|
|M. ter Stegen  |FC Barcelona     |
|Sergio Busquets|FC Barcelona     |
+---------------+-----------------+
only showing top 5 rows



In [43]:
# Oldest player: sort by Age descending and show only the first row.
(
    dataFrame
    .select("Name", "Age")
    .sort(dataFrame["Age"].desc())
    .show(n=1, truncate=False)
)

+--------+---+
|Name    |Age|
+--------+---+
|O. Pérez|45 |
+--------+---+
only showing top 1 row



In [46]:
# Keep only rows whose Name exactly matches one of the listed players.
(
    dataFrame
    .select("Name")
    .where(dataFrame["Name"].isin("L. Messi", "Cristiano Ronaldo"))
    .show(n=5, truncate=False)
)

+-----------------+
|Name             |
+-----------------+
|L. Messi         |
|Cristiano Ronaldo|
+-----------------+



In [52]:
# Extract the first character of "Release Clause" (the currency symbol).
# Spark's substring() is documented as 1-based, so the first character is at
# pos=1; passing 0 relies on behaviour outside the documented contract.
dataFrame.select("Name", substring("Release Clause", 1, 1)).show(5, False)

+-----------------+-------------------------------+
|Name             |substring(Release Clause, 0, 1)|
+-----------------+-------------------------------+
|L. Messi         |€                              |
|Cristiano Ronaldo|€                              |
|Neymar Jr        |€                              |
|De Gea           |€                              |
|K. De Bruyne     |€                              |
+-----------------+-------------------------------+
only showing top 5 rows



In [58]:
# Players strictly older than 40, shown without truncation.
(
    dataFrame
    .select("Name", "Age")
    .where(dataFrame["Age"] > 40)
    .show(truncate=False)
)

+-------------+---+
|Name         |Age|
+-------------+---+
|J. Villar    |41 |
|B. Nivet     |41 |
|O. Pérez     |45 |
|C. Muñoz     |41 |
|S. Narazaki  |42 |
|H. Sulaimani |41 |
|M. Tyler     |41 |
|T. Warner    |44 |
|K. Pilkington|44 |
+-------------+---+

