## Outline
 - Introduce PySpark's SQL functions library
 - Select method
 - Order By
 - Like Operator (for searching a string)
 - Substring Search
 - Is In Operator
 - Starts with, Ends with
 - Slicing
 - Filtering
 - Collecting Results as Objects

In [2]:
# findspark.init() is called before pyspark is imported so that the local
# Spark installation can be located first.
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Create the session for this notebook (or reuse one that already exists),
# then echo it as the cell output.
spark = (
    SparkSession.builder
    .appName("SearchandF")
    .getOrCreate()
)
spark

In [4]:
# Folder containing the input CSV (relative path).
path = "Datasets/"

# Load the FIFA 19 file: the first row supplies the column names and the
# column types are inferred from the data.
fifa = spark.read.csv(
    path + 'fifa19.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Peek at the first four rows, rendered as a pandas DataFrame.
(
    fifa
    .limit(4)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M


In [6]:
# Print the column names, inferred types and nullability of the loaded DataFrame.
fifa.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Photo: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: integer (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: string (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- International Reputation: integer (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: string (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: integer (nullable = true)
 |-- Joined: string (nullable = true)
 |-- Loaned From: string (nu

In [7]:
from pyspark.sql.functions import *

In [10]:
# select: project four columns and show the first 5 rows without truncating values.
fifa.select('Nationality', 'Name', 'Age', 'Photo').show(n=5, truncate=False)

+-----------+-----------------+---+----------------------------------------------+
|Nationality|Name             |Age|Photo                                         |
+-----------+-----------------+---+----------------------------------------------+
|Argentina  |L. Messi         |31 |https://cdn.sofifa.org/players/4/19/158023.png|
|Portugal   |Cristiano Ronaldo|33 |https://cdn.sofifa.org/players/4/19/20801.png |
|Brazil     |Neymar Jr        |26 |https://cdn.sofifa.org/players/4/19/190871.png|
|Spain      |De Gea           |27 |https://cdn.sofifa.org/players/4/19/193080.png|
|Belgium    |K. De Bruyne     |27 |https://cdn.sofifa.org/players/4/19/192985.png|
+-----------+-----------------+---+----------------------------------------------+
only showing top 5 rows



In [12]:
# orderBy: youngest players first (ascending is the default direction).
fifa.select('Name', 'Age').orderBy('Age', ascending=True).show(n=5)

+------------+---+
|        Name|Age|
+------------+---+
|   B. Nygren| 16|
|H. Andersson| 16|
|    A. Doğan| 16|
|  C. Bassett| 16|
|    B. Mumba| 16|
+------------+---+
only showing top 5 rows



In [13]:
# orderBy, descending: oldest players first.
fifa.select('Name', 'Age').orderBy('Age', ascending=False).show(n=5)

+-------------+---+
|         Name|Age|
+-------------+---+
|     O. Pérez| 45|
|K. Pilkington| 44|
|    T. Warner| 44|
|  S. Narazaki| 42|
|     C. Muñoz| 41|
+-------------+---+
only showing top 5 rows



In [14]:
# like: SQL-style pattern match; '%' matches any run of characters, so this
# keeps every club whose name contains "Barcelona".
fifa.select('Name', 'Club').filter(
    fifa['Club'].like('%Barcelona%')
).show(n=5, truncate=False)

+---------------+------------+
|Name           |Club        |
+---------------+------------+
|L. Messi       |FC Barcelona|
|L. Suárez      |FC Barcelona|
|M. ter Stegen  |FC Barcelona|
|Sergio Busquets|FC Barcelona|
|Coutinho       |FC Barcelona|
+---------------+------------+
only showing top 5 rows



In [15]:
# substr: a negative start position counts from the end of the string, so
# (-4, 4) extracts the last four characters (the file extension here).
fifa.select(
    "Photo",
    fifa['Photo'].substr(startPos=-4, length=4),
).show(n=5, truncate=False)

+----------------------------------------------+-----------------------+
|Photo                                         |substring(Photo, -4, 4)|
+----------------------------------------------+-----------------------+
|https://cdn.sofifa.org/players/4/19/158023.png|.png                   |
|https://cdn.sofifa.org/players/4/19/20801.png |.png                   |
|https://cdn.sofifa.org/players/4/19/190871.png|.png                   |
|https://cdn.sofifa.org/players/4/19/193080.png|.png                   |
|https://cdn.sofifa.org/players/4/19/192985.png|.png                   |
+----------------------------------------------+-----------------------+
only showing top 5 rows



In [16]:
# isin: keep rows whose Club is one of the listed values.
fifa.filter(
    fifa['Club'].isin(["FC Barcelona", "Juventus"])
).limit(4).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,7,176580,L. Suárez,31,https://cdn.sofifa.org/players/4/19/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,91,91,FC Barcelona,...,85,62,45,38,27,25,31,33,37,€164M
3,15,211110,P. Dybala,24,https://cdn.sofifa.org/players/4/19/211110.png,Argentina,https://cdn.sofifa.org/flags/52.png,89,94,Juventus,...,84,23,20,20,5,4,4,5,8,€153.5M


In [17]:
# startswith / endswith: names that begin with "L" and end with "i",
# expressed as a single combined predicate.
fifa.select("Name", "Club").filter(
    fifa.Name.startswith("L") & fifa.Name.endswith("i")
).show(5)

+-------------+---------------+
|         Name|           Club|
+-------------+---------------+
|     L. Messi|   FC Barcelona|
|   L. Bonucci|       Juventus|
| L. Fabiański|West Ham United|
|L. Pellegrini|           Roma|
| L. Pavoletti|       Cagliari|
+-------------+---------------+
only showing top 5 rows



In [18]:
# slicing: start from the total number of rows in the full DataFrame,
# for comparison with the row-limited slice taken next.
fifa.count()

18207

In [19]:
# Row slice: keep only 100 rows of the DataFrame, then confirm the size.
df = fifa.limit(num=100)
df.count()

100

In [20]:
# Column slice: take the first five column names and project just those.
col_list = fifa.columns[:5]
df3 = fifa.select(*col_list)
df3.show()

+---+------+-----------------+---+--------------------+
|_c0|    ID|             Name|Age|               Photo|
+---+------+-----------------+---+--------------------+
|  0|158023|         L. Messi| 31|https://cdn.sofif...|
|  1| 20801|Cristiano Ronaldo| 33|https://cdn.sofif...|
|  2|190871|        Neymar Jr| 26|https://cdn.sofif...|
|  3|193080|           De Gea| 27|https://cdn.sofif...|
|  4|192985|     K. De Bruyne| 27|https://cdn.sofif...|
|  5|183277|        E. Hazard| 27|https://cdn.sofif...|
|  6|177003|        L. Modrić| 32|https://cdn.sofif...|
|  7|176580|        L. Suárez| 31|https://cdn.sofif...|
|  8|155862|     Sergio Ramos| 32|https://cdn.sofif...|
|  9|200389|         J. Oblak| 25|https://cdn.sofif...|
| 10|188545|   R. Lewandowski| 29|https://cdn.sofif...|
| 11|182521|         T. Kroos| 28|https://cdn.sofif...|
| 12|182493|         D. Godín| 32|https://cdn.sofif...|
| 13|168542|      David Silva| 32|https://cdn.sofif...|
| 14|215914|         N. Kanté| 27|https://cdn.so

In [21]:
# Confirm that the column slice kept exactly the five selected columns.
len(df3.columns)

5

In [22]:
# Small demo DataFrame with a single array-typed column 'x'; each row is a
# one-element tuple holding a list.
df = spark.createDataFrame(
    data=[
        ([1, 2, 3],),
        ([4, 5],),
    ],
    schema=['x'],
)
df.show()

+---------+
|        x|
+---------+
|[1, 2, 3]|
|   [4, 5]|
+---------+



## Filtering Data

In [24]:
# filter with a SQL expression string: players rated above 50 overall.
(
    fifa
    .where("Overall > 50")
    .limit(5)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [25]:
# Players rated exactly 50 overall; keep only their name and age.
(
    fifa
    .where("Overall=50")
    .select('Name', 'Age')
    .limit(5)
    .toPandas()
)

Unnamed: 0,Name,Age
0,T. Jacobsen,20
1,J. Adebayo-Smith,17
2,Han Zilong,24
3,B. O'Gorman,16
4,S. Al Wehimid,20


In [27]:
# collect() -> returns the rows to the driver as a Python list of Row objects.
#
# Players rated above 50 are sorted best-first. Several players share the same
# Overall rating (e.g. the top two are both 94), and sorting on Overall alone
# leaves their relative order unspecified, so result[0] would not be
# reproducible. `_c0` (the row index column from the CSV) is added as a
# tie-breaker to keep the ordering deterministic. The sort runs before the
# projection so that `_c0` is still available as a sort key; the collected
# rows contain only Nationality, Name, Age and Overall, in that order.
result = (
    fifa.filter("Overall>50")
    .orderBy(fifa['Overall'].desc(), fifa['_c0'].asc())
    .select('Nationality', 'Name', 'Age', 'Overall')
    .collect()
)

In [29]:
# Inspect the type of one collected element: collect() yields Row objects.
type(result[0])

pyspark.sql.types.Row

In [31]:
# The first collected row is the highest-rated player; read the name by field
# name rather than by position.
print("Best Player Over 50: ", result[0]['Name'])

Best Player Over 50:  L. Messi
