# Search and Filter DataFrames in PySpark

In [8]:
%config Completer.use_jedi = False

import os
import sys
import shutil

# Resolve the directory two levels above the notebook's working directory and
# make it importable (the `from utils import ...` line that follows presumably
# relies on this — confirm where `utils` lives).
BASE_DIR = os.path.realpath(os.path.join(os.getcwd(), "..", ".."))

# Append only when absent so re-running the cell does not add duplicates.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)
    
from utils import extract_zip

# Relative directory containing the zipped datasets; it is joined with the
# archive file name when the data is extracted.
DATASETS_PATH = "datasets/"

In [9]:
# Display the resolved project root as a quick sanity check.
BASE_DIR

'/home/rodrigo/Workspace/_learning_/udemy/pyspark-essentials-for-data-scientists'

In [1]:
import pyspark

from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one
# under the application name "SearchFilter".
spark = (
    SparkSession.builder
    .appName("SearchFilter")
    .getOrCreate()
)
spark

In [13]:
# Pull the FIFA 19 CSV out of its zip archive. `extract_zip` is a project
# helper; presumably it returns the path of the extracted file, since the
# result is handed to the CSV reader — TODO confirm.
data_file = extract_zip(
    member="fifa19.csv",
    zip_file=os.path.join(DATASETS_PATH, "fifa19.csv.zip"),
)

In [14]:
# Load the CSV, treating the first line as the header and letting Spark
# infer each column's type.
fifa = spark.read.csv(
    data_file,
    header=True,
    inferSchema=True,
)

# Preview the first five rows as a pandas DataFrame for rich notebook display.
fifa.limit(5).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [15]:
# Print the column names, inferred types and nullability of the loaded data.
fifa.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Photo: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: integer (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: string (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- International Reputation: integer (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: string (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: integer (nullable = true)
 |-- Joined: string (nullable = true)
 |-- Loaned From: string (nu

In [16]:
# NOTE(review): this wildcard import places every public name of
# pyspark.sql.functions in the notebook namespace, shadowing Python builtins
# such as `slice`, `sum`, `min`, `max` and `round`. The `slice(...)` calls in
# later cells depend on it. Consider `from pyspark.sql import functions as F`
# and `F.slice(...)` instead.
from pyspark.sql.functions import *

In [18]:
# Project four columns and print the first five rows; show() truncates long
# strings by default, which is why the Photo URLs are cut off.
(
    fifa
    .select("Nationality", "Name", "Age", "Photo")
    .show(n=5)
)

+-----------+-----------------+---+--------------------+
|Nationality|             Name|Age|               Photo|
+-----------+-----------------+---+--------------------+
|  Argentina|         L. Messi| 31|https://cdn.sofif...|
|   Portugal|Cristiano Ronaldo| 33|https://cdn.sofif...|
|     Brazil|        Neymar Jr| 26|https://cdn.sofif...|
|      Spain|           De Gea| 27|https://cdn.sofif...|
|    Belgium|     K. De Bruyne| 27|https://cdn.sofif...|
+-----------+-----------------+---+--------------------+
only showing top 5 rows



In [19]:
# Same projection, but with truncation disabled so the full Photo URLs print.
(
    fifa
    .select("Nationality", "Name", "Age", "Photo")
    .show(n=5, truncate=False)
)

+-----------+-----------------+---+----------------------------------------------+
|Nationality|Name             |Age|Photo                                         |
+-----------+-----------------+---+----------------------------------------------+
|Argentina  |L. Messi         |31 |https://cdn.sofifa.org/players/4/19/158023.png|
|Portugal   |Cristiano Ronaldo|33 |https://cdn.sofifa.org/players/4/19/20801.png |
|Brazil     |Neymar Jr        |26 |https://cdn.sofifa.org/players/4/19/190871.png|
|Spain      |De Gea           |27 |https://cdn.sofifa.org/players/4/19/193080.png|
|Belgium    |K. De Bruyne     |27 |https://cdn.sofifa.org/players/4/19/192985.png|
+-----------+-----------------+---+----------------------------------------------+
only showing top 5 rows



In [21]:
# Five oldest players: sort by Age in descending order before showing.
(
    fifa
    .select(["Name", "Age"])
    .orderBy(fifa["Age"].desc())
    .show(n=5)
)

+-------------+---+
|         Name|Age|
+-------------+---+
|     O. Pérez| 45|
|K. Pilkington| 44|
|    T. Warner| 44|
|  S. Narazaki| 42|
|     C. Muñoz| 41|
+-------------+---+
only showing top 5 rows



In [23]:
# SQL LIKE pattern: the leading "%" matches any prefix, so this keeps clubs
# whose name ends with "Barcelona".
(
    fifa
    .select(["Name", "Club"])
    .where(fifa.Club.like("%Barcelona"))
    .show(n=5)
)

+---------------+------------+
|           Name|        Club|
+---------------+------------+
|       L. Messi|FC Barcelona|
|      L. Suárez|FC Barcelona|
|  M. ter Stegen|FC Barcelona|
|Sergio Busquets|FC Barcelona|
|       Coutinho|FC Barcelona|
+---------------+------------+
only showing top 5 rows



In [24]:
# substr(-4, 4): a negative start counts from the end of the string, so this
# yields the last four characters of each Photo URL (the file extension).
(
    fifa
    .select("Photo", fifa.Photo.substr(-4, 4))
    .show(n=5, truncate=False)
)

+----------------------------------------------+-----------------------+
|Photo                                         |substring(Photo, -4, 4)|
+----------------------------------------------+-----------------------+
|https://cdn.sofifa.org/players/4/19/158023.png|.png                   |
|https://cdn.sofifa.org/players/4/19/20801.png |.png                   |
|https://cdn.sofifa.org/players/4/19/190871.png|.png                   |
|https://cdn.sofifa.org/players/4/19/193080.png|.png                   |
|https://cdn.sofifa.org/players/4/19/192985.png|.png                   |
+----------------------------------------------+-----------------------+
only showing top 5 rows



In [25]:
# Keep only players whose Club is one of the listed values. Indexing a
# DataFrame with a boolean Column is shorthand for filter().
(
    fifa
    .filter(fifa.Club.isin("FC Barcelona", "Juventus"))
    .limit(5)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,7,176580,L. Suárez,31,https://cdn.sofifa.org/players/4/19/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,91,91,FC Barcelona,...,85,62,45,38,27,25,31,33,37,€164M
3,15,211110,P. Dybala,24,https://cdn.sofifa.org/players/4/19/211110.png,Argentina,https://cdn.sofifa.org/flags/52.png,89,94,Juventus,...,84,23,20,20,5,4,4,5,8,€153.5M
4,18,192448,M. ter Stegen,26,https://cdn.sofifa.org/players/4/19/192448.png,Germany,https://cdn.sofifa.org/flags/21.png,89,92,FC Barcelona,...,69,25,13,10,87,85,88,85,90,€123.3M


In [26]:
# Chained where() calls combine as a logical AND: names that start with "L"
# and end with "i".
(
    fifa
    .select("Name", "Club")
    .where(fifa.Name.startswith("L"))
    .where(fifa.Name.endswith("i"))
    .show(n=5)
)

+-------------+---------------+
|         Name|           Club|
+-------------+---------------+
|     L. Messi|   FC Barcelona|
|   L. Bonucci|       Juventus|
| L. Fabiański|West Ham United|
|L. Pellegrini|           Roma|
| L. Pavoletti|       Cagliari|
+-------------+---------------+
only showing top 5 rows



In [27]:
# Total number of rows in the dataset.
fifa.count()

18207

In [31]:
# limit() returns a new DataFrame capped at the given number of rows;
# count() confirms the cap.
df1 = fifa.limit(100)
df1.count()

100

In [30]:
# Select columns programmatically: take the first five column names and pass
# the list straight to select().
cols = fifa.columns[:5]
(
    fifa
    .select(cols)
    .show(n=5, truncate=False)
)

+---+------+-----------------+---+----------------------------------------------+
|_c0|ID    |Name             |Age|Photo                                         |
+---+------+-----------------+---+----------------------------------------------+
|0  |158023|L. Messi         |31 |https://cdn.sofifa.org/players/4/19/158023.png|
|1  |20801 |Cristiano Ronaldo|33 |https://cdn.sofifa.org/players/4/19/20801.png |
|2  |190871|Neymar Jr        |26 |https://cdn.sofifa.org/players/4/19/190871.png|
|3  |193080|De Gea           |27 |https://cdn.sofifa.org/players/4/19/193080.png|
|4  |192985|K. De Bruyne     |27 |https://cdn.sofifa.org/players/4/19/192985.png|
+---+------+-----------------+---+----------------------------------------------+
only showing top 5 rows



In [34]:
# Tiny DataFrame with a single array column "x": one row holds [1, 2, 3],
# the other [4, 5]. Each row is a one-element tuple.
df2 = spark.createDataFrame(
    [
        ([1, 2, 3],),
        ([4, 5],),
    ],
    ["x"],
)
df2.show()

+---------+
|        x|
+---------+
|[1, 2, 3]|
|   [4, 5]|
+---------+



In [37]:
# `slice` here is pyspark.sql.functions.slice (from the wildcard import), not
# the Python builtin. Its start index is 1-based: slice(x, 2, 2) starts at the
# second element and returns up to two elements.

df2.select(slice(df2.x, 2, 2)).show()

+--------------+
|slice(x, 2, 2)|
+--------------+
|        [2, 3]|
|           [5]|
+--------------+



In [38]:
# alias() replaces the auto-generated column header with a readable name.
df2.select(slice(df2.x, 2, 2).alias("Some Name")).show()

+---------+
|Some Name|
+---------+
|   [2, 3]|
|      [5]|
+---------+



In [40]:
# filter() also accepts a SQL expression string as its condition.
(
    fifa
    .filter("Overall > 50")
    .limit(5)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [42]:
# Filter on Overall before projecting down to Name/Age, so the predicate never
# refers to a column that select() has already dropped. The rows returned are
# the same; the query just no longer depends on Spark re-resolving a column
# that is missing from the projection.
(
    fifa
    .filter("Overall > 50")
    .select(["Name", "Age"])
    .limit(5)
    .toPandas()
)

Unnamed: 0,Name,Age
0,L. Messi,31
1,Cristiano Ronaldo,33
2,Neymar Jr,26
3,De Gea,27
4,K. De Bruyne,27


##### __NOTE:__

`where()` is an alias for `filter()`,
according to the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where).

In [47]:
# Collect every player rated above 50 into a Python list of Row objects,
# ordered from highest to lowest Overall.
result = (
    fifa
    .select(["Nationality", "Name", "Age", "Overall"])
    .where("Overall > 50")
    .orderBy(fifa["Overall"].desc())
    .collect()
)

In [50]:
# Each collected element is a pyspark Row.
type(result[0])

pyspark.sql.types.Row

In [52]:
# result is sorted by Overall descending, so the first Row is the top-rated
# player; the field is read by name rather than by position.
print(f"Best Player Over 50: {result[0]['Name']}")

Best Player Over 50: L. Messi


In [53]:
# result is sorted by Overall descending, so the last Row is the lowest-rated
# player that still passes the "Overall > 50" filter; label it accordingly.
print(f"Worst Player Over 50: {result[-1]['Name']}")

Best Player Over 50: C. Addai
