## Search and Filter Dataframes in PySpark

In [2]:
# Bring in PySpark and obtain the Spark session used by the rest of the notebook.
# getOrCreate() hands back an already-running session if one exists.

import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("SearchandFilter")
    .getOrCreate()
)

In [3]:
# Bare expression as the cell's last line, so the notebook displays the session object.
spark

In [4]:
# Read the FIFA 19 CSV into a Spark DataFrame.
# header=True takes column names from the first row; inferSchema=True lets Spark
# guess each column's type instead of reading everything as strings.
path = 'datasets-intro/'
fifa = spark.read.csv(
    path + 'fifa19.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Take five rows and convert them to pandas so the notebook renders them as a table.
(
    fifa
    .limit(5)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [7]:
# Print each column's name, type and nullability for the loaded DataFrame.
fifa.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Photo: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: integer (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: string (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- International Reputation: integer (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: string (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: integer (nullable = true)
 |-- Joined: string (nullable = true)
 |-- Loaned From: string (nu

In [8]:
# Import Spark's SQL column functions for searching and filtering (cells in this
# notebook use desc() from this module).
# NOTE(review): the wildcard import puts every public name from pyspark.sql.functions
# into the notebook namespace. That module exports names such as sum, min, max, abs
# and round, so the Python builtins of the same name are shadowed once this cell runs.
from pyspark.sql.functions import *

In [9]:
# Project only the Name and Position columns and print five rows.
(
    fifa
    .select('Name', 'Position')
    .show(5)
)

+-----------------+--------+
|             Name|Position|
+-----------------+--------+
|         L. Messi|      RF|
|Cristiano Ronaldo|      ST|
|        Neymar Jr|      LW|
|           De Gea|      GK|
|     K. De Bruyne|     RCM|
+-----------------+--------+
only showing top 5 rows



In [10]:
# Same two columns as the Name/Position selection, ordered by player name.
(
    fifa
    .select('Name', 'Position')
    .orderBy('Name')
    .show(5)
)

+-------------+--------+
|         Name|Position|
+-------------+--------+
|     A. Abang|      ST|
|A. Abdellaoui|      LB|
| A. Abdennour|      CB|
|      A. Abdi|      CM|
|A. Abdu Jaber|      ST|
+-------------+--------+
only showing top 5 rows



In [11]:
# Players whose club name begins with "FC", matched with a SQL LIKE pattern
# ('%' matches any trailing characters).
(
    fifa
    .select('Name', 'Club')
    .where(fifa.Club.like('FC%'))
    .show(5)
)

+---------------+-----------------+
|           Name|             Club|
+---------------+-----------------+
|       L. Messi|     FC Barcelona|
|      L. Suárez|     FC Barcelona|
| R. Lewandowski|FC Bayern München|
|  M. ter Stegen|     FC Barcelona|
|Sergio Busquets|     FC Barcelona|
+---------------+-----------------+
only showing top 5 rows



In [12]:
# Alternative to the LIKE pattern: Column.startswith() expresses the same
# "club name begins with FC" condition; the result is shown via pandas.
(
    fifa
    .select('Name', 'Club')
    .where(fifa.Club.startswith('FC'))
    .limit(5)
    .toPandas()
)

Unnamed: 0,Name,Club
0,L. Messi,FC Barcelona
1,L. Suárez,FC Barcelona
2,R. Lewandowski,FC Bayern München
3,M. ter Stegen,FC Barcelona
4,Sergio Busquets,FC Barcelona


In [13]:
# Find the oldest player: sort by Age in descending order and show only the top row.
(
    fifa
    .select('Name', 'Age')
    .sort(desc('Age'))
    .show(1)
)

+--------+---+
|    Name|Age|
+--------+---+
|O. Pérez| 45|
+--------+---+
only showing top 1 row



In [15]:
# Oldest-player lookup again, this time written with orderBy() rather than sort().
(
    fifa
    .select('Name', 'Age')
    .orderBy(desc('Age'))
    .show(1)
)

+--------+---+
|    Name|Age|
+--------+---+
|O. Pérez| 45|
+--------+---+
only showing top 1 row



In [16]:
# Keep only the rows whose Name is one of the listed players
# (indexing the DataFrame with a boolean Column filters its rows).
fifa[
    fifa.Name.isin('L. Messi', 'Cristiano Ronaldo')
].limit(4).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M


In [17]:
# Show each Release Clause next to its first character, which is the currency symbol.
# substr(1, 1) starts at position 1 and takes a single character.
fifa.select(
    'Release Clause',
    fifa['Release Clause'].substr(1, 1),
).show(5)

+--------------+-------------------------------+
|Release Clause|substring(Release Clause, 1, 1)|
+--------------+-------------------------------+
|       €226.5M|                              €|
|       €127.1M|                              €|
|       €228.1M|                              €|
|       €138.6M|                              €|
|       €196.4M|                              €|
+--------------+-------------------------------+
only showing top 5 rows



In [18]:
# Players older than 40, using a SQL expression string as the filter condition.
(
    fifa
    .filter('Age>40')
    .limit(4)
    .toPandas()
)

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,1120,156092,J. Villar,41,https://cdn.sofifa.org/players/4/19/156092.png,Paraguay,https://cdn.sofifa.org/flags/58.png,77,77,,...,55,13,13,14,75,75,74,78,77,
1,4228,3665,B. Nivet,41,https://cdn.sofifa.org/players/4/19/3665.png,France,https://cdn.sofifa.org/flags/18.png,71,71,ESTAC Troyes,...,82,58,56,43,11,7,8,14,7,
2,4741,140029,O. Pérez,45,https://cdn.sofifa.org/players/4/19/140029.png,Mexico,https://cdn.sofifa.org/flags/83.png,71,71,Pachuca,...,62,23,12,11,70,64,65,73,74,€272K
3,7225,142998,C. Muñoz,41,https://cdn.sofifa.org/players/4/19/142998.png,Argentina,https://cdn.sofifa.org/flags/52.png,68,68,CD Universidad de Concepción,...,62,18,14,19,67,65,68,71,68,€84K
