# Inspecting a DATAFRAME

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Basics").getOrCreate()
spark

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells below read 'original.csv' and 'challenge.csv' through relative
# paths, not through /content/drive — confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# read the CSV through the session's reader; header=True takes the column names
# from the first line of the file
mdf = spark.read.csv(path='original.csv', header=True)
mdf.show(n=1)

+---+----------+---------+------+---------+-------------------+---------+----------+----------+
| id|first_name|last_name|gender|     City|           JobTitle|   Salary|  Latitude| Longitude|
+---+----------+---------+------+---------+-------------------+---------+----------+----------+
|  1|   Melinde|Shilburne|Female|Nowa Ruda|Assistant Professor|$57438.18|50.5774075|16.4967184|
+---+----------+---------+------+---------+-------------------+---------+----------+----------+
only showing top 1 row



In [None]:
# list (column, type) pairs; the file was read without a schema, so every column is 'string'
mdf.dtypes

[('id', 'string'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('City', 'string'),
 ('JobTitle', 'string'),
 ('Salary', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'string')]

In [None]:
#changing the col datatypes
from pyspark.sql.types import *
from pyspark.sql.types import StringType, DateType, FloatType

In [None]:
# Changing the column datatypes with cast().
# cast() accepts SQL type names such as 'integer' / 'float' (or DataType instances such as
# FloatType()); the string 'FloatType' is not a valid type name.
# withColumn() returns a new DataFrame, so the result is kept under its own name.
mdf_typed = (
    mdf.withColumn('id', mdf['id'].cast('integer'))
       .withColumn('Latitude', mdf['Latitude'].cast('float'))
       .withColumn('Longitude', mdf['Longitude'].cast('float'))
)
mdf_typed.dtypes

In [None]:
from pyspark.sql.types import *

# Explicit schema for original.csv: integer id, text attributes (Salary stays text because
# of its '$' prefix), float coordinates. Field order follows the file's column order.
myschema = StructType(
    [StructField('id', IntegerType())]
    + [
        StructField(text_col, StringType())
        for text_col in ('first_name', 'last_name', 'gender', 'City', 'JobTitle', 'Salary')
    ]
    + [StructField(coord_col, FloatType()) for coord_col in ('Latitude', 'Longitude')]
)

# re-read the same file, this time applying the schema instead of all-string columns
odf = spark.read.csv('original.csv', schema=myschema, header=True)

In [None]:
# Check that the datatypes changed. Only the last expression of a cell is displayed,
# so the dtypes have to be printed explicitly when another statement follows.
print(odf.dtypes)
odf.show(10)

+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary| Latitude| Longitude|
+---+----------+----------+------+---------------+--------------------+---------+---------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.577408| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60| 48.82316| 103.52182|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52|39.994747|116.339775|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.504723| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|     null| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.426613|-6.1644998|
|  7|     Masha|    Divers|Female|         Dachun|                null|$2

In [None]:
# head(n) returns the first n rows as a Python list of Row objects
odf.head(3)

[Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', City='Nowa Ruda', JobTitle='Assistant Professor', Salary='$57438.18', Latitude=50.57740783691406, Longitude=16.49671745300293),
 Row(id=2, first_name='Kimberly', last_name='Von Welden', gender='Female', City='Bulgan', JobTitle='Programmer II', Salary='$62846.60', Latitude=48.823158264160156, Longitude=103.52182006835938),
 Row(id=3, first_name='Alvera', last_name='Di Boldi', gender='Female', City=None, JobTitle=None, Salary='$57576.52', Latitude=39.994747161865234, Longitude=116.33977508544922)]

In [None]:
odf.first() # returns only the first row, as a single Row object (not a list)

Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', City='Nowa Ruda', JobTitle='Assistant Professor', Salary='$57438.18', Latitude=50.57740783691406, Longitude=16.49671745300293)

In [None]:
# summary statistics (count, mean, stddev, min, ...) per column; mean/stddev are null for text columns
odf.describe().show()

+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|summary|               id|first_name|last_name|gender|               City|           JobTitle|   Salary|          Latitude|        Longitude|
+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+------------------+-----------------+
|  count|             1000|      1000|     1000|  1000|                999|                998|     1000|               999|             1000|
|   mean|            500.5|      null|     null|  null|               null|               null|     null| 25.43151724702484|43.33756460386515|
| stddev|288.8194360957494|      null|     null|  null|               null|               null|     null|24.579082550156635| 69.4206453674681|
|    min|                1|   Abagail|    Abbay|Female|             Abéché|Account Coordinator|$10101.92|         -54.28115|       -123.04196|

In [None]:
# column names as a plain Python list
odf.columns

['id',
 'first_name',
 'last_name',
 'gender',
 'City',
 'JobTitle',
 'Salary',
 'Latitude',
 'Longitude']

# Handling the Data and Null values

In [None]:
odf.count() # total number of rows in the DataFrame

1000

In [None]:
# remove every row that contains a null in any column
df_drped = odf.dropna()
df_drped.show()

In [None]:
# null handling restricted to one column: keep only rows whose JobTitle is present
dfnull = odf.where(odf['JobTitle'].isNotNull())
dfnull.show()

In [None]:
# Add a 'clean city' column: 'Unknown' where City is null, otherwise the original City value.
# `when` is imported explicitly: a star import of pyspark.sql.functions would shadow
# Python builtins such as sum / min / max.
from pyspark.sql.functions import when

dfhand = odf.withColumn(
    'clean city',
    when(odf.City.isNull(), 'Unknown').otherwise(odf.City),
)
dfhand.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+---------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|     clean city|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+---------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|      Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|         Bulgan|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|         Unkown|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|  Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|      Mytishchi|
|  6|     Maris|      Folk|Female|Kinsea

In [None]:
# show only the first 4 rows of the DataFrame that carries the 'clean city' column
dfhand.show(4)

In [None]:
# remove rows that are exact duplicates across all columns
# (drop_duplicates is the snake_case alias of dropDuplicates)
dfnodupli = odf.drop_duplicates()
dfnodupli.show()

+---+----------+-------------+------+--------------------+--------------------+---------+----------+----------+
| id|first_name|    last_name|gender|                City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+-------------+------+--------------------+--------------------+---------+----------+----------+
|372|     Lyman|      Burfitt|  Male|             Guiping|Community Outreac...|$28755.53| 23.394325| 110.07938|
|391|     Verge|     Hefferan|  Male|          Cocachacra|Community Outreac...|$90391.71|-17.091843| -71.77114|
|428|   Pernell|      Fossitt|  Male|           Wangchang|    Graphic Designer|$36927.53| 40.401047|  117.9989|
|526|  Garfield|    Benadette|  Male|              Shiren|   Marketing Manager|$56867.78| 28.651703| 117.90413|
|731|     Legra|        Manns|Female|               Nîmes|  Research Associate|$42246.87| 43.844727| 4.3520436|
|809|       Jed|       Shires|  Male|            Goubétto|     Design Engineer|$64130.79| 11.423197| 43.

# Selecting and Filtering Data



*   SELECT() -> used together with substr(), alias(), and a few more functions still to explore.
*   FILTER() -> used together with isin(), like(), between(), startswith(), endswith(), comparison operators (>, <, ==, etc.), and others still to explore.



In [None]:
# '*' selects every column, like SELECT * in SQL
odf.select('*').show()

In [None]:
# select() also accepts the column names as a single list
odf.select(['first_name', 'last_name']).show(n=5)

In [None]:
# substring: start at character 10 (1-based) and take up to 20 characters.
# Note the second argument is a LENGTH, not an end position as in a Python slice.
odf.select(odf['JobTitle'].substr(10, 20)).show(n=8)

+---------------------------+
|substring(JobTitle, 10, 20)|
+---------------------------+
|                  Professor|
|                       r II|
|                       null|
|        counting Analyst II|
|                           |
|                      ineer|
|                       null|
|          upport Technician|
+---------------------------+
only showing top 8 rows



In [None]:
# project the DataFrame down to the two name columns
name_cols = ['first_name', 'last_name']
dfselect = odf.select(*name_cols)
dfselect.show(n=4)

+----------+----------+
|first_name| last_name|
+----------+----------+
|   Melinde| Shilburne|
|  Kimberly|Von Welden|
|    Alvera|  Di Boldi|
|   Shannon| O'Griffin|
+----------+----------+
only showing top 4 rows



In [None]:
# renaming a column: 'first_name' becomes 'fn' in the returned DataFrame
df_renamed = odf.withColumnRenamed('first_name','fn')
df_renamed.show(1)

+---+-------+---------+------+---------+-------------------+---------+---------+---------+
| id|     fn|last_name|gender|     City|           JobTitle|   Salary| Latitude|Longitude|
+---+-------+---------+------+---------+-------------------+---------+---------+---------+
|  1|Melinde|Shilburne|Female|Nowa Ruda|Assistant Professor|$57438.18|50.577408|16.496717|
+---+-------+---------+------+---------+-------------------+---------+---------+---------+
only showing top 1 row



In [None]:
# the source DataFrame still has 'first_name': withColumnRenamed returned a new DataFrame
odf.show(1)

+---+----------+---------+------+---------+-------------------+---------+---------+---------+
| id|first_name|last_name|gender|     City|           JobTitle|   Salary| Latitude|Longitude|
+---+----------+---------+------+---------+-------------------+---------+---------+---------+
|  1|   Melinde|Shilburne|Female|Nowa Ruda|Assistant Professor|$57438.18|50.577408|16.496717|
+---+----------+---------+------+---------+-------------------+---------+---------+---------+
only showing top 1 row



In [None]:
# equality filter: rows whose first_name is exactly 'Alvera'
is_alvera = odf['first_name'] == 'Alvera'
df_filter = odf.where(is_alvera)
df_filter.show()

+---+----------+---------+------+----+--------+---------+---------+----------+
| id|first_name|last_name|gender|City|JobTitle|   Salary| Latitude| Longitude|
+---+----------+---------+------+----+--------+---------+---------+----------+
|  3|    Alvera| Di Boldi|Female|null|    null|$57576.52|39.994747|116.339775|
+---+----------+---------+------+----+--------+---------+---------+----------+



In [None]:
# SQL LIKE pattern: first_name contains the substring 'oh' anywhere
df_filter = odf.where(odf['first_name'].like("%oh%"))
df_filter.show()

+---+----------+---------+------+-------------+--------------------+---------+---------+----------+
| id|first_name|last_name|gender|         City|            JobTitle|   Salary| Latitude| Longitude|
+---+----------+---------+------+-------------+--------------------+---------+---------+----------+
|209|     Johan|Alishoner|  Male|        Banxi|            VP Sales|$95523.91| 41.48698| 123.68514|
|235|   Johanna|   Oiseau|Female|       Sanzao|     General Manager|$70049.47|22.008072| 113.37859|
|431| Johnathan| Spriggin|  Male|Dayr as Sūdān|Senior Financial ...|$78187.65| 32.03213|  35.14844|
|496|    Johann|    Foxon|  Male|       Pasian| Associate Professor|$99421.34|14.634726| 121.01122|
|749|  Mohammed|  Kasting|  Male|     Xiaozhai|Chief Design Engi...|$36990.61| 33.21927|112.528206|
+---+----------+---------+------+-------------+--------------------+---------+---------+----------+



In [None]:
# .endswith('something'): first_name must end with the given suffix
df_filter = odf.where(odf['first_name'].endswith('ala'))
df_filter.show()

+---+----------+---------+------+--------------------+----------------+---------+---------+----------+
| id|first_name|last_name|gender|                City|        JobTitle|   Salary| Latitude| Longitude|
+---+----------+---------+------+--------------------+----------------+---------+---------+----------+
|955|     Neala| Harcombe|Female|Figueira Castelo ...|Dental Hygienist|$27831.09|40.894623|-6.9635615|
+---+----------+---------+------+--------------------+----------------+---------+---------+----------+



In [None]:
# expose the DataFrame to Spark SQL under the view name 'odf'
odf.createOrReplaceTempView('odf')
# substring with a negative start counts from the end: last 20 characters of City
small_city_query = 'select substring(City,-20) as small_city from odf where City like "Figueira%" '
ci = spark.sql(small_city_query)
ci.show()

+--------------------+
|          small_city|
+--------------------+
|eira Castelo Rodrigo|
+--------------------+



In [None]:
# job titles that begin with 'Senior'
starts = odf.where(odf['JobTitle'].startswith('Senior'))
starts.show()

In [None]:
# job titles that end with "Engineer"
en_df = odf.where(odf['JobTitle'].endswith("Engineer"))
en_df.show(n=5)

In [None]:
# like() with a pattern that has no % or _ wildcard only matches the exact value 'Banxi'
ch = odf.where(odf['City'].like('Banxi'))
ch.show()

+---+----------+---------+------+-----+--------+---------+--------+---------+
| id|first_name|last_name|gender| City|JobTitle|   Salary|Latitude|Longitude|
+---+----------+---------+------+-----+--------+---------+--------+---------+
|209|     Johan|Alishoner|  Male|Banxi|VP Sales|$95523.91|41.48698|123.68514|
+---+----------+---------+------+-----+--------+---------+--------+---------+



In [None]:
# print the column names, types and nullability as a tree
odf.printSchema()

In [None]:
# between() is inclusive on both ends: ids 1 through 5
df_btw = odf.where(odf['id'].between(1, 5))
df_btw.select(['id', 'last_name']).show()

+---+----------+
| id| last_name|
+---+----------+
|  1| Shilburne|
|  2|Von Welden|
|  3|  Di Boldi|
|  4| O'Griffin|
|  5|   Macieja|
+---+----------+



In [None]:
# between() on a text column with identical lower and upper bound only matches that exact value.
# The comparison is case-sensitive and no first_name equals lowercase 'johana', so the result is empty.
df_btw2 = odf.filter((odf.first_name.between('johana','johana')))
df_btw2.show()

+---+----------+---------+------+----+--------+------+--------+---------+
| id|first_name|last_name|gender|City|JobTitle|Salary|Latitude|Longitude|
+---+----------+---------+------+----+--------+------+--------+---------+
+---+----------+---------+------+----+--------+------+--------+---------+



In [None]:
# isin(): keep the rows whose first_name is one of the listed values
wanted_first_names = ['Aldin', 'Velma', 'Alvera']
df_btw = odf.where(odf['first_name'].isin(wanted_first_names))
df_btw.show()

+---+----------+-------------+------+-----------+---------------+---------+----------+----------+
| id|first_name|    last_name|gender|       City|       JobTitle|   Salary|  Latitude| Longitude|
+---+----------+-------------+------+-----------+---------------+---------+----------+----------+
|  3|    Alvera|     Di Boldi|Female|       null|           null|$57576.52| 39.994747|116.339775|
|885|     Velma|  Shackleford|Female|Ambelókipoi|Data Coordiator|$16676.56| 37.758293| 20.872854|
|901|     Aldin|Matuszkiewicz|  Male|East London|       Operator|$41468.83|-32.954933| 27.931913|
+---+----------+-------------+------+-----------+---------------+---------+----------+----------+



In [None]:
# substr(start, length) with a 1-based start: first 6 characters of first_name,
# shown next to the full name under the alias 'short name'
df_sbstr = odf.select(
    odf['first_name'],
    odf['first_name'].substr(1, 6).alias('short name'),
)
df_sbstr.show(n=5)

In [None]:
# first two characters of the job title as an 'abbreviation' column
dfstr = odf.select(
    odf['JobTitle'],
    odf['JobTitle'].substr(1, 2).alias('abbreviation'),
    odf['first_name'],
)
dfstr.show()

In [None]:
# select() and two chained filter() calls: ids 10..20 whose job title starts with 'Senior'
dfc = (
    odf.select(
        odf['JobTitle'],
        odf['JobTitle'].substr(5, 10).alias('abbre'),
        odf['Salary'],
        odf['id'],
    )
    .filter(odf['id'].between(10, 20))
    .filter(odf['JobTitle'].like("Senior%"))
)
dfc.show()

+--------------------+----------+---------+---+
|            JobTitle|     abbre|   Salary| id|
+--------------------+----------+---------+---+
|Senior Financial ...|or Financi|$91925.08| 19|
+--------------------+----------+---------+---+



In [None]:
from pyspark.sql import Column
from pyspark.sql.functions import upper,sum, desc,col

# Applying Multiple Filters

**we can apply multiple FILTERS by using filter((func1) & (func2))**

In [None]:
# two conditions combined with &; each condition needs its own parentheses
dfmf = (
    odf.filter(
        (odf['first_name'].isin('Aldin', 'Thain')) & (odf['last_name'].like("%bb%"))
    )
    .select(odf['first_name'], odf['last_name'], odf['id'])
)
dfmf.show()

+----------+---------+---+
|first_name|last_name| id|
+----------+---------+---+
|     Thain|   Habbon| 18|
+----------+---------+---+



In [None]:
# select() first, then filter(): this works here because both filter columns are part of
# the projection. The opposite order (filter, then select) is also valid, as the cell above shows.
dfmf = (
    odf.select(odf['JobTitle'], odf['last_name'])
    .filter(
        (odf['JobTitle'].like("%Engineer%")) & (odf['last_name'].isin('Lockart', 'Habbon'))
    )
)
dfmf.show()

+--------------------+---------+
|            JobTitle|last_name|
+--------------------+---------+
|Nuclear Power Eng...|  Lockart|
|     Design Engineer|   Habbon|
+--------------------+---------+



In [None]:
# comparison operators: ids strictly between 10 and 20
id_above_10 = odf['id'] > 10
id_below_20 = odf['id'] < 20
df_filter = odf.where(id_above_10 & id_below_20)
df_filter.show()

9

In [None]:
# First 10 characters of first_name under the alias 'simple'.
# substr() positions are 1-based; a start of 0 is treated like 1, so 1 is used here
# to stay consistent with the other substr() calls in this notebook.
dfs = odf.select(
    odf.first_name.substr(1, 10).alias('simple'),
    odf.first_name,
    odf.last_name,
)
dfs.show()

+--------+----------+----------+
|  simple|first_name| last_name|
+--------+----------+----------+
| Melinde|   Melinde| Shilburne|
|Kimberly|  Kimberly|Von Welden|
|  Alvera|    Alvera|  Di Boldi|
| Shannon|   Shannon| O'Griffin|
|Sherwood|  Sherwood|   Macieja|
|   Maris|     Maris|      Folk|
|   Masha|     Masha|    Divers|
| Goddart|   Goddart|     Flear|
|    Roth|      Roth|O'Cannavan|
|    Bran|      Bran|   Trahear|
|  Kylynn|    Kylynn|   Lockart|
|     Rey|       Rey|    Meharg|
|    Kerr|      Kerr|    Braden|
|  Mickie|    Mickie| Whanstall|
|  Kaspar|    Kaspar|     Pally|
|  Norbie|    Norbie|    Gwyllt|
|  Claude|    Claude|    Briant|
|   Thain|     Thain|    Habbon|
|Tiffanie|  Tiffanie|  Pattison|
|  Ettore|    Ettore|  Gerriets|
+--------+----------+----------+
only showing top 20 rows



# *Running SQL on DF*


In [None]:
# Register the DataFrame as the temporary view 'ori' so it can be queried with spark.sql().
# createOrReplaceTempView replaces the deprecated registerTempTable and is safe to re-run.
odf.createOrReplaceTempView('ori')

In [None]:
# simplest possible query against the 'ori' view: first 10 rows
first_ten_query = 'select * from ori limit 10'
q = spark.sql(first_ten_query)
q.show()

In [None]:
# LIKE in SQL: first names containing 'ana'
ana_query = 'select * from ori where first_name like "%ana%"'
q = spark.sql(ana_query)
q.show()

+---+----------+----------+------+-----------+--------------------+---------+----------+---------+
| id|first_name| last_name|gender|       City|            JobTitle|   Salary|  Latitude|Longitude|
+---+----------+----------+------+-----------+--------------------+---------+----------+---------+
|110| Annadiana|    Keward|Female|     Jiashi|           Paralegal|$23456.67| 39.488182| 76.72372|
|118|      Jana|  Corinton|Female|   Hongmiao|Desktop Support T...|$40968.05| 31.933973|118.67034|
|266|   Stevana|    Tawton|Female|  Salvacion|            VP Sales|$90210.70|   12.6151|  125.039|
|454|    Shanan|  Baudasso|  Male|      Mörön|      Tax Accountant|$77940.08|  49.64289|100.17719|
|738|     Shana|Hanselmann|Female|Lodan Wetan|Senior Cost Accou...|$84370.64|-6.7967067|111.62135|
|754|    Janaya|    Oulner|Female|      Lobuk|  Nurse Practicioner|$12908.38| -8.650979|116.32494|
|792|   Morgana|       Kew|Female|     Duozhu|        Engineer III|$71416.86| 23.027798|114.95018|
+---+-----

In [None]:
# city, job title and gender of all male employees, ordered by id
male_query = 'select City,JobTitle, Gender from ori where Gender = "Male" order by id '
q = spark.sql(male_query)
q.show()

In [None]:
# concat() builds a full name; the LIKE pattern matches any job title containing 'ccountant'
accountant_query = 'select concat(first_name," ",last_name) as full_name, Gender,JobTitle from ori where JobTitle like "%ccountant%" order by id'
q2 = spark.sql(accountant_query)
q2.show()

In [None]:
# Scalar subquery: keep the rows whose Latitude equals the SUM of all latitudes.
# No row matches, so the result is empty.
# NOTE(review): presumably max(Latitude) or min(Latitude) was intended — confirm.
er = spark.sql('select * from ori where Latitude = (select sum(Latitude) from ori)')
er.show()

+---+----------+---------+------+----+--------+------+--------+---------+
| id|first_name|last_name|gender|City|JobTitle|Salary|Latitude|Longitude|
+---+----------+---------+------+----+--------+------+--------+---------+
+---+----------+---------+------+----+--------+------+--------+---------+



# Adding Calculated Columns

**withColumn('new_colname',col_func())**

In [None]:
from pyspark.sql.functions import *
# Strip the leading '$' from Salary (substr starts at character 2) and convert to a number.
# 'double' is used instead of 'float': a 32-bit float cannot represent the cents exactly
# (57438.18 is stored as 57438.1796875), which leaks into every derived value and aggregate.
odf = odf.withColumn('clean_salary', odf.Salary.substr(2, 100).cast('double'))
odf.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|clean_salary|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|    57438.18|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|     62846.6|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|    57576.52|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|    61489.23|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      null| 37.648994|    63863.09|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil En

In [None]:
# monthly salary of each employee as a new column: yearly clean_salary divided by 12
odf = odf.withColumn('monthly_sal', odf['clean_salary'] / 12)
odf.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+------------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|clean_salary|       monthly_sal|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+------------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18| 50.577408| 16.496717|    57438.18| 4786.514973958333|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|  48.82316| 103.52182|     62846.6|    5237.216796875|
|  3|    Alvera|  Di Boldi|Female|           null|                null|$57576.52| 39.994747|116.339775|    57576.52| 4798.043294270833|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23| 44.504723| 38.130016|    61489.23|   5124.1025390625|
|  5|  Sherwood|   Macieja|  Male|      Mytishch

In [None]:
# conditional column: 'Yes' when gender is 'Female', 'No' for everything else
odf = odf.withColumn(
    'are_they_female',
    when(odf['gender'] == 'Female', 'Yes').otherwise('No'),
)
odf.show()

In [None]:
# label assistant professors, then keep only the labelled rows via a LIKE on the new column
odf = odf.withColumn(
    'professor',
    when(odf['JobTitle'] == 'Assistant Professor', 'He is a Guru').otherwise("Not a gurroo"),
)
guru = odf.where(odf['professor'].like("%is a%"))
guru.show()

+---+------------+---------+------+------------+-------------------+---------+---------+----------+------------+------------------+---------------+------------+
| id|  first_name|last_name|gender|        City|           JobTitle|   Salary| Latitude| Longitude|clean_salary|       monthly_sal|are_they_female|   professor|
+---+------------+---------+------+------------+-------------------+---------+---------+----------+------------+------------------+---------------+------------+
|  1|     Melinde|Shilburne|Female|   Nowa Ruda|Assistant Professor|$57438.18|50.577408| 16.496717|    57438.18| 4786.514973958333|            Yes|He is a Guru|
| 39|      Valida| Salzberg|Female| Pangnirtung|Assistant Professor|$94224.48| 66.14511| -65.71252|    94224.48| 7852.039713541667|            Yes|He is a Guru|
|102|      Olivia| Tregidgo|Female|       Dahua|Assistant Professor|$28120.01|23.736458|107.998146|    28120.01|2343.3341471354165|            Yes|He is a Guru|
|130|         Bee| Lacrouts|Female

In [None]:
import pyspark.sql.functions as sqlfunc

**groupBy()** func

In [None]:
# total clean_salary per job title
by_job_title = odf.groupBy('JobTitle')
gfg = by_job_title.agg(sqlfunc.sum('clean_salary'))
gfg.show()

+--------------------+-----------------+
|            JobTitle|sum(clean_salary)|
+--------------------+-----------------+
|Systems Administr...|  264525.69921875|
|   Media Manager III|   140905.0703125|
|  Recruiting Manager| 367391.685546875|
|       Geologist III|  133739.40234375|
|        Geologist II|   86587.73046875|
|Database Administ...|    52018.4609375|
|   Financial Analyst|  629598.33203125|
|  Analyst Programmer|    374490.921875|
|Software Engineer II|     74782.640625|
|       Accountant IV|  165464.49609375|
|    Product Engineer|   622904.2734375|
|Software Test Eng...| 356046.427734375|
|Safety Technician...|  29421.529296875|
|    Junior Executive|   391575.3046875|
|Systems Administr...|      154118.4375|
|Human Resources A...| 162527.759765625|
|        VP Marketing| 364954.279296875|
|  Environmental Tech| 356207.224609375|
|Mechanical System...|         908308.5|
| Assistant Professor|   490887.2421875|
+--------------------+-----------------+
only showing top

In [None]:
# Reference link, kept as a comment so that this code cell no longer raises a SyntaxError:
# https://colab.research.google.com/drive/13leHpMye6h3pDZX0jTQ2KiNuSyjQjprF#scrollTo=gJ3qqrRDYWCd&line=2&uniqifier=1

In [None]:
# total, average, minimum and maximum salary per gender
salary_stats = [
    sqlfunc.sum('clean_salary').alias('total'),
    sqlfunc.avg('clean_salary').alias('average'),
    sqlfunc.min('clean_salary').alias('min'),
    sqlfunc.max('clean_salary').alias('max'),
]
gdf = odf.groupBy('gender').agg(*salary_stats)
gdf.show()

+------+--------------------+-----------------+--------+--------+
|gender|               total|          average|     min|     max|
+------+--------------------+-----------------+--------+--------+
|Female|2.7364519950195312E7|55618.94298820185|10616.44|99948.28|
|  Male|2.8123435678710938E7|55361.09385573019|10101.92|99942.92|
+------+--------------------+-----------------+--------+--------+



In [None]:
# same salary statistics, now grouped by the (gender, city) combination
by_gender_and_city = odf.groupBy('gender', 'city')
gdf = by_gender_and_city.agg(
    sqlfunc.sum('clean_salary').alias('total'),
    sqlfunc.avg('clean_salary').alias('average'),
    sqlfunc.min('clean_salary').alias('min'),
    sqlfunc.max('clean_salary').alias('max'),
)
gdf.show()

# Writing DataFrames to Files

In [None]:
# A DataFrame is written with df.write.<file_format>('output_path'), e.g. csv / json / parquet.
# The examples below sit inside a string literal, so they are NOT executed by this cell.
"""
gdf.write.csv('gdf.csv')
gdf.write.json('gdf.json')
gdf.write.parquet('gdf.parquet')"""

"\ngdf.write.csv('gdf.csv')\ngdf.write.json('gdf.json')\ngdf.write.parquet('gdf.parquet')"

In [None]:
# Write the per-job-title totals as JSON. The default save mode raises an error when the
# target path already exists; 'overwrite' makes the cell safe to re-run.
gfg.write.mode('overwrite').json('gfj.json')

In [None]:
# Kept as a comment so that this code cell no longer raises a SyntaxError.
# Presumably one of the files Spark created inside the 'gfj.json' output directory — confirm:
# /content/gfj.json/.part-00000-3bd9a76e-addf-4e37-8e81-1d4153884027-c000.json

# a CHALLENGE

In [None]:
# Schema for challenge.csv, defined BEFORE it is used so the cell works on a fresh kernel
# (Restart & Run All): text columns for ip / country / domain, integer byte count.
mschema = StructType([
    StructField('ip', StringType()),
    StructField('country', StringType()),
    StructField('domainname', StringType()),
    StructField('bytesused', IntegerType()),
])

# header=True skips the file's own header line; the names and types come from mschema
cdf = spark.read.csv('challenge.csv', header=True, schema=mschema)
cdf.show()

+---------------+--------------+-----------------+---------+
|             ip|       country|       domainname|bytesused|
+---------------+--------------+-----------------+---------+
|  52.81.192.172|         China| odnoklassniki.ru|      463|
| 119.239.207.13|         China|         youtu.be|       51|
|  68.69.217.210|         China|        adobe.com|       10|
|   7.191.21.223|      Bulgaria|     linkedin.com|      853|
|   211.13.10.68|     Indonesia|          hud.gov|       29|
|   239.80.21.97|      Suriname|       smh.com.au|      218|
|106.214.106.233|       Jamaica|    amazonaws.com|       95|
| 127.242.24.138|         China| surveymonkey.com|      123|
|     99.2.6.139|Czech Republic|     geocities.jp|      322|
|   237.54.11.63|         China|       amazon.com|       83|
| 252.141.157.25|         Japan|      cornell.edu|      374|
|185.220.128.248|       Belgium|       weebly.com|      389|
|   151.77.19.45|   Afghanistan|independent.co.uk|      282|
|  9.161.158.225|     In

In [None]:
# Print the schema of the challenge DataFrame.
# NOTE(review): the recorded output lists different column names (ip_address, Country, ...) with
# all-string types, which does not match mschema — it looks like it stems from an earlier
# read without a schema; re-run to refresh.
cdf.printSchema()

root
 |-- ip_address: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Domain Name: string (nullable = true)
 |-- Bytes_used: string (nullable = true)



In [None]:
# Schema for challenge.csv: three text columns and an integer byte counter.
# NOTE(review): `mschema` is referenced by the spark.read.csv('challenge.csv', ...) cell earlier
# in this section; on a fresh kernel a definition of mschema has to run before that read.
mschema=StructType([
  StructField('ip', StringType()),
 StructField('country', StringType()),
 StructField('domainname', StringType()),
 StructField('bytesused', IntegerType())])
# file loading, DataFrame and schema creation complete


In [None]:
# task 1: add a column that says whether the country is Mexico ('Yes') or not ('Nope')
is_mexico = cdf['country'] == 'Mexico'
cd1 = cdf.withColumn('mexico', when(is_mexico, 'Yes').otherwise('Nope'))
cd1.show()

In [None]:
# task 2: group by the new column and sum bytesused.
# The 'mexico' column only exists on cd1 (created in task 1); cdf does not have it,
# so the grouping has to start from cd1.
cd = cd1.groupBy('mexico').agg(sqlfunc.sum('bytesused').alias('byte_count'))
cd.show()

In [None]:
# task 3: group by country and use sqlfunc.countDistinct to count the distinct
# ip addresses seen in each country, largest count first
ips_per_country = sqlfunc.countDistinct('ip').alias('count ips')
cd2 = cdf.groupBy('country').agg(ips_per_country)
cd2.orderBy(col("count ips").desc()).show()

+--------------+---------+
|       country|count ips|
+--------------+---------+
|         China|      172|
|     Indonesia|      114|
|   Philippines|       65|
|        Russia|       56|
|        Brazil|       35|
|        Poland|       31|
|        Sweden|       28|
|         Japan|       25|
|Czech Republic|       23|
|      Portugal|       23|
|        France|       21|
|          Peru|       19|
|      Colombia|       17|
| United States|       15|
|     Argentina|       14|
|       Ukraine|       14|
|        Mexico|       13|
|      Thailand|       12|
|       Nigeria|       11|
|        Canada|       11|
+--------------+---------+
only showing top 20 rows

