# Dataframe tutorial - Basic
In this tutorial we will cover:
- PySpark DataFrames
- Reading data into a DataFrame
- Checking the schema and data types
- Adding, renaming, and dropping columns
- Filtering rows
- Dropping and filling missing values

In [144]:
from pyspark.sql import SparkSession

# Build a SparkSession named 'dataframe' (getOrCreate returns an existing session if there is one).
spark = SparkSession.builder.appName('dataframe').getOrCreate()

# Bare last expression so the notebook displays the session object.
spark

In [31]:
# Load the "::"-delimited movies file: the first line supplies the column
# names and the column types are inferred from the data.
movies_df = spark.read.csv(
    "../sample_data/movie-lense/movies.dat",
    sep="::",
    header=True,
    inferSchema=True,
)

In [26]:
# Print the dataframe as a text table (long cell values are truncated with "...").
movies_df.show()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|        Comedy|Drama|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|Adventure|Children's|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Animation|Children's|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|      Drama|Thriller|
|     17|Sen

In [33]:
# head(3) returns the first 3 rows as a Python list of Row objects.
movies_df.head(3)

[Row(MovieID=1, Title='Toy Story (1995)', Genres="Animation|Children's|Comedy"),
 Row(MovieID=2, Title='Jumanji (1995)', Genres="Adventure|Children's|Fantasy"),
 Row(MovieID=3, Title='Grumpier Old Men (1995)', Genres='Comedy|Romance')]

In [27]:
# Print each column's name, data type and nullability as a tree.
movies_df.printSchema()

root
 |-- MovieID: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Genres: string (nullable = true)



In [28]:
# Number of rows in the dataframe.
movies_df.count()

3883

In [32]:
# Confirm the Python type of the object returned by spark.read.
type(movies_df)

pyspark.sql.dataframe.DataFrame

In [38]:
# Project only the ID and title columns, then print the resulting rows.
movies_df.select('MovieID', 'Title').show()

+-------+--------------------+
|MovieID|               Title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
|      6|         Heat (1995)|
|      7|      Sabrina (1995)|
|      8| Tom and Huck (1995)|
|      9| Sudden Death (1995)|
|     10|    GoldenEye (1995)|
|     11|American Presiden...|
|     12|Dracula: Dead and...|
|     13|        Balto (1995)|
|     14|        Nixon (1995)|
|     15|Cutthroat Island ...|
|     16|       Casino (1995)|
|     17|Sense and Sensibi...|
|     18|   Four Rooms (1995)|
|     19|Ace Ventura: When...|
|     20|  Money Train (1995)|
+-------+--------------------+
only showing top 20 rows



In [39]:
# (column name, type name) pairs as a plain Python list.
movies_df.dtypes

[('MovieID', 'int'), ('Title', 'string'), ('Genres', 'string')]

In [40]:
# Summary statistics per column (count, mean, stddev, min, max);
# mean and stddev come back as NULL for the string columns.
movies_df.describe().show()

+-------+------------------+--------------------+-------+
|summary|           MovieID|               Title| Genres|
+-------+------------------+--------------------+-------+
|  count|              3883|                3883|   3883|
|   mean|1986.0494463044038|                NULL|   NULL|
| stddev|1146.7783494728876|                NULL|   NULL|
|    min|                 1|$1,000,000 Duck (...| Action|
|    max|              3952|     eXistenZ (1999)|Western|
+-------+------------------+--------------------+-------+



In [80]:
# adding new derived column: substr(-5, 4) takes the 4 characters that start
# 5 positions from the end of Title, i.e. the year inside a trailing "(YYYY)"
# NOTE(review): assumes every title ends with "(YYYY)" -- confirm for the full file
movies_df = movies_df.withColumn('Release Year', movies_df.Title.substr(-5,4))
movies_df.show()

+-------+--------------------+--------------------+------------+
|MovieID|               Title|              Genres|Release Year|
+-------+--------------------+--------------------+------------+
|      1|    Toy Story (1995)|Animation|Childre...|        1995|
|      2|      Jumanji (1995)|Adventure|Childre...|        1995|
|      3|Grumpier Old Men ...|      Comedy|Romance|        1995|
|      4|Waiting to Exhale...|        Comedy|Drama|        1995|
|      5|Father of the Bri...|              Comedy|        1995|
|      6|         Heat (1995)|Action|Crime|Thri...|        1995|
|      7|      Sabrina (1995)|      Comedy|Romance|        1995|
|      8| Tom and Huck (1995)|Adventure|Children's|        1995|
|      9| Sudden Death (1995)|              Action|        1995|
|     10|    GoldenEye (1995)|Action|Adventure|...|        1995|
|     11|American Presiden...|Comedy|Drama|Romance|        1995|
|     12|Dracula: Dead and...|       Comedy|Horror|        1995|
|     13|        Balto (1

In [83]:
# renaming a column: 'Release Year' -> 'ReleaseYear' (removes the space from the name)
movies_df = movies_df.withColumnRenamed('Release Year', 'ReleaseYear')
movies_df.show()

+-------+--------------------+--------------------+-----------+
|MovieID|               Title|              Genres|ReleaseYear|
+-------+--------------------+--------------------+-----------+
|      1|    Toy Story (1995)|Animation|Childre...|       1995|
|      2|      Jumanji (1995)|Adventure|Childre...|       1995|
|      3|Grumpier Old Men ...|      Comedy|Romance|       1995|
|      4|Waiting to Exhale...|        Comedy|Drama|       1995|
|      5|Father of the Bri...|              Comedy|       1995|
|      6|         Heat (1995)|Action|Crime|Thri...|       1995|
|      7|      Sabrina (1995)|      Comedy|Romance|       1995|
|      8| Tom and Huck (1995)|Adventure|Children's|       1995|
|      9| Sudden Death (1995)|              Action|       1995|
|     10|    GoldenEye (1995)|Action|Adventure|...|       1995|
|     11|American Presiden...|Comedy|Drama|Romance|       1995|
|     12|Dracula: Dead and...|       Comedy|Horror|       1995|
|     13|        Balto (1995)|Animation|

In [85]:
# dropping a column: movies_df is rebound to a dataframe without 'ReleaseYear'
movies_df = movies_df.drop('ReleaseYear')
movies_df.show()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|        Comedy|Drama|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|Adventure|Children's|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Animation|Children's|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|      Drama|Thriller|
|     17|Sen

In [130]:
# filtering rows with a SQL expression string, then keeping two columns.
# The lowercase names still match MovieID/Title here, and the output header
# shows the spelling that was passed to select().
movies_df.filter('movieid<16').select('movieid', 'title').show()

+-------+--------------------+
|movieid|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
|      6|         Heat (1995)|
|      7|      Sabrina (1995)|
|      8| Tom and Huck (1995)|
|      9| Sudden Death (1995)|
|     10|    GoldenEye (1995)|
|     11|American Presiden...|
|     12|Dracula: Dead and...|
|     13|        Balto (1995)|
|     14|        Nixon (1995)|
|     15|Cutthroat Island ...|
+-------+--------------------+



In [131]:
# filtering rows on two conditions combined with AND inside one SQL expression string
movies_df.filter('movieid<16 AND movieid>5').show()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|Adventure|Children's|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Animation|Children's|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
+-------+--------------------+--------------------+



In [143]:
# the same filter written with Column expressions; each comparison is wrapped in
# parentheses because Python's & binds more tightly than < and >
movies_df.filter((movies_df['movieid'] < 16) & (movies_df['movieid'] > 5)).show()

+-------+--------------------+--------------------+
|MovieID|               Title|              Genres|
+-------+--------------------+--------------------+
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|Adventure|Children's|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Animation|Children's|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
+-------+--------------------+--------------------+



In [93]:
# Load the employee CSV (first line is the header, column types are inferred) and print it.
# The NULL cells in the output are the missing values handled in the cells that follow.
emp_df = spark.read.csv('../sample_data/employee_data/test.csv', header=True, inferSchema=True)
emp_df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [95]:
# drop every row that has a null value in at least one column
emp_df.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [96]:
# another way: passing how='any' explicitly gives the same result as calling drop() with no arguments
emp_df.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [100]:
# apply a threshold: keep only rows that have at least 2 non-null values
# (`how` is omitted on purpose -- when `thresh` is given it overrides `how`)
emp_df.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [103]:
# drop based on a subset of columns: only a null in 'age' causes a row to be dropped
emp_df.na.drop(how='any', subset=['age']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



In [105]:
# drop based on a subset of columns: a null in either 'age' or 'salary' drops the row
emp_df.na.drop(how='any', subset=['age', 'salary']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
+---------+---+----------+------+



In [117]:
# fill missing values per column: 'Missing' for null names, 50 for null ages
# (columns not listed in the dict, here Experience and Salary, keep their nulls)
emp_df.fillna({'name':'Missing', 'age':50}).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|   Mahesh| 50|      NULL| 40000|
|  Missing| 34|        10| 38000|
|  Missing| 36|      NULL|  NULL|
+---------+---+----------+------+



In [121]:
# Fill nulls in age, Experience and Salary with the median of the respective
# column. The imputed values are written to new "<column>_imputed" columns;
# the original columns are left as they are.
from pyspark.ml.feature import Imputer

# Single source of truth for the columns to impute (used for inputs and outputs).
impute_cols = ['age', 'Experience', 'Salary']

imputer = Imputer(
    strategy="median",
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
)

imputer.fit(emp_df).transform(emp_df).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 4|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 4|         20000|
+---------+----+----------+-