Starting the Spark session

In [34]:
from pyspark.sql import SparkSession
# Build a session for the application named "Practice"; getOrCreate() hands back
# an already-running session instead of creating a second one.
spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)
# A bare expression on the last line makes the notebook display the session.
spark

Reading the dataset

In [55]:
# Plain read with no options: the header row is treated as ordinary data and the
# columns receive the default names _c0, _c1, _c2 (see the output below).
df_pyspark = spark.read.csv(path="test.csv")
df_pyspark.show()

+-------+---+----------+
|    _c0|_c1|       _c2|
+-------+---+----------+
|   Name|Age|Experience|
| Faisal| 23|         2|
|  Imran| 28|         5|
| Zishan| 30|        10|
|Aayesha| 21|         1|
| Habiba| 20|         0|
+-------+---+----------+



Using the first row of the data as the header and identifying the data type of each column
`inferSchema=True` tells Spark to infer the data type of each column

In [56]:
# Use the first row as column names and infer each column's type.
# The resulting DataFrame is only displayed here; it is not assigned to a variable.
(
    spark.read
    .option('header', 'true')
    .csv('test.csv', inferSchema=True)
    .show()
)

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
| Faisal| 23|         2|
|  Imran| 28|         5|
| Zishan| 30|        10|
|Aayesha| 21|         1|
| Habiba| 20|         0|
+-------+---+----------+



In [57]:
# NOTE(review): df_pyspark still refers to the first, header-less read, because
# the header/inferSchema read in the previous cell was shown but never assigned.
# The schema printed here therefore has the default _c0.._c2 names, all typed as
# string, rather than the inferred types.
df_pyspark.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



Another way of writing the same read, passing `header` and `inferSchema` as keyword arguments

In [58]:
# Same read expressed with keyword arguments; this time the result replaces
# df_pyspark, so later cells work with the named columns.
df_pyspark = spark.read.csv(
    'test.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
| Faisal| 23|         2|
|  Imran| 28|         5|
| Zishan| 30|        10|
|Aayesha| 21|         1|
| Habiba| 20|         0|
+-------+---+----------+



Selecting specific columns

In [59]:
# Keep only the Name and Age columns for display; df_pyspark itself is unchanged.
df_pyspark.select("Name", "Age").show()

+-------+---+
|   Name|Age|
+-------+---+
| Faisal| 23|
|  Imran| 28|
| Zishan| 30|
|Aayesha| 21|
| Habiba| 20|
+-------+---+



In [60]:
# Show summary statistics per column; the output below lists count, mean,
# stddev, min and max (mean/stddev are null for the string column Name).
df_pyspark.describe().show()

+-------+-------+----------------+----------------+
|summary|   Name|             Age|      Experience|
+-------+-------+----------------+----------------+
|  count|      5|               5|               5|
|   mean|   null|            24.4|             3.6|
| stddev|   null|4.39317652729776|4.03732584763727|
|    min|Aayesha|              20|               0|
|    max| Zishan|              30|              10|
+-------+-------+----------------+----------------+



Adding columns to the DataFrame
We add a new column, "Experience After 2 yrs", whose value is each row's Experience plus 2

In [65]:
# Derive a new column holding each row's Experience plus 2 and keep the
# extended DataFrame under the same name.
df_pyspark = df_pyspark.withColumn(
    "Experience After 2 yrs",
    df_pyspark.Experience + 2,
)
df_pyspark.show()

+-------+---+----------+----------------------+
|   Name|Age|Experience|Experience After 2 yrs|
+-------+---+----------+----------------------+
| Faisal| 23|         2|                     4|
|  Imran| 28|         5|                     7|
| Zishan| 30|        10|                    12|
|Aayesha| 21|         1|                     3|
| Habiba| 20|         0|                     2|
+-------+---+----------+----------------------+



In [69]:
# Salary is computed as Experience multiplied by 10000.
df_pyspark = df_pyspark.withColumn(
    "Salary",
    df_pyspark.Experience * 10000,
)
df_pyspark.show()

+-------+---+----------+----------------------+------+
|   Name|Age|Experience|Experience After 2 yrs|Salary|
+-------+---+----------+----------------------+------+
| Faisal| 23|         2|                     4| 20000|
|  Imran| 28|         5|                     7| 50000|
| Zishan| 30|        10|                    12|100000|
|Aayesha| 21|         1|                     3| 10000|
| Habiba| 20|         0|                     2|     0|
+-------+---+----------+----------------------+------+



In [70]:
# Expense is computed as Salary minus 1000, so a Salary of 0 yields -1000
# (see the last row of the output below).
df_pyspark = df_pyspark.withColumn(
    "Expense",
    df_pyspark.Salary - 1000,
)
df_pyspark.show()

+-------+---+----------+----------------------+------+-------+
|   Name|Age|Experience|Experience After 2 yrs|Salary|Expense|
+-------+---+----------+----------------------+------+-------+
| Faisal| 23|         2|                     4| 20000|  19000|
|  Imran| 28|         5|                     7| 50000|  49000|
| Zishan| 30|        10|                    12|100000|  99000|
|Aayesha| 21|         1|                     3| 10000|   9000|
| Habiba| 20|         0|                     2|     0|  -1000|
+-------+---+----------+----------------------+------+-------+



Dropping the Columns

In [73]:
# Remove the derived 'Experience After 2 yrs' column; the result is assigned
# back to df_pyspark, so the column is gone for all later cells.
df_pyspark=df_pyspark.drop('Experience After 2 yrs')
df_pyspark.show()

+-------+---+----------+------+-------+
|   Name|Age|Experience|Salary|Expense|
+-------+---+----------+------+-------+
| Faisal| 23|         2| 20000|  19000|
|  Imran| 28|         5| 50000|  49000|
| Zishan| 30|        10|100000|  99000|
|Aayesha| 21|         1| 10000|   9000|
| Habiba| 20|         0|     0|  -1000|
+-------+---+----------+------+-------+



Renaming the columns

In [74]:
# Display the data with 'Name' renamed to 'New Name'. The renamed DataFrame is
# not assigned back, so df_pyspark itself keeps the original column name.
(
    df_pyspark
    .withColumnRenamed('Name', 'New Name')
    .show()
)

+--------+---+----------+------+-------+
|New Name|Age|Experience|Salary|Expense|
+--------+---+----------+------+-------+
|  Faisal| 23|         2| 20000|  19000|
|   Imran| 28|         5| 50000|  49000|
|  Zishan| 30|        10|100000|  99000|
| Aayesha| 21|         1| 10000|   9000|
|  Habiba| 20|         0|     0|  -1000|
+--------+---+----------+------+-------+

