-----------------------------------------------------
Created on: 29-08-2022                              
Author: Rohit Sharma                                
                                                     
-----------------------------------------------------

**In this notebook:**

* PySpark DataFrame
* Reading the Dataset
* Checking the data types of the columns (schema)
* Selecting columns and indexing
* Using `describe()` for summary statistics (similar to pandas)
* Adding columns
* Dropping columns
* Renaming columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that is already
# running, registered under the application name "DataFrame".
spark = (
    SparkSession.builder
    .appName("DataFrame")
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [9]:
# Read the dataset.
#   header      -> treat the first line of the file as column names
#   inferSchema -> let Spark work out each column's type from the data
#                  (without it every column is read as a string)
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', True)
    .csv('test1.csv')
)

In [10]:
# Check the schema: prints each column's name, data type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [11]:
# Alternative way to read the same file: pass the reader options directly
# to csv() as keyword arguments instead of chaining .option() calls.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

# Print the loaded rows as a text table.
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|    Ram| 28|         1|
|  Rohan| 27|         3|
|  Mohan| 25|         4|
|    Raj| 20|         2|
|   Neha| 24|         2|
|Prateek| 23|         5|
|  Mukul| 26|         3|
|  Disha| 26|         5|
|  Namit| 27|         6|
+-------+---+----------+



In [12]:
# Check the type of df_pyspark.
# A DataFrame is Spark's tabular data structure: rows organised into named, typed columns.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [13]:
# Selecting columns and indexing: `columns` gives the column names as a plain Python list.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [14]:
# head(3) returns the first three rows as a list of Row objects
# (it returns data rather than printing a table the way show() does).
df_pyspark.head(3)

[Row(Name='Ram', Age=28, Experience=1),
 Row(Name='Rohan', Age=27, Experience=3),
 Row(Name='Mohan', Age=25, Experience=4)]

In [17]:
# Select a single column. select() returns a new one-column DataFrame,
# which is then printed with show().
name_only = df_pyspark.select("Name")
name_only.show()

+-------+
|   Name|
+-------+
|    Ram|
|  Rohan|
|  Mohan|
|    Raj|
|   Neha|
|Prateek|
|  Mukul|
|  Disha|
|  Namit|
+-------+



In [18]:
# Select two columns by passing the column names as separate arguments.
name_and_experience = df_pyspark.select('Name', 'Experience')
name_and_experience.show()

+-------+----------+
|   Name|Experience|
+-------+----------+
|    Ram|         1|
|  Rohan|         3|
|  Mohan|         4|
|    Raj|         2|
|   Neha|         2|
|Prateek|         5|
|  Mukul|         3|
|  Disha|         5|
|  Namit|         6|
+-------+----------+



In [19]:
# Check data types: `dtypes` is a list of (column name, type name) tuples.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [21]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# describe() returns the statistics as another DataFrame, so show() is
# needed to print them.
summary_stats = df_pyspark.describe()
summary_stats.show()

+-------+-----+-----------------+------------------+
|summary| Name|              Age|        Experience|
+-------+-----+-----------------+------------------+
|  count|    9|                9|                 9|
|   mean| null|25.11111111111111|3.4444444444444446|
| stddev| null|2.472066162365221| 1.666666666666667|
|    min|Disha|               20|                 1|
|    max|Rohan|               28|                 6|
+-------+-----+-----------------+------------------+



In [23]:
# Add a derived column to the DataFrame.
# withColumn() returns a new DataFrame; the result is only displayed here and
# not stored, so df_pyspark itself keeps its original three columns.
experience_in_two_years = df_pyspark['Experience'] + 2
df_pyspark.withColumn('Experience After 2 years', experience_in_two_years).show()

+-------+---+----------+------------------------+
|   Name|Age|Experience|Experience After 2 years|
+-------+---+----------+------------------------+
|    Ram| 28|         1|                       3|
|  Rohan| 27|         3|                       5|
|  Mohan| 25|         4|                       6|
|    Raj| 20|         2|                       4|
|   Neha| 24|         2|                       4|
|Prateek| 23|         5|                       7|
|  Mukul| 26|         3|                       5|
|  Disha| 26|         5|                       7|
|  Namit| 27|         6|                       8|
+-------+---+----------+------------------------+



In [24]:
# Drop a column.
# df_pyspark has no 'Experience After 2 years' column (withColumn() returns a
# new DataFrame instead of modifying df_pyspark), and drop() silently does
# nothing when the named column is missing. Build the column here first so
# that the drop actually removes something.
with_future_experience = df_pyspark.withColumn(
    'Experience After 2 years', df_pyspark['Experience'] + 2
)

# drop() also returns a new DataFrame; `d` has the original three columns.
d = with_future_experience.drop("Experience After 2 years")

In [25]:
# Display the result of the drop; only Name, Age and Experience are listed.
d.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|    Ram| 28|         1|
|  Rohan| 27|         3|
|  Mohan| 25|         4|
|    Raj| 20|         2|
|   Neha| 24|         2|
|Prateek| 23|         5|
|  Mukul| 26|         3|
|  Disha| 26|         5|
|  Namit| 27|         6|
+-------+---+----------+



In [26]:
# Rename a column. withColumnRenamed() returns a new DataFrame in which
# 'Name' is called 'New Name'; df_pyspark itself is left as it was.
n = df_pyspark.withColumnRenamed(existing='Name', new='New Name')

In [27]:
# Display the renamed frame: the first column header is now 'New Name', the data is unchanged.
n.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|     Ram| 28|         1|
|   Rohan| 27|         3|
|   Mohan| 25|         4|
|     Raj| 20|         2|
|    Neha| 24|         2|
| Prateek| 23|         5|
|   Mukul| 26|         3|
|   Disha| 26|         5|
|   Namit| 27|         6|
+--------+---+----------+

