#### Tutorial 2-Pyspark With Python-Pyspark DataFrames- Part 1
#### In this session we will cover
- PySpark DataFrame
- Reading the Dataset
- Checking the datatypes of the columns (Schema)
- Selecting Columns And Indexing
- Checking the describe option, similar to pandas
- Adding columns
- Dropping columns
- Renaming columns

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Build (or reuse, if one is already running) a Spark session named 'DataFrame'.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)
print(spark)
spark

<pyspark.sql.session.SparkSession object at 0x7fe1ff58d790>


In [0]:
### Read the dataset
# The 'header' option makes Spark use the first CSV row as the column names.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('/FileStore/tables/test_data-1.csv')
)
df_pyspark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|  Sushant| 23|         5|
|Dadasaheb| 24|         5|
|    Rahul| 22|         3|
|    Sagar| 22|         4|
|    Suraj| 23|         4|
|  Ranjeet| 23|         2|
|   Pramod| 23|         3|
|    Bhima| 24|         3|
+---------+---+----------+



In [0]:
  ### Check the schema
  df_pyspark.printSchema()
  ### Note: every column is reported as string here, which is not what we want for Age and Experience; the next cell re-reads the dataset with inferSchema=True to get proper types.

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [0]:
# Re-read the same file, this time passing inferSchema=True to csv() so that
# each column's datatype is decided from the values found in the CSV file.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('/FileStore/tables/test_data-1.csv', inferSchema=True)
)
df_pyspark.printSchema()  # print the schema and check the datatypes

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [0]:
# Both header and inferSchema can be passed directly to csv() as keyword
# arguments instead of chaining option() calls.
df_pyspark = spark.read.csv('/FileStore/tables/test_data-1.csv', header=True, inferSchema=True)
# printSchema() and show() write to stdout themselves and return None, so they
# are called directly; wrapping them in print() only adds a stray "None" line.
df_pyspark.printSchema()  # show the schema of the dataframe
df_pyspark.show()         # show the actual data in the dataframe

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)

None
+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|  Sushant| 23|         5|
|Dadasaheb| 24|         5|
|    Rahul| 22|         3|
|    Sagar| 22|         4|
|    Suraj| 23|         4|
|  Ranjeet| 23|         2|
|   Pramod| 23|         3|
|    Bhima| 24|         3|
+---------+---+----------+

None


In [0]:
# Let's check the Python type of the df_pyspark object
print(type(df_pyspark))

<class 'pyspark.sql.dataframe.DataFrame'>


In [0]:
df_pyspark.columns # the columns property returns a list of the dataframe's column names

Out[13]: ['Name', 'Age', 'Experience']

In [0]:
df_pyspark.head(2) # head(n) returns the first n rows of the dataframe as a list of Row objects

Out[15]: [Row(Name='Krish', Age=31, Experience=10),
 Row(Name='Sushant', Age=23, Experience=5)]

In [0]:
# Display the dataframe contents. The original referenced the undefined name
# `pf_pyspark`, which raises NameError on a fresh kernel.
df_pyspark.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|  Sushant| 23|         5|
|Dadasaheb| 24|         5|
|    Rahul| 22|         3|
|    Sagar| 22|         4|
|    Suraj| 23|         4|
|  Ranjeet| 23|         2|
|   Pramod| 23|         3|
|    Bhima| 24|         3|
+---------+---+----------+



In [0]:
name_df = df_pyspark.select('Name') # select() takes a column name and returns a new dataframe containing only that column

In [0]:
# Display the single-column dataframe produced by select('Name')
name_df.show()

+---------+
|     Name|
+---------+
|    Krish|
|  Sushant|
|Dadasaheb|
|    Rahul|
|    Sagar|
|    Suraj|
|  Ranjeet|
|   Pramod|
|    Bhima|
+---------+



In [0]:
# To select several columns at once, pass a list of column names to select().
new_df = df_pyspark.select(['Name','Experience'])
# show() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the table.
new_df.show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|  Sushant|         5|
|Dadasaheb|         5|
|    Rahul|         3|
|    Sagar|         4|
|    Suraj|         4|
|  Ranjeet|         2|
|   Pramod|         3|
|    Bhima|         3|
+---------+----------+

None


In [0]:
df_pyspark.dtypes # the dtypes attribute returns a list of (column name, datatype) tuples, one per column of the dataframe

Out[25]: [('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [0]:
# Describe: like pandas, a pyspark dataframe has a describe() function that returns summary statistics for its columns
df_describe = df_pyspark.describe() # returns a new dataframe with the summary rows count, mean, stddev, min and max (no median or mode)
df_describe.show()

+-------+-------+-----------------+-----------------+
|summary|   Name|              Age|       Experience|
+-------+-------+-----------------+-----------------+
|  count|      9|                9|                9|
|   mean|   null|23.88888888888889|4.333333333333333|
| stddev|   null|2.758824226207808|2.345207879911715|
|    min|  Bhima|               22|                2|
|    max|Sushant|               31|               10|
+-------+-------+-----------------+-----------------+



In [0]:
### Adding a new column to the dataframe
# withColumn() returns a new dataframe; the new column is Experience plus 2.
new_df = df_pyspark.withColumn(
    'Experience after 2 yrs',
    df_pyspark['Experience'] + 2,
)
new_df.show()

+---------+---+----------+----------------------+
|     Name|Age|Experience|Experience after 2 yrs|
+---------+---+----------+----------------------+
|    Krish| 31|        10|                    12|
|  Sushant| 23|         5|                     7|
|Dadasaheb| 24|         5|                     7|
|    Rahul| 22|         3|                     5|
|    Sagar| 22|         4|                     6|
|    Suraj| 23|         4|                     6|
|  Ranjeet| 23|         2|                     4|
|   Pramod| 23|         3|                     5|
|    Bhima| 24|         3|                     5|
+---------+---+----------+----------------------+



In [0]:
## Drop the column
# Drop from new_df, which carries the added 'Experience after 2 yrs' column.
# The original dropped from df_pyspark, which never had that column, so the
# call was a silent no-op and did not actually demonstrate dropping anything.
new_df = new_df.drop('Experience after 2 yrs')
new_df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|  Sushant| 23|         5|
|Dadasaheb| 24|         5|
|    Rahul| 22|         3|
|    Sagar| 22|         4|
|    Suraj| 23|         4|
|  Ranjeet| 23|         2|
|   Pramod| 23|         3|
|    Bhima| 24|         3|
+---------+---+----------+



In [0]:
 ### Rename the column
# withColumnRenamed(existing, new) returns a new dataframe with 'Name' renamed.
renamed_df = df_pyspark.withColumnRenamed(
    'Name',
    'New Name',
)
renamed_df.show()

+---------+---+----------+
| New Name|Age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|  Sushant| 23|         5|
|Dadasaheb| 24|         5|
|    Rahul| 22|         3|
|    Sagar| 22|         4|
|    Suraj| 23|         4|
|  Ranjeet| 23|         2|
|   Pramod| 23|         3|
|    Bhima| 24|         3|
+---------+---+----------+

