In [4]:
# Import SparkSession; the session itself is built in the next cell.
from pyspark.sql import SparkSession

In [5]:
# Build a session named 'dataframe', or reuse one that already exists (getOrCreate).
spark = SparkSession.builder.appName('dataframe').getOrCreate()

In [8]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [10]:
# Read the dataset, using the first line of the file as column names.
# inferSchema is not set here, so every column comes back as string (see the repr below).
# Append .show() to print the rows; show() prints only the first 20 rows by default.
spark.read.option('header', 'true').csv('test1.csv')

DataFrame[Name: string, age: string, Experience: string, Salary: string]

In [42]:
# inferSchema=True asks Spark to infer each column's type (e.g. int, string) from the data.
# If it is not given, every column is read as string.
df_pyspark = spark.read.option('header', 'true').csv('test1.csv', inferSchema = True)

In [12]:
# Check the schema: column names, inferred types and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [13]:
# Another way of reading the dataset: pass header and inferSchema as keyword
# arguments to csv() instead of chaining .option().
df_pyspark = spark.read.csv('test1.csv', header = True, inferSchema = True)
# Print the rows as a text table.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [15]:
# Confirm the schema after re-reading with keyword arguments; the types are inferred as before.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [16]:
# A PySpark DataFrame is the data structure on which the operations below
# (select, describe, withColumn, drop, ...) are performed.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [17]:
# Get the column names as a list.
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

In [18]:
# Get the first 3 rows. head() returns them as a list of Row objects
# rather than printing a table like show().
df_pyspark.head(3)

[Row(Name='Krish', age=31, Experience=10, Salary=30000),
 Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', age=29, Experience=4, Salary=20000)]

In [19]:
# Print the DataFrame as a text table.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [20]:
# Pick a single column from the table above by name.
# select() returns a new DataFrame; no rows are printed until .show() is called.
df_pyspark.select('Name')

DataFrame[Name: string]

In [21]:
# Print the values of the selected column.
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [23]:
# To select two or more columns, pass a list of column names.
df_pyspark.select(['Name', 'age']).show()

+---------+---+
|     Name|age|
+---------+---+
|    Krish| 31|
|Sudhanshu| 30|
|    Sunny| 29|
|     Paul| 24|
|   Harsha| 21|
|  Shubham| 23|
+---------+---+



In [24]:
# Without .show(), the cell displays only the resulting DataFrame's column names and types.
df_pyspark.select(['Name', 'age'])

DataFrame[Name: string, age: int]

In [26]:
# Selecting a list of columns still returns a DataFrame.
type(df_pyspark.select(['Name', 'age']))

pyspark.sql.dataframe.DataFrame

In [27]:
# Indexing the DataFrame with a column name returns a Column object,
# not a DataFrame and not the column's values.
df_pyspark['Name']

Column<'Name'>

In [29]:
# List each column as a (name, type) pair.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [31]:
# describe() returns a new DataFrame of summary statistics; every column in it is a string
# (see the repr below). Call .show() on it to print the values.
df_pyspark.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

In [33]:
# Print count, mean, stddev, min and max for each column.
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [43]:
# Add a column to the existing DataFrame: Experience plus 3.
# The result of withColumn() is assigned back to df_pyspark so later cells see the new column.
df_pyspark = df_pyspark.withColumn('Experience after 3 years', df_pyspark['Experience']+3)

In [45]:
# Print the DataFrame, which now includes the added column.
df_pyspark.show()

+---------+---+----------+------+------------------------+
|     Name|age|Experience|Salary|Experience after 3 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      13|
|Sudhanshu| 30|         8| 25000|                      11|
|    Sunny| 29|         4| 20000|                       7|
|     Paul| 24|         3| 20000|                       6|
|   Harsha| 21|         1| 15000|                       4|
|  Shubham| 23|         2| 18000|                       5|
+---------+---+----------+------+------------------------+



In [48]:
# Drop the column added above; the result is assigned back to df_pyspark.
df_pyspark = df_pyspark.drop('Experience after 3 years')

In [50]:
# show() prints the DataFrame as a text table; the dropped column no longer appears.
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [52]:
# Rename the 'Name' column to 'New Name' and print the result.
# The renamed DataFrame is not assigned back, so df_pyspark itself is left unchanged.
df_pyspark.withColumnRenamed('Name', 'New Name').show()

+---------+---+----------+------+
| New Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

