In [None]:
!pip install pyspark



In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Start the Spark session that every later cell uses.
spark = (
    SparkSession.builder
    .appName("Practice2")
    .getOrCreate()
)

In [None]:
# Display the session object to confirm it was created.
spark

Reading the dataset #Method1 - setting the header through option()

In [None]:
# Load the CSV, treating its first line as the column names.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv')
)

Checking the schema

In [None]:
# Print each column's name, data type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



The schema above shows that even the numeric columns "age", "Experience" and "Salary" were read as strings. Let's fix this by asking Spark to infer the column types:

In [None]:
# Reload the file with inferSchema enabled; without it every column
# is read as a string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)

In [None]:
# Re-check the schema after reloading with inferSchema.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



Reading the dataset #Method2 - passing both header and inferSchema directly to csv()

In [None]:
# Same load as before, with both settings given as keyword arguments.
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the rows of the dataframe as a text table.
df_pyspark.show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
| Shivam| 31|        10| 30000|
|  Aashi| 30|         8| 25000|
| Monica| 29|         4| 20000|
|   Paul| 24|         3| 20000|
|Karnika| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [None]:
# Confirm that this way of reading yields the same inferred column types.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [None]:
# Inspect the Python type of the loaded object: a dataframe is the data
# structure on which all the operations below are performed.
type(df_pyspark)

Getting all column names

In [None]:
# The column names, returned as a plain Python list of strings.
df_pyspark.columns

['Name', 'age', 'Experience', 'Salary']

Getting the first few rows (as Row objects, which also carry the column names)

In [None]:
# head(2) returns the first 2 rows as a Python list of Row objects.
df_pyspark.head(2)

[Row(Name='Shivam', age=31, Experience=10, Salary=30000),
 Row(Name='Aashi', age=30, Experience=8, Salary=25000)]

Selecting a column #Method1 - using select()

In [None]:
# select() returns a new DataFrame holding only the 'Name' column,
# so the cell output is just that DataFrame's schema.
df_pyspark.select('Name')

DataFrame[Name: string]

In the example above, the return type is a DataFrame, so only its schema is displayed. Let's use show() to view the values in the column.

In [None]:
# Print the values of the selected 'Name' column.
df_pyspark.select('Name').show()

+-------+
|   Name|
+-------+
| Shivam|
|  Aashi|
| Monica|
|   Paul|
|Karnika|
|Shubham|
+-------+



Checking the type of the object returned by select()

In [None]:
# Inspect the Python type of what select() returns.
type(df_pyspark.select('Name'))

Selecting 2 columns

In [None]:
# Passing a list of names to select() gives a DataFrame with just those
# two columns.
df_pyspark.select(['Name', 'Experience'])

DataFrame[Name: string, Experience: int]

In [None]:
# Print the values of the two selected columns.
df_pyspark.select(['Name', 'Experience']).show()

+-------+----------+
|   Name|Experience|
+-------+----------+
| Shivam|        10|
|  Aashi|         8|
| Monica|         4|
|   Paul|         3|
|Karnika|         1|
|Shubham|         2|
+-------+----------+



Selecting a column #Method2 - indexing the dataframe directly

In [None]:
# Indexing by name yields a Column object rather than a DataFrame, so the
# output shows no data; use select() when you want to view the values.
df_pyspark['Name']

Column<'Name'>

Checking the datatypes of columns

In [None]:
# List of (column name, data type) pairs for every column.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

Summary statistics with describe(), similar to pandas

In [None]:
# describe() returns a summary DataFrame (every column typed as string);
# the bare expression shows only its schema, not the statistics.
df_pyspark.describe()

DataFrame[summary: string, Name: string, age: string, Experience: string, Salary: string]

Using show() with describe() to view the statistics

In [None]:
# Print count, mean, stddev, min and max for each column.
df_pyspark.describe().show()

+-------+-------+------------------+-----------------+------------------+
|summary|   Name|               age|       Experience|            Salary|
+-------+-------+------------------+-----------------+------------------+
|  count|      6|                 6|                6|                 6|
|   mean|   NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|   NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|  Aashi|                21|                1|             15000|
|    max|Shubham|                31|               10|             30000|
+-------+-------+------------------+-----------------+------------------+



Adding a column to the dataframe (withColumn returns a new dataframe; df_pyspark itself is not modified)

In [None]:
# Returns a new DataFrame with an extra int column computed as Experience + 2.
# The result is not assigned, so df_pyspark keeps its original four columns.
df_pyspark.withColumn('Experience After 2 years', df_pyspark['Experience'] + 2)

DataFrame[Name: string, age: int, Experience: int, Salary: int, Experience After 2 years: int]

In [None]:
# Print the dataframe together with the derived column (Experience + 2).
df_pyspark.withColumn('Experience After 2 years', df_pyspark['Experience'] + 2).show()

+-------+---+----------+------+------------------------+
|   Name|age|Experience|Salary|Experience After 2 years|
+-------+---+----------+------+------------------------+
| Shivam| 31|        10| 30000|                      12|
|  Aashi| 30|         8| 25000|                      10|
| Monica| 29|         4| 20000|                       6|
|   Paul| 24|         3| 20000|                       5|
|Karnika| 21|         1| 15000|                       3|
|Shubham| 23|         2| 18000|                       4|
+-------+---+----------+------+------------------------+



Dropping a column from the dataframe (drop returns a new dataframe; df_pyspark itself is not modified)

In [None]:
# drop() returns a new DataFrame without the 'age' column.
df_pyspark.drop('age')

DataFrame[Name: string, Experience: int, Salary: int]

In [None]:
# Print the remaining columns once 'age' has been dropped.
df_pyspark.drop('age').show()

+-------+----------+------+
|   Name|Experience|Salary|
+-------+----------+------+
| Shivam|        10| 30000|
|  Aashi|         8| 25000|
| Monica|         4| 20000|
|   Paul|         3| 20000|
|Karnika|         1| 15000|
|Shubham|         2| 18000|
+-------+----------+------+



Renaming a column (withColumnRenamed returns a new dataframe; df_pyspark itself is not modified)

In [None]:
# Returns a new DataFrame in which the 'Name' column is called 'New Name'.
df_pyspark.withColumnRenamed('Name', 'New Name')

DataFrame[New Name: string, age: int, Experience: int, Salary: int]

In [None]:
# Print the dataframe with 'Name' renamed to 'New Name'; the data is unchanged.
df_pyspark.withColumnRenamed('Name', 'New Name').show()

+--------+---+----------+------+
|New Name|age|Experience|Salary|
+--------+---+----------+------+
|  Shivam| 31|        10| 30000|
|   Aashi| 30|         8| 25000|
|  Monica| 29|         4| 20000|
|    Paul| 24|         3| 20000|
| Karnika| 21|         1| 15000|
| Shubham| 23|         2| 18000|
+--------+---+----------+------+

