In [2]:
!pip install pyspark



In [9]:
import pyspark
import pandas as pd

In [8]:
### Mount Google Drive at /content/drive; the CSV paths used in this notebook live under that mount point ###
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Every time we work with PySpark, we first have to create a Spark session.

In [10]:
from pyspark.sql import SparkSession

In [11]:
### Obtain the session for this notebook under the application name 'Practice', one builder step per line ###
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

On the cloud, we can create multiple Spark instances and work in parallel.

In [12]:
### display the session object ###
spark

In [14]:
### read the CSV with default reader options (no header, no schema inference) ###
df_pyspark = spark.read.csv('/content/drive/MyDrive/PySpark/Sample data - Sheet1.csv')

In [16]:
### the bare repr lists only column names and types, not the rows ###
df_pyspark

DataFrame[_c0: string, _c1: string]

In [17]:
### no header option was given, so the header line (Name, Age) appears as a data row and the columns are named _c0/_c1 ###
df_pyspark.show()

+---------+---+
|      _c0|_c1|
+---------+---+
|     Name|Age|
|Sudhanshu| 23|
|    Sunny| 45|
|    Mohit| 67|
+---------+---+



We want `Name` and `Age` to be used as the column names instead of appearing as a data row.

In [20]:
df_pyspark = spark.read.option('header', 'true').csv('/content/drive/MyDrive/PySpark/Sample data - Sheet1.csv') ### the first row will be used as the column names

In [21]:
### Name and Age are now the column names rather than a data row ###
df_pyspark.show()

+---------+---+
|     Name|Age|
+---------+---+
|Sudhanshu| 23|
|    Sunny| 45|
|    Mohit| 67|
+---------+---+



In [22]:
type(df_pyspark) ### this is a pyspark.sql DataFrame, not a pandas DataFrame

pyspark.sql.dataframe.DataFrame

In [23]:
### head() with no argument returns the first row as a Row; Age is still a string because no schema was inferred ###
df_pyspark.head()

Row(Name='Sudhanshu', Age='23')

In [24]:
df_pyspark.printSchema() ### similar in spirit to pandas' df.info(): prints column names, types and nullability

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)



**Basic functions: Part 1**

In [30]:
### inferSchema=True makes Spark detect column types instead of reading every column as string ###
df = spark.read.option('header', 'true').csv('/content/drive/MyDrive/PySpark/Sample data - Sheet1(1).csv', inferSchema = True)

In [31]:
### this file has a third column, Experience ###
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|Sudhanshu| 23|        10|
|    Sunny| 45|         8|
|    Mohit| 67|         4|
+---------+---+----------+



In [32]:
### still a pyspark.sql DataFrame ###
type(df)

pyspark.sql.dataframe.DataFrame

In [33]:
### checking the schema: with inferSchema, Age and Experience are read as integers ###

df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



By default, every column (including numeric ones) is read as a string. To have Spark detect the column types, we have to pass the additional `inferSchema` option.

In [None]:
### another way to do it: pass header and inferSchema as keyword arguments to csv() ###

In [54]:
### same read, with header and inferSchema passed directly as keyword arguments ###
df = spark.read.csv('/content/drive/MyDrive/PySpark/Sample data - Sheet1(1).csv', header = True, inferSchema=True)

In [35]:
### same rows as the option()-based read ###
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|Sudhanshu| 23|        10|
|    Sunny| 45|         8|
|    Mohit| 67|         4|
+---------+---+----------+



In [36]:
### Age and Experience are again inferred as integers ###
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [37]:
### column names, returned as a list of strings ####

df.columns

['Name', 'Age', 'Experience']

In [39]:
### head(n) returns the first n rows as a list of Row objects ###

df.head(2)

[Row(Name='Sudhanshu', Age=23, Experience=10),
 Row(Name='Sunny', Age=45, Experience=8)]

In [41]:
### selecting a single column by name and displaying it ###

df.select('Name').show()

+---------+
|     Name|
+---------+
|Sudhanshu|
|    Sunny|
|    Mohit|
+---------+



In [43]:
### project several columns at once; the names are passed as separate arguments ###

df.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|Sudhanshu|        10|
|    Sunny|         8|
|    Mohit|         4|
+---------+----------+



In [47]:
### (column name, type) pairs for every column ###
df.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [49]:
### summary statistics per column; mean and stddev are null for the string column Name ###
df.describe().show()

+-------+-----+----+-----------------+
|summary| Name| Age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| null|45.0|7.333333333333333|
| stddev| null|22.0|3.055050463303893|
|    min|Mohit|  23|                4|
|    max|Sunny|  67|               10|
+-------+-----+----+-----------------+



In [55]:
#### adding a derived column (Experience + 2); df is reassigned so the new column is kept #####

df = df.withColumn('Experience After 2 years', df['Experience'] + 2)

In [56]:
### the new column is now part of df ###
df.show()

+---------+---+----------+------------------------+
|     Name|Age|Experience|Experience After 2 years|
+---------+---+----------+------------------------+
|Sudhanshu| 23|        10|                      12|
|    Sunny| 45|         8|                      10|
|    Mohit| 67|         4|                       6|
+---------+---+----------+------------------------+



In [59]:
### drop the derived column again; df is reassigned so the change sticks ###

df = df.drop('Experience After 2 years')

In [60]:
### the derived column is gone again ###
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|Sudhanshu| 23|        10|
|    Sunny| 45|         8|
|    Mohit| 67|         4|
+---------+---+----------+



In [61]:
### rename a column; the result is only displayed, not assigned, so df itself keeps the name 'Name' ###

df.withColumnRenamed('Name', 'New Name').show()

+---------+---+----------+
| New Name|Age|Experience|
+---------+---+----------+
|Sudhanshu| 23|        10|
|    Sunny| 45|         8|
|    Mohit| 67|         4|
+---------+---+----------+



In [62]:
### df still has the original column name because the renamed result was never assigned ###
df.show()

+---------+---+----------+
|     Name|Age|Experience|
+---------+---+----------+
|Sudhanshu| 23|        10|
|    Sunny| 45|         8|
|    Mohit| 67|         4|
+---------+---+----------+



In [65]:
#### reading the file again, with header and inferSchema ####
### NOTE(review): this same path yielded only Name/Age earlier in the notebook, but the output of this read has four columns and null values -- the file was presumably replaced between runs; confirm ###

df = spark.read.csv('/content/drive/MyDrive/PySpark/Sample data - Sheet1.csv', header = True, inferSchema=True)

In [66]:
### this read shows a Salary column and two rows containing nulls ###
df.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Sudhanshu| 23|        10|  2000|
|    Sunny| 45|         8|  3445|
|    Mohit| 67|         4|  3400|
|   Harsha| 34|         1|  1200|
|  Shubham| 54|        13|  3211|
|   Mahesh| 23|         4|  2355|
|     Paul| 68|         6|  3230|
|     null| 34|        10|  3500|
|     null| 23|      null|  null|
+---------+---+----------+------+



In [67]:
### drop a column; the result is only displayed, df is not reassigned ###

df.drop('Name').show()

+---+----------+------+
|Age|Experience|Salary|
+---+----------+------+
| 23|        10|  2000|
| 45|         8|  3445|
| 67|         4|  3400|
| 34|         1|  1200|
| 54|        13|  3211|
| 23|         4|  2355|
| 68|         6|  3230|
| 34|        10|  3500|
| 23|      null|  null|
+---+----------+------+



In [68]:
### drop every row that contains a null value (the two rows with a null Name disappear) ###

df.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Sudhanshu| 23|        10|  2000|
|    Sunny| 45|         8|  3445|
|    Mohit| 67|         4|  3400|
|   Harsha| 34|         1|  1200|
|  Shubham| 54|        13|  3211|
|   Mahesh| 23|         4|  2355|
|     Paul| 68|         6|  3230|
+---------+---+----------+------+



In [69]:
#### how='any': drop a row if at least one of its columns is null; the result matches the call without arguments

df.na.drop(how = 'any').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Sudhanshu| 23|        10|  2000|
|    Sunny| 45|         8|  3445|
|    Mohit| 67|         4|  3400|
|   Harsha| 34|         1|  1200|
|  Shubham| 54|        13|  3211|
|   Mahesh| 23|         4|  2355|
|     Paul| 68|         6|  3230|
+---------+---+----------+------+



In [70]:
### how='all': drop a row only when every column is null; no such row exists here, so all nine rows remain ###
df.na.drop(how = 'all').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Sudhanshu| 23|        10|  2000|
|    Sunny| 45|         8|  3445|
|    Mohit| 67|         4|  3400|
|   Harsha| 34|         1|  1200|
|  Shubham| 54|        13|  3211|
|   Mahesh| 23|         4|  2355|
|     Paul| 68|         6|  3230|
|     null| 34|        10|  3500|
|     null| 23|      null|  null|
+---------+---+----------+------+

