In [1]:
import pyspark
import pandas as pd

In [2]:
pd.read_csv('Untitled.csv')  # Read the CSV with pandas; the resulting DataFrame is shown as the cell output

Unnamed: 0,Name,Age
0,Subham,34
1,Soham,53
2,Swarnali,32
3,Tanvir,23


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Build a Spark session named 'Trial_Session' (getOrCreate returns an already
# running session if there is one, otherwise it starts a new one).
spark = (
    SparkSession.builder
    .appName('Trial_Session')
    .getOrCreate()
)

In [5]:
# Echo the SparkSession object as the cell output.
spark

In [6]:
# Read the CSV with PySpark using the reader's default options. As the output
# shows, the columns get generic names (_c0, _c1) and are typed as strings.
df_pyspark = spark.read.csv('Untitled.csv')  # Read the file through PySpark
df_pyspark

DataFrame[_c0: string, _c1: string]

In [7]:
type(df_pyspark)  # Confirm the object is a PySpark DataFrame (see output)

pyspark.sql.dataframe.DataFrame

In [8]:
# Fetch the first 5 rows as a list of Row objects. Because no header option
# was set when reading, the CSV's header line comes back as an ordinary row.
df_pyspark.head(5)

[Row(_c0='Name', _c1='Age'),
 Row(_c0='Subham', _c1='34'),
 Row(_c0='Soham', _c1='53'),
 Row(_c0='Swarnali', _c1='32'),
 Row(_c0='Tanvir', _c1='23')]

In [9]:
# Print each column's name, data type and nullability.
df_pyspark.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



In [10]:
df_pyspark.show() # Print the DataFrame's rows as a text table

+--------+---+
|     _c0|_c1|
+--------+---+
|    Name|Age|
|  Subham| 34|
|   Soham| 53|
|Swarnali| 32|
|  Tanvir| 23|
+--------+---+



In [11]:
df_pyspark2 =spark.read.option('header','true').csv("Untitled.csv") # Treat the first line of the CSV as column names instead of data

In [12]:
# Reading with the header option still yields a PySpark DataFrame (see output).
type(df_pyspark2)

pyspark.sql.dataframe.DataFrame

In [13]:
# Display the header-aware DataFrame already loaded as df_pyspark2, rather
# than reading the same CSV from disk a second time with identical options.
df_pyspark2.show()

+--------+---+
|    Name|Age|
+--------+---+
|  Subham| 34|
|   Soham| 53|
|Swarnali| 32|
|  Tanvir| 23|
+--------+---+



In [17]:
# Load the second CSV, using its first line as the header and letting Spark
# infer each column's data type from the values.
df_s = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Untitled2.csv")
)
df_s.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|  Subham| 34|        23|
|   Soham| 53|         6|
|Swarnali| 32|         4|
|  Tanvir| 23|         3|
+--------+---+----------+



In [18]:
## Check the schema / data types: with inferSchema enabled, Age and
## Experience are read as integers rather than strings (see output).

df_s.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [19]:
# Same file and options as the earlier df_s read, this time with header and
# inferSchema both passed as keyword arguments to csv().
df_s = spark.read.csv('Untitled2.csv',header=True,inferSchema=True) 

In [20]:
# Echo the DataFrame: the output lists column names and types, not the rows.
df_s

DataFrame[Name: string, Age: int, Experience: int]

In [21]:
# Confirm df_s is a PySpark DataFrame (see output).
type(df_s)

pyspark.sql.dataframe.DataFrame

In [22]:
df_s.columns # List the column names as a Python list of strings

['Name', 'Age', 'Experience']

In [25]:
df_s.show() # Display the rows with every column

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|  Subham| 34|        23|
|   Soham| 53|         6|
|Swarnali| 32|         4|
|  Tanvir| 23|         3|
+--------+---+----------+



In [26]:
df_s.select("Name").show() # Display only the "Name" column

+--------+
|    Name|
+--------+
|  Subham|
|   Soham|
|Swarnali|
|  Tanvir|
+--------+



In [27]:
df_s.select("Name","Experience").show() # Display several columns by passing each name to select()

+--------+----------+
|    Name|Experience|
+--------+----------+
|  Subham|        23|
|   Soham|         6|
|Swarnali|         4|
|  Tanvir|         3|
+--------+----------+



In [30]:
df_s.dtypes # List (column name, data type) pairs

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [33]:
df_s.describe().show()   # Summary statistics per column (count, mean, stddev, min, max), similar to pandas' describe()

+-------+------+-----------------+----------------+
|summary|  Name|              Age|      Experience|
+-------+------+-----------------+----------------+
|  count|     4|                4|               4|
|   mean|  NULL|             35.5|             9.0|
| stddev|  NULL|12.60952021291849|9.41629792788369|
|    min| Soham|               23|               3|
|    max|Tanvir|               53|              23|
+-------+------+-----------------+----------------+



In [36]:
### Adding Columns

# Derive a new column holding each row's Experience plus 2, and bind the
# resulting DataFrame to a new name.
df_s_new = df_s.withColumn(
    'Experience After Two Years',
    df_s.Experience + 2,
)

In [39]:
# Display the DataFrame including the newly added column.
df_s_new.show()


+--------+---+----------+--------------------------+
|    Name|Age|Experience|Experience After Two Years|
+--------+---+----------+--------------------------+
|  Subham| 34|        23|                        25|
|   Soham| 53|         6|                         8|
|Swarnali| 32|         4|                         6|
|  Tanvir| 23|         3|                         5|
+--------+---+----------+--------------------------+



In [42]:
### Dropping Columns

# Remove the derived column by name and keep the result under a new name;
# the output shows the three remaining columns.
df_s_new2 = df_s_new.drop('Experience After Two Years')
df_s_new2.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|  Subham| 34|        23|
|   Soham| 53|         6|
|Swarnali| 32|         4|
|  Tanvir| 23|         3|
+--------+---+----------+



In [43]:
### Renaming Columns
# Rename 'Name' to 'New Name'. The renamed DataFrame is only displayed here;
# it is not assigned to any variable.
df_s_new2.withColumnRenamed('Name','New Name').show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|  Subham| 34|        23|
|   Soham| 53|         6|
|Swarnali| 32|         4|
|  Tanvir| 23|         3|
+--------+---+----------+



In [45]:
# Load a third CSV that contains missing values (shown as NULL in the output).
# NOTE: this rebinds df_s_new, which previously held the DataFrame with the
# 'Experience After Two Years' column, to an unrelated dataset.
df_s_new =spark.read.csv("Untitled3.csv",header=True,inferSchema=True)
df_s_new.show()

+--------+----+----------+-------+
|    Name| Age|Experience| Salary|
+--------+----+----------+-------+
|  Subham|  34|        23|  50000|
|   Soham|  53|         6|   3323|
|Swarnali|  32|         4|  21421|
|  Tanvir|  23|         3|  23134|
|     Sam|  13|         2|  43432|
|  Ronnie|  43|        33| 654345|
|     Ram|NULL|      NULL| 245243|
|    NULL|  87|        45|1000000|
|    NULL|  56|      NULL|   NULL|
+--------+----+----------+-------+



In [48]:
### Drop the rows that contain null values

# With no arguments, every row that has a null in any column is removed:
# only the six fully populated rows remain in the output.
df_s_new.na.drop().show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|  Subham| 34|        23| 50000|
|   Soham| 53|         6|  3323|
|Swarnali| 32|         4| 21421|
|  Tanvir| 23|         3| 23134|
|     Sam| 13|         2| 43432|
|  Ronnie| 43|        33|654345|
+--------+---+----------+------+



In [52]:
### Drop the rows that contain null values (explicit how="any")

df_s_new.na.drop(how="any").show() # Drops every row containing at least one null; the output is identical to calling drop() with no arguments

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|  Subham| 34|        23| 50000|
|   Soham| 53|         6|  3323|
|Swarnali| 32|         4| 21421|
|  Tanvir| 23|         3| 23134|
|     Sam| 13|         2| 43432|
|  Ronnie| 43|        33|654345|
+--------+---+----------+------+



In [51]:
### Threshold

# Keep only rows that have at least 2 non-null values; rows with fewer are
# dropped. `how` is deliberately omitted: when `thresh` is given PySpark
# ignores `how`, so passing how="any" here would wrongly suggest that both
# conditions are applied.
df_s_new.na.drop(thresh=2).show()

+--------+----+----------+-------+
|    Name| Age|Experience| Salary|
+--------+----+----------+-------+
|  Subham|  34|        23|  50000|
|   Soham|  53|         6|   3323|
|Swarnali|  32|         4|  21421|
|  Tanvir|  23|         3|  23134|
|     Sam|  13|         2|  43432|
|  Ronnie|  43|        33| 654345|
|     Ram|NULL|      NULL| 245243|
|    NULL|  87|        45|1000000|
+--------+----+----------+-------+



In [53]:
### Subset

df_s_new.na.drop(how="any",subset=['Experience']).show() # Only the listed columns are checked: a row is dropped whenever 'Experience' is null, while nulls in other columns are kept

+--------+---+----------+-------+
|    Name|Age|Experience| Salary|
+--------+---+----------+-------+
|  Subham| 34|        23|  50000|
|   Soham| 53|         6|   3323|
|Swarnali| 32|         4|  21421|
|  Tanvir| 23|         3|  23134|
|     Sam| 13|         2|  43432|
|  Ronnie| 43|        33| 654345|
|    NULL| 87|        45|1000000|
+--------+---+----------+-------+

