In [11]:
!pip install pyspark

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [1]:
import pyspark
import pandas as pd

In [2]:
# Preview the raw CSV with pandas; the returned frame is displayed as the cell output.
pd.read_csv('dataset.csv')

Unnamed: 0,Name,Age,Experience,Salary
0,Krish,31.0,10.0,30000.0
1,Sudhanshu,30.0,8.0,25000.0
2,Sunny,29.0,6.0,15000.0
3,Ishan,32.0,4.0,10000.0
4,Nishan,26.0,,
5,Rahamatullah,62.0,60.0,
6,,,,


In [3]:
# we have to create a spark session
from pyspark.sql import SparkSession

In [4]:
# Obtain the Spark session through the builder, with the application name set to 'Practise'.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [6]:
# Bare expression: display the session object as the cell output.
spark

In [7]:
# Read the dataset with Spark using the reader's default options
# (no header option is set, so the columns come out named _c0, _c1, ...).
df_pyspark = spark.read.csv('dataset.csv')

In [8]:
# Bare expression: display the DataFrame's column names and types.
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string]

In [9]:
# Print the rows; without the header option the CSV header line appears as the first data row.
df_pyspark.show()

+------------+----+----------+------+
|         _c0| _c1|       _c2|   _c3|
+------------+----+----------+------+
|        Name| Age|Experience|Salary|
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [10]:
# Read again with the 'header' option enabled so the first line supplies the column names;
# the bare expression displays the resulting DataFrame's columns and types.
spark.read.option('header','true').csv('dataset.csv')

DataFrame[Name: string, Age: string, Experience: string, Salary: string]

In [11]:
# show() prints the table and returns None, so its result must not be assigned:
# binding it to df_pyspark would replace the DataFrame with None.
# Just display the freshly read frame here.
spark.read.option('header','true').csv('dataset.csv').show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [12]:
# Load the CSV with the 'header' option enabled and keep the DataFrame for the cells that follow.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('dataset.csv')
)

In [13]:
# Check which class the reader returned.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [14]:
# Fetch the first three rows; they come back as a list of Row objects.
df_pyspark.head(3)

[Row(Name='Krish', Age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', Age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', Age='29', Experience='6', Salary='15000')]

In [15]:
# Print each column's name, type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



## PySpark DataFrame - Part 1

In [16]:
from pyspark.sql import SparkSession

In [17]:
# Obtain the Spark session through the builder, requesting the application name 'DataFrame'.
# NOTE(review): getOrCreate() may hand back the session already started earlier in this
# notebook instead of creating a new 'DataFrame' application - confirm which is intended.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [18]:
# Bare expression: display the session object as the cell output.
spark

In [19]:
## read dataset: first line used as header, schema inference requested
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('dataset.csv', inferSchema=True)
)

In [20]:
## check the schema produced by the read with inferSchema enabled
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [21]:
# Load the CSV with the header and schema-inference settings passed as keyword
# arguments, then print the rows.
# NOTE(review): the schema printed afterwards still reports every column as string, and
# the last data row shows NULL in upper case - presumably that row holds the literal
# text 'NULL', which would block numeric inference. Verify against dataset.csv and
# consider the reader's nullValue option.
df_pyspark = spark.read.csv(
    'dataset.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [22]:
# Print each column's name, type and nullability for the frame loaded with keyword options.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [23]:
# Check the class of the loaded object.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [24]:
# List the column names.
df_pyspark.columns

['Name', 'Age', 'Experience', 'Salary']

In [25]:
# Fetch the first three rows; they come back as a list of Row objects.
df_pyspark.head(3)

[Row(Name='Krish', Age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', Age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', Age='29', Experience='6', Salary='15000')]

In [26]:
# Print the rows as a text table.
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [27]:
# Select a single column by name and print it.
df_pyspark.select('Name').show()

+------------+
|        Name|
+------------+
|       Krish|
|   Sudhanshu|
|       Sunny|
|       Ishan|
|      Nishan|
|Rahamatullah|
|        NULL|
+------------+



In [28]:
# The result of select() on one column is itself a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [29]:
# Select several columns at once by passing a list of names, then print them.
df_pyspark.select(['Name','Experience']).show()

+------------+----------+
|        Name|Experience|
+------------+----------+
|       Krish|        10|
|   Sudhanshu|         8|
|       Sunny|         6|
|       Ishan|         4|
|      Nishan|      null|
|Rahamatullah|        60|
|        NULL|      NULL|
+------------+----------+



In [30]:
# List (column name, type) pairs.
df_pyspark.dtypes

[('Name', 'string'),
 ('Age', 'string'),
 ('Experience', 'string'),
 ('Salary', 'string')]

In [31]:
# Print summary statistics (count, mean, stddev, min, max) for the columns.
df_pyspark.describe().show()

+-------+-----+------------------+------------------+-----------------+
|summary| Name|               Age|        Experience|           Salary|
+-------+-----+------------------+------------------+-----------------+
|  count|    7|                 7|                 6|                5|
|   mean| null|              35.0|              17.6|          20000.0|
| stddev| null|13.386560424545209|23.807561823924768|9128.709291752768|
|    min|Ishan|                26|                10|            10000|
|    max|Sunny|              NULL|              NULL|             NULL|
+-------+-----+------------------+------------------+-----------------+



In [32]:
### Adding columns in DataFrame
# New column 'Experience After 2 years' = the Experience column plus 2.
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 years',
    df_pyspark['Experience'] + 2,
)

In [33]:
# Print the frame to confirm the 'Experience After 2 years' column was added.
df_pyspark.show()

+------------+----+----------+------+------------------------+
|        Name| Age|Experience|Salary|Experience After 2 years|
+------------+----+----------+------+------------------------+
|       Krish|  31|        10| 30000|                    12.0|
|   Sudhanshu|  30|         8| 25000|                    10.0|
|       Sunny|  29|         6| 15000|                     8.0|
|       Ishan|  32|         4| 10000|                     6.0|
|      Nishan|  26|      null|  null|                    null|
|Rahamatullah|  62|        60|  null|                    62.0|
|        NULL|NULL|      NULL|  NULL|                    null|
+------------+----+----------+------+------------------------+



In [34]:
### Drop the columns
# Use the exact name the column was created with ('After' with a capital A) so the
# drop does not depend on Spark resolving column names case-insensitively.
df_pyspark = df_pyspark.drop('Experience After 2 years')

In [35]:
# Print the frame to confirm the derived column is gone.
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [36]:
### rename the column
# The renamed frame is only printed here; df_pyspark is not reassigned,
# so it keeps the original 'Name' column.
df_pyspark.withColumnRenamed('Name','New Name').show()

+------------+----+----------+------+
|    New Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



## PySpark: Handling Missing Values

In [37]:
from pyspark.sql import SparkSession

In [38]:
# Obtain the Spark session through the builder, with the application name set to 'Practise'.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [39]:
# Bare expression: display the session object as the cell output.
spark

In [40]:
# Reload the dataset for the missing-values section: header row used for
# column names, schema inference requested.
df_pyspark = spark.read.csv(
    'dataset.csv',
    header=True,
    inferSchema=True,
)

In [41]:
# Print the freshly loaded rows before any null handling is applied.
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [42]:
## dropping a column (not a row): remove 'Name' and print the result;
## df_pyspark itself is not reassigned
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         6| 15000|
|  32|         4| 10000|
|  26|      null|  null|
|  62|        60|  null|
|NULL|      NULL|  NULL|
+----+----------+------+



In [43]:
## Drop the rows in which null values are present and print the result;
## df_pyspark itself is not reassigned.
## NOTE(review): the printed result still contains the row showing NULL in every
## column - presumably those cells hold the text 'NULL' rather than missing values; confirm.
df_pyspark.na.drop().show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         6| 15000|
|    Ishan|  32|         4| 10000|
|     NULL|NULL|      NULL|  NULL|
+---------+----+----------+------+



In [44]:
### Next: the `how` argument of na.drop() (e.g. 'any'); first re-display the unmodified frame
df_pyspark.show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [45]:
# Apply na.drop with how="all" and print the result; df_pyspark itself is not reassigned.
# NOTE(review): the printed result still contains every row, including the one showing
# NULL in all columns - presumably those cells hold the text 'NULL', not missing values; confirm.
df_pyspark.na.drop(how = "all").show()

+------------+----+----------+------+
|        Name| Age|Experience|Salary|
+------------+----+----------+------+
|       Krish|  31|        10| 30000|
|   Sudhanshu|  30|         8| 25000|
|       Sunny|  29|         6| 15000|
|       Ishan|  32|         4| 10000|
|      Nishan|  26|      null|  null|
|Rahamatullah|  62|        60|  null|
|        NULL|NULL|      NULL|  NULL|
+------------+----+----------+------+



In [46]:
## threshold
