## Installing PySpark

In [2]:
%pip install pyspark


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Loading PySpark and Starting a Session

In [3]:
import pyspark

In [4]:
import pandas as pd
# Load the CSV with pandas first; the same file is read through Spark further down.
pd.read_csv('Dataset/test1.csv')

Unnamed: 0,Name,age,Experience
0,Krish,31,10
1,Sudhanshu,30,8
2,Sunny,29,4


In [5]:
from pyspark.sql import SparkSession

In [6]:
# Start a Spark session named 'Practice'; getOrCreate() returns the existing
# session instead of creating a second one if the cell is run again.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

24/01/10 11:12:20 WARN Utils: Your hostname, codespaces-078eac resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/01/10 11:12:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/10 11:12:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Display the SparkSession object created in the previous cell.
spark

### Reading Dataset

In [8]:
# header=True uses the first line of the file as the column names.
# inferSchema=True lets Spark detect the column types; without it every
# column is read as a string.
# Equivalent option-style form:
#   spark.read.option('header', 'true').csv('Dataset/test1.csv', inferSchema=True)
df_spark = spark.read.csv('Dataset/test1.csv', header=True, inferSchema=True)


In [9]:
# Print the rows of the DataFrame as a text table.
df_spark.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [10]:
type(df_spark)

# A DataFrame is the data structure on which the operations in this notebook are performed.

pyspark.sql.dataframe.DataFrame

In [11]:
# Print each column's name, data type and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



### Select

In [12]:
# List the column names.
df_spark.columns

['Name', 'age', 'Experience']

In [13]:
# Fetch the first 2 rows.
df_spark.head(2)

# The result is a Python list of Row objects, whereas pandas' head() returns a DataFrame.

[Row(Name='Krish', age=31, Experience=10),
 Row(Name='Sudhanshu', age=30, Experience=8)]

In [14]:
# Select a single column by name.
# Only the last expression of a cell is displayed, so this first result is not shown.
df_spark.select('Name')

# Select several columns by passing a list of names.
df_spark.select(['Name','Experience'])

# select() returns a DataFrame.

DataFrame[Name: string, Experience: int]

In [15]:
# Indexing the DataFrame with a column name returns a Column object, not a DataFrame.

df_spark['Name']

Column<'Name'>

### Check Datatype

In [16]:
# List (column name, data type) pairs.
df_spark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

### Describe

In [17]:
# Print count, mean, stddev, min and max for each column.
df_spark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| NULL|30.0|7.333333333333333|
| stddev| NULL| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



                                                                                

### Adding and Removing

In [18]:
# Add a derived column holding each row's Experience plus 2.
# The result is assigned back to df_spark.
df_spark = df_spark.withColumn(
    'Experience after 2 years',
    df_spark['Experience'] + 2,
)

In [19]:
# A bare DataFrame expression displays its column names and types rather than its rows.
df_spark

DataFrame[Name: string, age: int, Experience: int, Experience after 2 years: int]

In [20]:
# Remove the 'Experience after 2 years' column; the result is assigned back to df_spark.

df_spark = df_spark.drop('Experience after 2 years')

In [21]:
# Rename the 'Name' column to 'New Name'; the result is assigned back to df_spark.

df_spark = df_spark.withColumnRenamed('Name','New Name')

In [22]:
# Display the DataFrame after the column changes made in the preceding cells.
df_spark.show()

+---------+---+----------+
| New Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



## Dealing with Missing Values

In [37]:
# Read the second dataset with a header row and inferred column types.
# NOTE(review): this cell's execution count (37) is out of sequence with the
# cells around it, so some outputs shown below may come from an earlier version
# of this read. Re-run the notebook top to bottom to refresh them.
df2_spark = spark.read.csv('Dataset/test2.csv',header=True,inferSchema=True)

In [24]:
# Display the loaded rows; NULL marks the missing values.
df2_spark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



### Removing rows with null values

In [25]:
# With no arguments, na.drop() removes every row that contains at least one null.
df2_spark.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         2| 18000|
+---------+---+----------+------+



In [26]:
# how='all' drops a row only when every value in it is null.
# No row in this dataset is entirely null, so all rows are kept.
df2_spark.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [27]:
# how='any' drops a row if any of its values is null (same rows as the argument-free call).
df2_spark.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         2| 18000|
+---------+---+----------+------+



In [28]:
# thresh=2 keeps only the rows that have at least 2 non-null values.
# It is a minimum count of non-null values, not a maximum count of nulls.
# When thresh is given it takes precedence over `how`, so `how` is omitted here.
df2_spark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [29]:
# subset=['Name'] checks only the Name column: rows with a null Name are dropped,
# while nulls in the other columns are left in place.
df2_spark.na.drop(how="any",subset=['Name']).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
+---------+----+----------+------+



In [30]:
# na.fill() with a string value only applies to string columns; columns of any
# other type in the subset are silently skipped. df2_spark is read with
# inferSchema=True, which makes Experience an integer column, so it is cast to
# string first so that the 'Missing Values' placeholder is actually written.
(
    df2_spark
    .withColumn('Experience', df2_spark['Experience'].cast('string'))
    .na.fill('Missing Values', subset=['Experience'])
    .show()
)

+---------+----+--------------+------+
|     Name| Age|    Experience|Salary|
+---------+----+--------------+------+
|    Krish|  31|            10| 30000|
|Sudhanshu|  30|             8| 25000|
|    Sunny|  29|             4| 20000|
|     Paul|  24|             3| 20000|
|   Harsha|  21|             2| 18000|
|   Mahesh|NULL|Missing Values| 40000|
|     NULL|  34|            10| 38000|
|     NULL|  36|Missing Values|  NULL|
+---------+----+--------------+------+



In [38]:
from pyspark.ml.feature import Imputer

# Configure mean imputation for the Age, Experience and Salary columns.
# Each result is written to a new "<column>_imputed" column.
imputer = Imputer().setStrategy("mean").setInputCols(['Age', 'Experience', 'Salary'])
imputer = imputer.setOutputCols(
    [f"{name}_imputed" for name in imputer.getInputCols()]
)

In [39]:
# Fit the imputer on df2_spark, then apply the fitted model to the same data
# and print the result, including the added *_imputed columns.
(
    imputer
    .fit(df2_spark)
    .transform(df2_spark)
    .show()
)

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         2| 18000|         21|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 6|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 6|         27285|
+---------+----+----------+------+-----------+------------------+--------------+

