In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pyspark


In [3]:
import pandas as pd

In [4]:
## Preview the CSV with pandas before loading it with Spark.
## NOTE(review): machine-specific absolute path; consider a configurable data directory.
pd.read_csv(r'C:\Users\sivasu\Downloads\test.csv')

Unnamed: 0,Name,Age
0,Arun,30
1,Sunny,31
2,Ben,26


In [6]:
from pyspark.sql import SparkSession

In [7]:
## Get or create the Spark session (application name 'Practice') used to read data below.
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [8]:
## Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [10]:
## Read with default options: no header handling is requested, so the header line
## is loaded as an ordinary data row under default column names (_c0, _c1).
df_pyspark = spark.read.csv(r'C:\Users\sivasu\Downloads\test.csv')

In [17]:
## Print the rows as a text table; note "Name|Age" appearing as the first data row.
df_pyspark.show()

+-----+---+
|  _c0|_c1|
+-----+---+
| Name|Age|
| Arun| 30|
|Sunny| 31|
|  Ben| 26|
+-----+---+



In [19]:
## Same read, but with the 'header' option enabled so the first line supplies the column names.
df_pyspark = spark.read.option('header','true').csv(r'C:\Users\sivasu\Downloads\test.csv')

In [20]:
## The columns are now labelled Name and Age.
df_pyspark.show()

+-----+---+
| Name|Age|
+-----+---+
| Arun| 30|
|Sunny| 31|
|  Ben| 26|
+-----+---+



In [21]:
## Confirm the object is a Spark DataFrame (not a pandas one).
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [23]:
## head(n) returns the first n rows as a list of Row objects.
df_pyspark.head(3)

[Row(Name='Arun', Age='30'),
 Row(Name='Sunny', Age='31'),
 Row(Name='Ben', Age='26')]

In [24]:
## No schema inference was requested for this read, so every column is typed string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)



## Tutorial

In [25]:
from pyspark.sql import SparkSession

In [26]:
## NOTE(review): getOrCreate() returns an already-running session if one exists in this
## kernel, so the new appName presumably does not take effect here -- confirm via
## spark.sparkContext.appName.
spark = SparkSession.builder.appName('practicee').getOrCreate()

In [27]:
## Evaluate the session object so the notebook displays it.
spark

#### First Method

In [37]:
## Reading the dataset
## header is set through option(); inferSchema=True asks Spark to detect column types
## instead of defaulting every column to string.
## NOTE(review): the recorded output shows an Experience column that earlier reads of the
## same path did not have -- test.csv was evidently changed between runs.
spark.read.option('header', 'true').csv(r'C:\Users\sivasu\Downloads\test.csv', inferSchema = True)

DataFrame[Name: string, Age: int, Experience: int]

In [40]:
## Same read as the previous cell, this time keeping the typed DataFrame in df.
df = spark.read.option('header', 'true').csv(r'C:\Users\sivasu\Downloads\test.csv', inferSchema = True)

In [41]:
## Print the rows: Name, Age and Experience.
df.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| Arun| 30|        10|
|Sunny| 31|         8|
|  Ben| 26|         5|
+-----+---+----------+



In [42]:
## With inferSchema enabled, Age and Experience are typed integer rather than string.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



#### Second Method

In [43]:
## Alternative spelling: pass header and inferSchema as keyword arguments to csv()
## instead of chaining option().
df = spark.read.csv(r'C:\Users\sivasu\Downloads\test.csv', header = True, inferSchema = True)

In [44]:
## Same rows as with the first method.
df.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| Arun| 30|        10|
|Sunny| 31|         8|
|  Ben| 26|         5|
+-----+---+----------+



In [45]:
## Still a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [46]:
## Checking the data types of the columns (printSchema also reports nullability)
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [48]:
## First three rows as Row objects; Age and Experience now come back as Python ints.
df.head(3)

[Row(Name='Arun', Age=30, Experience=10),
 Row(Name='Sunny', Age=31, Experience=8),
 Row(Name='Ben', Age=26, Experience=5)]

In [49]:
## Full table, for comparison with the column selection that follows.
df.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| Arun| 30|        10|
|Sunny| 31|         8|
|  Ben| 26|         5|
+-----+---+----------+



In [54]:
## select() projects only the named columns.
df.select('Name', 'Experience').show()

+-----+----------+
| Name|Experience|
+-----+----------+
| Arun|        10|
|Sunny|         8|
|  Ben|         5|
+-----+----------+



In [58]:
## describe() reports count/mean/stddev/min/max per column; for the string column Name,
## mean and stddev come back as null.
df.describe().show()

+-------+-----+------------------+------------------+
|summary| Name|               Age|        Experience|
+-------+-----+------------------+------------------+
|  count|    3|                 3|                 3|
|   mean| null|              29.0| 7.666666666666667|
| stddev| null|2.6457513110645907|2.5166114784235836|
|    min| Arun|                26|                 5|
|    max|Sunny|                31|                10|
+-------+-----+------------------+------------------+



In [61]:
## Adding columns in pyspark DataFrame
## withColumn() returns a new DataFrame, so the result is assigned back to df.
df = df.withColumn('Experience After 2 Years', df['Experience'] + 2)

In [62]:
## The new column holds Experience + 2 for every row.
df.show()

+-----+---+----------+------------------------+
| Name|Age|Experience|Experience After 2 Years|
+-----+---+----------+------------------------+
| Arun| 30|        10|                      12|
|Sunny| 31|         8|                      10|
|  Ben| 26|         5|                       7|
+-----+---+----------+------------------------+



In [64]:
## Dropping a column
## The result is assigned back to df, so the derived column is removed for later cells.
df = df.drop('Experience After 2 Years')

In [65]:
## Back to the three original columns.
df.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
| Arun| 30|        10|
|Sunny| 31|         8|
|  Ben| 26|         5|
+-----+---+----------+



In [67]:
## Renaming a column
## withColumnRenamed(existing, new): 'Name' becomes 'New Name'; the data is unchanged.
df = df.withColumnRenamed('Name', 'New Name')

In [68]:
## The first column is now headed 'New Name'.
df.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|    Arun| 30|        10|
|   Sunny| 31|         8|
|     Ben| 26|         5|
+--------+---+----------+



### Handling Missing Values

In [69]:
from pyspark.sql import SparkSession

In [70]:
## NOTE(review): if a session is already running in this kernel, getOrCreate() returns it,
## so the 'part_three' appName presumably does not take effect -- confirm via
## spark.sparkContext.appName.
spark = SparkSession.builder.appName('part_three').getOrCreate()

In [72]:
## Reload the CSV with header and type inference.
## NOTE(review): the recorded output now shows a Salary column and null values that earlier
## reads of this same path did not have -- test.csv was evidently replaced between runs.
df = spark.read.csv(r'C:\Users\sivasu\Downloads\test.csv', header = True, inferSchema = True)

In [73]:
## Nine rows; the last three contain nulls.
df.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Arun|  30|        10| 30000|
| Sunny|  31|         8| 25000|
|   Ben|  26|         5| 20000|
|  Paul|  24|         4| 20000|
|Harsha|  21|         1| 15000|
|Mahesh|  23|         2| 18000|
| Anand|null|      null| 40000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [74]:
## Dropping a column
## The result is only displayed, not assigned, so df keeps its Name column.
df.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  30|        10| 30000|
|  31|         8| 25000|
|  26|         5| 20000|
|  24|         4| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [75]:
## Dropping the rows with NULL values
## With no arguments, any row containing at least one null is removed.
df.na.drop().show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Arun| 30|        10| 30000|
| Sunny| 31|         8| 25000|
|   Ben| 26|         5| 20000|
|  Paul| 24|         4| 20000|
|Harsha| 21|         1| 15000|
|Mahesh| 23|         2| 18000|
+------+---+----------+------+



In [76]:
## how = 'any' or 'all'
## 'any' drops a row if it contains at least one null
## 'all' drops a row only if all the values in the row are null
df.na.drop(how='any').show()


+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Arun| 30|        10| 30000|
| Sunny| 31|         8| 25000|
|   Ben| 26|         5| 20000|
|  Paul| 24|         4| 20000|
|Harsha| 21|         1| 15000|
|Mahesh| 23|         2| 18000|
+------+---+----------+------+



In [77]:
## thresh = 2 keeps only the rows that have at least 2 non-null values.
## When thresh is given it overrides `how`, so `how` is not passed here.
df.na.drop(thresh = 2).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Arun|  30|        10| 30000|
| Sunny|  31|         8| 25000|
|   Ben|  26|         5| 20000|
|  Paul|  24|         4| 20000|
|Harsha|  21|         1| 15000|
|Mahesh|  23|         2| 18000|
| Anand|null|      null| 40000|
|  null|  34|        10| 38000|
+------+----+----------+------+



In [78]:
## subset restricts the null check to the listed columns: only rows with a null
## Experience are dropped, so the row with a null Name is kept.
df.na.drop(how='any', subset = ['Experience']).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Arun| 30|        10| 30000|
| Sunny| 31|         8| 25000|
|   Ben| 26|         5| 20000|
|  Paul| 24|         4| 20000|
|Harsha| 21|         1| 15000|
|Mahesh| 23|         2| 18000|
|  null| 34|        10| 38000|
+------+---+----------+------+



In [84]:
## Filling the missing values with a string
## A string fill value is applied to string columns only, so just Name is filled here;
## the integer columns (Age, Experience, Salary) keep their nulls.
df.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|          Arun|  30|        10| 30000|
|         Sunny|  31|         8| 25000|
|           Ben|  26|         5| 20000|
|          Paul|  24|         4| 20000|
|        Harsha|  21|         1| 15000|
|        Mahesh|  23|         2| 18000|
|         Anand|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



In [85]:
## Same behaviour with a different string placeholder: only the Name column is filled.
df.na.fill('n/a').show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Arun|  30|        10| 30000|
| Sunny|  31|         8| 25000|
|   Ben|  26|         5| 20000|
|  Paul|  24|         4| 20000|
|Harsha|  21|         1| 15000|
|Mahesh|  23|         2| 18000|
| Anand|null|      null| 40000|
|   n/a|  34|        10| 38000|
|   n/a|  36|      null|  null|
+------+----+----------+------+



In [89]:
## A numeric fill value is applied to the numeric columns only; null Names stay null.
df.na.fill(0).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Arun| 30|        10| 30000|
| Sunny| 31|         8| 25000|
|   Ben| 26|         5| 20000|
|  Paul| 24|         4| 20000|
|Harsha| 21|         1| 15000|
|Mahesh| 23|         2| 18000|
| Anand|  0|         0| 40000|
|  null| 34|        10| 38000|
|  null| 36|         0|     0|
+------+---+----------+------+



In [82]:
## Column types: Name is string, the rest are integer. na.fill() only touches columns
## whose type matches the fill value.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [94]:
from pyspark.ml.feature import Imputer

In [96]:
Imputer = Imputer(
    inputCols = ['Age','Experience','Salary'],
    outputCols = ["{}_imputed".format(c) for c in ['Age', 'Experience', 'Salary']]).setStrategy("mean")

In [98]:
Imputer.fit(df).transform(df).show()

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
|  Arun|  30|        10| 30000|         30|                10|         30000|
| Sunny|  31|         8| 25000|         31|                 8|         25000|
|   Ben|  26|         5| 20000|         26|                 5|         20000|
|  Paul|  24|         4| 20000|         24|                 4|         20000|
|Harsha|  21|         1| 15000|         21|                 1|         15000|
|Mahesh|  23|         2| 18000|         23|                 2|         18000|
| Anand|null|      null| 40000|         28|                 5|         40000|
|  null|  34|        10| 38000|         34|                10|         38000|
|  null|  36|      null|  null|         36|                 5|         25750|
+------+----+----------+------+-----------+------------------+--------------+

### Filter Operations

In [101]:
## Keep only the complete rows for the filter examples.
## This overwrites df; re-read the CSV to get the rows with nulls back.
df = df.na.drop()

In [102]:
## Six complete rows remain.
df.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Arun| 30|        10| 30000|
| Sunny| 31|         8| 25000|
|   Ben| 26|         5| 20000|
|  Paul| 24|         4| 20000|
|Harsha| 21|         1| 15000|
|Mahesh| 23|         2| 18000|
+------+---+----------+------+



In [105]:
### Salary less than or equal to 20000
## filter() accepts a SQL-style condition string; select() then keeps only Name and Age.
df.filter('Salary <= 20000').select('Name', 'Age').show()

+------+---+
|  Name|Age|
+------+---+
|   Ben| 26|
|  Paul| 24|
|Harsha| 21|
|Mahesh| 23|
+------+---+



In [109]:
## Column-expression form: combine conditions with & and wrap each comparison in
## parentheses, because & binds more tightly than <= / >= in Python.
df.filter((df['Salary']<= 20000) & (df['Salary']>= 15000)).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|   Ben| 26|         5| 20000|
|  Paul| 24|         4| 20000|
|Harsha| 21|         1| 15000|
|Mahesh| 23|         2| 18000|
+------+---+----------+------+



In [111]:
## Not operation / Negation
## ~ inverts the condition, leaving the rows whose Salary is above 20000.
df.filter(~(df['Salary'] <= 20000)).show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
| Arun| 30|        10| 30000|
|Sunny| 31|         8| 25000|
+-----+---+----------+------+



In [1]:
from pyspark.sql import SparkSession

In [2]:
## NOTE(review): execution counts restart at 1 here, so this presumably runs in a fresh
## kernel and creates a new session named 'Daytwo'.
spark = SparkSession.builder.appName('Daytwo').getOrCreate()

## Group By and Aggregations

In [4]:
## Load the data for the group-by examples.
## NOTE(review): same test.csv path as before, but the recorded output shows
## Name/Department/Salary columns -- the file contents were evidently replaced.
df = spark.read.csv(r'C:\Users\sivasu\Downloads\test.csv', header =True, inferSchema = True)

In [5]:
## Ten rows; each Name appears with more than one Department.
df.show()

+------+------------+------+
|  Name|  Department|Salary|
+------+------------+------+
|  Arun|Data Science| 30000|
|  Arun|         IOT| 25000|
| Anand|    Big Data| 20000|
|  Arun|    Big Data| 20000|
| Anand|Data Science| 15000|
|Mahesh|Data Science| 18000|
|Mahesh|         IOT| 40000|
|Mahesh|    Big Data| 38000|
| Sunny|Data Science|  5000|
| Sunny|    Big Data|  8000|
+------+------------+------+



In [6]:
## Name and Department are strings; Salary is integer, so it can be summed.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [28]:
## Group By
## Sum of Salary for each distinct Name; the result column is named sum(Salary).
df.groupBy('Name').sum('Salary').show()

+------+-----------+
|  Name|sum(Salary)|
+------+-----------+
|  Arun|      75000|
| Sunny|      13000|
| Anand|      35000|
|Mahesh|      96000|
+------+-----------+



In [13]:
## Sum of Salary for each Department.
df.groupBy('Department').sum('Salary').show()

+------------+-----------+
|  Department|sum(Salary)|
+------------+-----------+
|         IOT|      65000|
|    Big Data|      86000|
|Data Science|      68000|
+------------+-----------+



In [30]:
## Number of rows in each Department.
df.groupBy('Department').count().show()

+------------+-----+
|  Department|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [31]:
## agg() without a groupBy aggregates over the whole DataFrame: the dict maps
## column name -> aggregate function, giving the grand total of Salary.
df.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|     219000|
+-----------+



In [32]:
## Reload the CSV, overwriting the department DataFrame held in df.
## NOTE(review): same path again, but the recorded output shows Name/Age/Experience/Salary
## -- the file contents were evidently replaced once more.
df = spark.read.csv(r'C:\Users\sivasu\Downloads\test.csv', header = True, inferSchema = True)

In [33]:
## Six rows with no nulls.
df.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Arun| 42|        12| 30000|
| Krish| 34|         8| 25000|
| Vansh| 36|         9| 20000|
|  Matt| 28|         4| 19000|
| Anand| 24|         1| 15000|
|Mahesh| 31|         6| 18000|
+------+---+----------+------+



In [34]:
## Name is string; Age, Experience and Salary are integers.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [35]:
## Column names as a plain Python list.
df.columns

['Name', 'Age', 'Experience', 'Salary']