<a href="https://colab.research.google.com/github/pranayb-konverge/pyspark-tutorial/blob/main/PySpark_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install PySpark (the `pyspark` package) so it can be imported in the cells below.

In [35]:
!pip install pyspark



In [36]:
import pyspark

In [37]:
import pandas as pd
# Quick look at the raw CSV using pandas. The DataFrame is the last expression
# in the cell, so the notebook renders it as the cell output.
# Note: this reads 'pyspark_dummy_data.csv'; the Spark cells further down load
# a different file, 'pyspark_dummy_data2.csv'.
pd.read_csv('pyspark_dummy_data.csv')

Unnamed: 0,Name,Age
0,Fair Glowach,34
1,Alberik McGuiness,35
2,Marys Coweuppe,30
3,Ursula Finlaison,35
4,Marchall Danslow,33
...,...,...
95,Gideon Stoll,30
96,Adelheid Wicks,35
97,Alastair Blasio,31
98,Glad MacClay,30


In [38]:
from pyspark.sql import SparkSession

In [39]:
# Obtain the Spark session used by every later cell. The application is
# labelled 'Practice'; getOrCreate() hands back an already-running session
# if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [40]:
# Display the session object created above (rendered as the cell output).
spark

In [41]:
# Read the data set into a Spark DataFrame.
#   header=True      -> the first line of the file provides the column names
#   inferSchema=True -> Spark works out column types from the data
#                       (Age and Experience come back as int)
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('pyspark_dummy_data2.csv')
)

In [42]:
# Evaluating the DataFrame on its own shows only its column names and types,
# not the rows themselves.
df_pyspark

DataFrame[Name: string, Age: int, Experience: int]

In [43]:
# Check the schema: prints each column with its data type and nullability
# as a tree.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [44]:
# List the column names. This does not select any data; it returns the names
# as a plain Python list of strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [45]:
# Fetch the first three rows; the result is a Python list of Row objects.
df_pyspark.head(3)

[Row(Name='Meridith Marklow', Age=31, Experience=6),
 Row(Name='Rudie Kirkbride', Age=38, Experience=12),
 Row(Name='Gerik Kilrow', Age=41, Experience=15)]

In [46]:
# Select a single column by name, then fetch its first three rows.
df_pyspark.select('Name').head(3)

[Row(Name='Meridith Marklow'),
 Row(Name='Rudie Kirkbride'),
 Row(Name='Gerik Kilrow')]

In [47]:
# Select several columns by passing a list of names, then fetch three rows.
# The names use the schema's exact spelling ('Age', not 'age'): the lowercase
# form only resolved because Spark matches column names case-insensitively by
# default, and it caused the result field to be labelled 'age' rather than 'Age'.
df_pyspark.select(['Name','Age']).head(3)

[Row(Name='Meridith Marklow', age=31),
 Row(Name='Rudie Kirkbride', age=38),
 Row(Name='Gerik Kilrow', age=41)]

In [48]:
# Summary statistics, similar to pandas' describe(): count, mean, stddev, min
# and max for every column. For the string column (Name) mean and stddev are
# null, and min/max are the alphabetically first/last values.
df_pyspark.describe().show()

+-------+---------------+-----------------+----------------+
|summary|           Name|              Age|      Experience|
+-------+---------------+-----------------+----------------+
|  count|             50|               50|              50|
|   mean|           null|            36.54|            9.68|
| stddev|           null|4.612361385561674|3.12618956950627|
|    min|Aimee Retallick|               30|               5|
|    max|   Zabrina Titt|               45|              15|
+-------+---------------+-----------------+----------------+



In [49]:
# Add a derived column holding each person's Experience plus two.
# The result is bound back to the name df_pyspark, so every later cell that
# uses df_pyspark sees the extra 'Experience after 2 years' column.
df_pyspark = df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark['Experience'] + 2,
)

In [50]:
# Print the DataFrame as a text table to confirm the new column is present.
df_pyspark.show()

+------------------+---+----------+------------------------+
|              Name|Age|Experience|Experience after 2 years|
+------------------+---+----------+------------------------+
|  Meridith Marklow| 31|         6|                       8|
|   Rudie Kirkbride| 38|        12|                      14|
|      Gerik Kilrow| 41|        15|                      17|
|     Ileane Ablott| 31|        11|                      13|
|   Anthony Selland| 35|         8|                      10|
|  Jock Duckinfield| 43|         5|                       7|
|     Temple Latour| 37|        14|                      16|
|   Delphinia Arnet| 41|        10|                      12|
|  Langston Izakson| 43|         7|                       9|
|      Doria Figura| 30|        12|                      14|
|      Mellie Eyles| 32|        14|                      16|
|      Almire Bertl| 37|         8|                      10|
| Rollins Rignoldes| 30|         7|                       9|
|Jacklin Champerlen| 32|

In [51]:
# Drop the derived column again. The result is bound to a new name
# (df_pyspark_dropped); df_pyspark itself is not reassigned in this cell.
df_pyspark_dropped = df_pyspark.drop('Experience after 2 years')

In [52]:
# Print the trimmed DataFrame; show() with no arguments prints only the
# top 20 rows.
df_pyspark_dropped.show()

+------------------+---+----------+
|              Name|Age|Experience|
+------------------+---+----------+
|  Meridith Marklow| 31|         6|
|   Rudie Kirkbride| 38|        12|
|      Gerik Kilrow| 41|        15|
|     Ileane Ablott| 31|        11|
|   Anthony Selland| 35|         8|
|  Jock Duckinfield| 43|         5|
|     Temple Latour| 37|        14|
|   Delphinia Arnet| 41|        10|
|  Langston Izakson| 43|         7|
|      Doria Figura| 30|        12|
|      Mellie Eyles| 32|        14|
|      Almire Bertl| 37|         8|
| Rollins Rignoldes| 30|         7|
|Jacklin Champerlen| 32|         9|
| Gretta Sprackling| 33|        11|
|     Gill Edgerton| 38|         5|
|  Jaquith Austwick| 39|        13|
|     Bartie Edwins| 40|        15|
|     Maurine Frude| 33|         8|
|    Tibold Norwell| 37|         9|
+------------------+---+----------+
only showing top 20 rows

