<a href="https://colab.research.google.com/github/sahug/pyspark/blob/main/Spark%20-%20PySpark%20-%20Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Spark - PySpark - Basics**

# Install PySpark

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Create SparkSession

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the session used by every cell below: getOrCreate() returns an
# existing session if there is one, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [None]:
# Bare expression: the session object is rendered as this cell's output.
spark

# Read Dataset

In [None]:
# Read the CSV with no options set; the columns come back with the
# default names _c0, _c1, _c2 and every column is typed as string.
df_pyspark = spark.read.csv(
    "/content/movies.csv",
)

In [None]:
# Bare expression: shows the DataFrame's repr (column names and types), not its rows.
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string]

# Read dataset with headers

In [None]:
# Same file, now with the "header" option enabled so the column names
# (movieId, title, genres) are taken from the file instead of _c0.._c2.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("/content/movies.csv")
)

In [None]:
# Bare expression: shows the DataFrame's repr; columns are still all strings here.
df_pyspark

DataFrame[movieId: string, title: string, genres: string]

In [None]:
# Confirm that the reader returned a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

# View the dataset (first 20 rows)

In [None]:
# Keep the DataFrame itself in df_pyspark. show() only prints rows and
# returns None, so assigning its result would replace the DataFrame with None.
df_pyspark = spark.read.option("header", "true").csv("/content/movies.csv")

# Print the first rows as a text table (show() displays 20 rows by default).
df_pyspark.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

# Check schema

In [None]:
# Re-read the file with the header option and print the schema as a tree.
# Without schema inference every column is reported as a string.
df_pyspark = (
    spark.read
    .option("header", "true")
    .csv("/content/movies.csv")
)
df_pyspark.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
# Read once more, this time passing the options as keyword arguments and
# asking Spark to infer the column types from the data.
df_pyspark = spark.read.csv(
    "/content/movies.csv",
    header=True,
    inferSchema=True,
)

# Print the first rows, then the schema of the freshly loaded DataFrame.
df_pyspark.show()
df_pyspark.printSchema()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

# Get rows and columns

In [None]:
# With no argument, head() returns the first row as a single Row object.
df_pyspark.head()

Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy')

In [None]:
# With a count, head() returns a Python list holding that many Row objects.
df_pyspark.head(3)

[Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(movieId=2, title='Jumanji (1995)', genres='Adventure|Children|Fantasy'),
 Row(movieId=3, title='Grumpier Old Men (1995)', genres='Comedy|Romance')]

In [None]:
# select() returns a new DataFrame with just the requested column;
# no rows are printed until show() is called on it.
df_pyspark.select("title")

DataFrame[title: string]

In [None]:
# Indexing by column name yields a Column expression, not a DataFrame.
df_pyspark["title"]

Column<'title'>

In [None]:
# Select the single column and print its first rows as a table.
df_pyspark.select("title").show()

+--------------------+
|               title|
+--------------------+
|    Toy Story (1995)|
|      Jumanji (1995)|
|Grumpier Old Men ...|
|Waiting to Exhale...|
|Father of the Bri...|
|         Heat (1995)|
|      Sabrina (1995)|
| Tom and Huck (1995)|
| Sudden Death (1995)|
|    GoldenEye (1995)|
|American Presiden...|
|Dracula: Dead and...|
|        Balto (1995)|
|        Nixon (1995)|
|Cutthroat Island ...|
|       Casino (1995)|
|Sense and Sensibi...|
|   Four Rooms (1995)|
|Ace Ventura: When...|
|  Money Train (1995)|
+--------------------+
only showing top 20 rows



# Get multiple columns

In [None]:
# Select several columns by passing their names as separate arguments,
# then print the first rows of the result.
df_pyspark.select("movieId", "title").show()

+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
|      6|         Heat (1995)|
|      7|      Sabrina (1995)|
|      8| Tom and Huck (1995)|
|      9| Sudden Death (1995)|
|     10|    GoldenEye (1995)|
|     11|American Presiden...|
|     12|Dracula: Dead and...|
|     13|        Balto (1995)|
|     14|        Nixon (1995)|
|     15|Cutthroat Island ...|
|     16|       Casino (1995)|
|     17|Sense and Sensibi...|
|     18|   Four Rooms (1995)|
|     19|Ace Ventura: When...|
|     20|  Money Train (1995)|
+-------+--------------------+
only showing top 20 rows



# View Datatypes

In [None]:
# dtypes is a list of (column name, type name) pairs.
df_pyspark.dtypes

[('movieId', 'int'), ('title', 'string'), ('genres', 'string')]

# Describe Dataset

In [None]:
# describe() returns a new DataFrame of summary statistics; nothing is printed yet.
df_pyspark.describe()

DataFrame[summary: string, movieId: string, title: string, genres: string]

In [None]:
# Print the summary table: count, mean, stddev, min and max for each column.
df_pyspark.describe().show()

+-------+------------------+--------------------+------------------+
|summary|           movieId|               title|            genres|
+-------+------------------+--------------------+------------------+
|  count|              9742|                9742|              9742|
|   mean|42200.353623485935|                null|              null|
| stddev| 52160.49485443825|                null|              null|
|    min|                 1|"11'09""01 - Sept...|(no genres listed)|
|    max|            193609|À nous la liberté...|           Western|
+-------+------------------+--------------------+------------------+



# Adding Columns

In [None]:
# withColumn() returns a new DataFrame with the extra column. The result
# is only displayed, not assigned, so df_pyspark itself is left unchanged.
df_pyspark.withColumn(
    "Movie ID Plus 1",
    df_pyspark.movieId + 1,
)

DataFrame[movieId: int, title: string, genres: string, Movie ID Plus 1: int]

# Drop Column

In [None]:
# df_pyspark has no "Movie ID Plus 1" column (the withColumn() result was
# never assigned), and drop() silently ignores column names that do not
# exist. Add the column in the same chain so drop() actually removes it.
(
    df_pyspark
    .withColumn("Movie ID Plus 1", df_pyspark["movieId"] + 1)
    .drop("Movie ID Plus 1")
)

DataFrame[movieId: int, title: string, genres: string]

# Rename Columns

In [None]:
# withColumnRenamed() returns a new DataFrame with the column renamed.
# The result is only displayed, not assigned, so df_pyspark keeps "movieId".
df_pyspark.withColumnRenamed("movieId", "New Name")

DataFrame[New Name: int, title: string, genres: string]

In [None]:
# df_pyspark has no "New Name" column (the earlier rename was never
# assigned), and withColumnRenamed() does nothing when the source column
# is missing. Chain both renames so the rename back is really exercised.
(
    df_pyspark
    .withColumnRenamed("movieId", "New Name")
    .withColumnRenamed("New Name", "movieId")
)

DataFrame[movieId: int, title: string, genres: string]