<a href="https://colab.research.google.com/github/saurater/ciencia_de_dados_pyspark/blob/main/PySpark_Tutorial_Part_2_Dataframe_Columns_Operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark - Tutorial - Part II - Dataframe Columns Operations
## Sam Faraday
June 2022

## Index

1. PySpark Dataframe
2. Reading the Dataset
3. Checking the Datatypes of the Columns (Schema)
4. Check Describe option similar to Pandas
5. Adding Columns
6. Dropping Columns
7. Renaming Columns


# 1. Installing PySpark

In [None]:
pip install pyspark

# 2. Creating the Test2 Dataset

In [3]:
import pandas as pd

In [4]:
# Toy records used throughout the tutorial: one key per column,
# one list entry per row.
data = {
    'Name': ['Tom', 'Nick', 'Krish', 'Jack'],
    'Age': [20, 21, 19, 18],
    'Experience': [12, 10, 8, 4],
}

In [5]:
# Build a pandas DataFrame from the dict: each key becomes a column name
# and each list supplies that column's values.
df = pd.DataFrame(data)

In [6]:
# Write the frame to test2.csv for Spark to read later; index=False keeps the
# pandas row index out of the file so only the three data columns are saved.
df.to_csv('test2.csv', index=False)

# 3. Initializing PySpark

In [7]:
from pyspark.sql import SparkSession

In [8]:
# Obtain a SparkSession (reusing an existing one if present) whose
# application name is "Dataframe".
spark = (
    SparkSession.builder
    .appName("Dataframe")
    .getOrCreate()
)

In [9]:
# Bare expression: the notebook displays the session object's representation.
spark

# 4. Reading the Dataset

In [10]:
# Load test2.csv, treating its first line as the header row. No schema
# inference is requested here, so every column is read as a string.
df_spark = (
    spark.read
    .option('header', 'true')
    .csv('test2.csv')
)
df_spark

DataFrame[Name: string, Age: string, Experience: string]

In [11]:
# Confirm the reader returned a PySpark DataFrame (not a pandas one).
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [12]:
# Print the rows as a plain-text table.
df_spark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|  Tom| 20|        12|
| Nick| 21|        10|
|Krish| 19|         8|
| Jack| 18|         4|
+-----+---+----------+



# 5. Checking the Schema

In [13]:
# Print each column's name, type and nullability. Without schema inference
# every column, including Age and Experience, is reported as string.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



# 6. Inferring the Schema

In [14]:
# Re-read the same file, this time asking Spark to infer column types from
# the data instead of defaulting everything to string.
df_spark = (
    spark.read
    .option('header', 'true')
    .csv('test2.csv', inferSchema=True)
)

In [15]:
# Re-check the schema: with inferSchema=True, Age and Experience are now
# integer columns while Name stays a string.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [16]:
# The re-read object is still a PySpark DataFrame.
type(df_spark)

pyspark.sql.dataframe.DataFrame

# 7. Selecting and Showing Columns

In [17]:
# Column names as a plain Python list.
df_spark.columns

['Name', 'Age', 'Experience']

In [18]:
# head(2) returns the first two rows as a list of Row objects,
# not as a DataFrame.
df_spark.head(2)

[Row(Name='Tom', Age=20, Experience=12),
 Row(Name='Nick', Age=21, Experience=10)]

In [19]:
# Print the typed DataFrame as a plain-text table.
df_spark.show()

+-----+---+----------+
| Name|Age|Experience|
+-----+---+----------+
|  Tom| 20|        12|
| Nick| 21|        10|
|Krish| 19|         8|
| Jack| 18|         4|
+-----+---+----------+



In [20]:
# select() projects the requested column; show() prints the result, which
# contains only the Name column.
df_spark.select("Name").show()

+-----+
| Name|
+-----+
|  Tom|
| Nick|
|Krish|
| Jack|
+-----+



In [21]:
# Several column names can be passed to select() at once.
df_spark.select("Name", "Age").show()

+-----+---+
| Name|Age|
+-----+---+
|  Tom| 20|
| Nick| 21|
|Krish| 19|
| Jack| 18|
+-----+---+



In [22]:
# dtypes gives a list of (column name, type name) pairs.
df_spark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

# 8. Describing the dataframe

In [23]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# The string column Name gets null for mean and stddev.
df_spark.describe().show()

+-------+----+------------------+-----------------+
|summary|Name|               Age|       Experience|
+-------+----+------------------+-----------------+
|  count|   4|                 4|                4|
|   mean|null|              19.5|              8.5|
| stddev|null|1.2909944487358056|3.415650255319866|
|    min|Jack|                18|                4|
|    max| Tom|                21|               12|
+-------+----+------------------+-----------------+



# 8. Adding Columns

In [24]:
# Add a derived column: each row's Experience value plus 2.
df_spark = df_spark.withColumn(
    'Experience after 2 years',
    df_spark['Experience'] + 2,
)

In [32]:
# Show the DataFrame after adding the derived column.
df_spark.show()

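+-----+---+----------+------------------------+
| Name|Age|Experience|Experience after 2 years|
+-----+---+----------+------------------------+
|  Tom| 20|        12|                      14|
| Nick| 21|        10|                      12|
|Krish| 19|         8|                      10|
| Jack| 18|         4|                       6|
+-----+---+----------+------------------------+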



# 9. Removing Columns

In [26]:
# Remove the derived column again; drop() returns a new DataFrame, so the
# result is rebound to df_spark.
df_spark = df_spark.drop('Experience after 2 years')


# 10. Renaming Columns

In [37]:
# withColumnRenamed(existing, new): rebind df_spark to a DataFrame in which
# the 'Name' column is called 'New Name', then print it to confirm.
df_spark = df_spark.withColumnRenamed('Name', 'New Name')
df_spark.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|     Tom| 20|        12|
|    Nick| 21|        10|
|   Krish| 19|         8|
|    Jack| 18|         4|
+--------+---+----------+

