### Topics
* PySpark DataFrame
* Reading the dataset
* Checking the data types of the columns
* Using `describe()`, similar to pandas
* Adding columns
* Dropping columns
* Renaming columns

In [71]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [72]:
from pyspark.sql import SparkSession

In [73]:
# Build the Spark session used by every cell below; the application
# name is 'Dataframe'.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [74]:
# Display the session object to check that it was created.
spark

In [75]:
## read the dataset
# Keep the read and the display as separate statements: show() prints the
# table and returns None, so chaining it onto the assignment would store
# None in df_pyspark instead of the DataFrame.
df_pyspark = spark.read.option('header', 'true').csv('file2.csv')
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|    ali| 20|         2|
|   reza| 25|         5|
|shahram| 32|         7|
+-------+---+----------+



In [76]:
# Load file2.csv, treating its first line as the column header.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('file2.csv')
)

In [77]:
## check the schema
# printSchema() prints each column's name, type and nullability as a tree.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- experience: string (nullable = true)



In [78]:
## Note: every column above was read as string -- without inferSchema the numeric columns are not typed as integer

In [79]:
# Re-read the CSV with inferSchema=True so that column types are
# inferred instead of every column being read as string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('file2.csv', inferSchema=True)
)

In [80]:
## check the schema again
# With inferSchema=True, age and experience are now reported as integer.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [81]:
# Same load as before, with both settings passed as keyword
# arguments to csv() instead of through option().
df_pyspark = spark.read.csv(
    'file2.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|    ali| 20|         2|
|   reza| 25|         5|
|shahram| 32|         7|
+-------+---+----------+



In [82]:
# The reader returns a pyspark.sql DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [83]:
# Next: selecting columns

In [84]:
# head(3) returns the first three rows as a list of Row objects,
# not as a DataFrame.
df_pyspark.head(3)

[Row(name='ali', age=20, experience=2),
 Row(name='reza', age=25, experience=5),
 Row(name='shahram', age=32, experience=7)]

In [85]:
# show() prints the rows as a text table.
df_pyspark.show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|    ali| 20|         2|
|   reza| 25|         5|
|shahram| 32|         7|
+-------+---+----------+



In [86]:
# select() returns a new DataFrame; without show() only its column
# summary is displayed, not the rows.
# NOTE(review): 'Name' resolves to the 'name' column in this session, so
# the lookup appears to be case-insensitive -- confirm the session setting.
df_pyspark.select('Name')

DataFrame[Name: string]

In [87]:
# Project the single 'Name' column and print its rows.
(
    df_pyspark
    .select('Name')
    .show()
)

+-------+
|   Name|
+-------+
|    ali|
|   reza|
|shahram|
+-------+



In [88]:
# A single-column selection is still a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [89]:
# Pick two columns: select() accepts several column names and returns
# a DataFrame containing only those columns.
df_pyspark.select('Name','Experience')

DataFrame[Name: string, Experience: int]

In [90]:
# Project the 'Name' and 'Experience' columns and print their rows.
(
    df_pyspark
    .select('Name', 'Experience')
    .show()
)

+-------+----------+
|   Name|Experience|
+-------+----------+
|    ali|         2|
|   reza|         5|
|shahram|         7|
+-------+----------+



In [91]:
# Check the data types: dtypes is a list of (column name, type name) tuples.
df_pyspark.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [92]:
# describe() returns a DataFrame of summary statistics; every column in it,
# including the numeric ones, is typed as string.
df_pyspark.describe()

DataFrame[summary: string, name: string, age: string, experience: string]

In [93]:
# Print the summary statistics (count, mean, stddev, min, max) per column.
(
    df_pyspark
    .describe()
    .show()
)

+-------+-------+------------------+-----------------+
|summary|   name|               age|       experience|
+-------+-------+------------------+-----------------+
|  count|      3|                 3|                3|
|   mean|   null|25.666666666666668|4.666666666666667|
| stddev|   null| 6.027713773341708|2.516611478423583|
|    min|    ali|                20|                2|
|    max|shahram|                32|                7|
+-------+-------+------------------+-----------------+



In [94]:
# Add a derived column holding each row's experience plus 2.
# withColumn() returns a new DataFrame, kept here under its own name.
new_data_Frame = df_pyspark.withColumn(
    'Experience after 2 years',
    df_pyspark['Experience'] + 2,
)

In [95]:
# Print the frame that includes the added 'Experience after 2 years' column.
new_data_Frame.show()

+-------+---+----------+------------------------+
|   name|age|experience|Experience after 2 years|
+-------+---+----------+------------------------+
|    ali| 20|         2|                       4|
|   reza| 25|         5|                       7|
|shahram| 32|         7|                       9|
+-------+---+----------+------------------------+



In [96]:
## drop the columns
# Drop from new_data_Frame, the frame that actually holds the added column.
# df_pyspark never had it, and drop() silently ignores unknown column names,
# so dropping it there would demonstrate nothing.
new_data_Frame.drop('Experience after 2 years').show()

+-------+---+----------+
|   name|age|experience|
+-------+---+----------+
|    ali| 20|         2|
|   reza| 25|         5|
|shahram| 32|         7|
+-------+---+----------+



In [97]:
# Rename the 'name' column to 'New_Name' and print the result.
# The renamed frame is only displayed; it is not assigned to a variable.
(
    df_pyspark
    .withColumnRenamed('name', 'New_Name')
    .show()
)

+--------+---+----------+
|New_Name|age|experience|
+--------+---+----------+
|     ali| 20|         2|
|    reza| 25|         5|
| shahram| 32|         7|
+--------+---+----------+

