**Topics**
- PySpark DataFrame
- Reading the dataset
- Checking the data types of the columns (schema)
- Selecting columns and indexing
- Summary statistics with `describe()`, similar to pandas
- Adding columns
- Dropping columns
- Renaming columns

In [1]:
from pyspark.sql import SparkSession
# Build the Spark session used by every cell below, registered under the
# application name 'dataframe'. The trailing bare `spark` expression makes
# the session object the cell's displayed result.
spark = (
    SparkSession.builder
    .appName('dataframe')
    .getOrCreate()
)
spark

In [5]:
# Load test2.csv into a DataFrame: the 'header' option makes the first row
# supply the column names, and inferSchema=True asks Spark to infer column types.
df_test2 = (
    spark.read
    .option('header', 'true')
    .csv('test2.csv', inferSchema=True)
)

In [6]:
# Print the DataFrame's rows as a text table to confirm the file loaded as expected.
df_test2.show()

+------+---+----------+
|  name|age|experience|
+------+---+----------+
|shayan| 24|        10|
|  lisa| 22|         6|
|  irin| 23|         4|
+------+---+----------+



In [7]:
# Print the schema as a tree: each column's name, data type, and nullability.
df_test2.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [10]:
# Alternative way to load the same file: pass the header and schema-inference
# settings as keyword arguments to csv() instead of chaining .option().
df_test2 = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)
df_test2.show()

+------+---+----------+
|  name|age|experience|
+------+---+----------+
|shayan| 24|        10|
|  lisa| 22|         6|
|  irin| 23|         4|
+------+---+----------+



In [11]:
# Confirm that the loaded object is a PySpark DataFrame (not a pandas one).
type(df_test2)

pyspark.sql.dataframe.DataFrame

In [12]:
# List the column names as a plain Python list of strings.
df_test2.columns

['name', 'age', 'experience']

In [14]:
# head(n) returns the first n rows as a list of Row objects rather than
# printing a formatted table.
df_test2.head(3)

[Row(name='shayan', age=24, experience=10),
 Row(name='lisa', age=22, experience=6),
 Row(name='irin', age=23, experience=4)]

In [15]:
# show() prints a formatted table, in contrast to the list of Row objects
# that head() returns.
df_test2.show()

+------+---+----------+
|  name|age|experience|
+------+---+----------+
|shayan| 24|        10|
|  lisa| 22|         6|
|  irin| 23|         4|
+------+---+----------+



In [19]:
# Select a single column by name; select() returns a new one-column DataFrame.
# The variable is named after the column it actually holds ('experience'),
# so the name no longer suggests it contains the 'name' column.
test2_experience = df_test2.select('experience')
test2_experience.show()

+----------+
|experience|
+----------+
|        10|
|         6|
|         4|
+----------+



In [20]:
# Select several columns at once; the resulting DataFrame keeps only
# 'name' and 'age'.
test2_name_age = df_test2.select('name', 'age')
test2_name_age.show()

+------+---+
|  name|age|
+------+---+
|shayan| 24|
|  lisa| 22|
|  irin| 23|
+------+---+



In [21]:
# dtypes gives (column name, type name) pairs as a Python list — a compact
# alternative to the tree printed by printSchema().
df_test2.dtypes

[('name', 'string'), ('age', 'int'), ('experience', 'int')]

In [22]:
# describe() returns a new DataFrame of summary statistics. Evaluated on its
# own, the notebook shows only that DataFrame's column/type repr, not the
# values; call .show() on the result to see them.
df_test2.describe()

DataFrame[summary: string, name: string, age: string, experience: string]

In [23]:
# Display the summary statistics (count, mean, stddev, min, max) per column.
# For the string column 'name', mean and stddev are null; its min/max are
# presumably based on string ordering — confirm against the PySpark docs.
df_test2.describe().show()

+-------+------+----+------------------+
|summary|  name| age|        experience|
+-------+------+----+------------------+
|  count|     3|   3|                 3|
|   mean|  null|23.0| 6.666666666666667|
| stddev|  null| 1.0|3.0550504633038935|
|    min|  irin|  22|                 4|
|    max|shayan|  24|                10|
+-------+------+----+------------------+



In [26]:
# Add a derived column holding each person's experience plus 2.
# withColumn() returns a new DataFrame; df_test2 itself keeps its three columns.
df_test2_exp2 = df_test2.withColumn(
    'experience after 2 years',
    df_test2.experience + 2,
)
df_test2_exp2.show()

+------+---+----------+------------------------+
|  name|age|experience|experience after 2 years|
+------+---+----------+------------------------+
|shayan| 24|        10|                      12|
|  lisa| 22|         6|                       8|
|  irin| 23|         4|                       6|
+------+---+----------+------------------------+



In [28]:
# Remove the derived column by name; drop() hands back a new DataFrame
# without it, which is stored under a separate variable.
df_test2_noexp2 = df_test2_exp2.drop(
    'experience after 2 years'
)
df_test2_noexp2.show()

+------+---+----------+
|  name|age|experience|
+------+---+----------+
|shayan| 24|        10|
|  lisa| 22|         6|
|  irin| 23|         4|
+------+---+----------+



In [30]:
# Rename a column: withColumnRenamed(existing, new) returns a new DataFrame
# in which 'name' is relabelled 'Name'; the row values are unchanged.
df_test2_namecolchanged = df_test2.withColumnRenamed(
    'name',
    'Name',
)
df_test2_namecolchanged.show()

+------+---+----------+
|  Name|age|experience|
+------+---+----------+
|shayan| 24|        10|
|  lisa| 22|         6|
|  irin| 23|         4|
+------+---+----------+

