# Notebook Content
* PySpark DataFrame
* Reading Dataset
* Checking Schema
* Selecting Columns and Indexing
* Describe Function - Similar to pandas describe
* Adding Columns
* Dropping Columns
* Renaming Columns

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build (or fetch the already-running) SparkSession for this notebook,
# registered under the application name 'DataFrame'.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

23/02/15 01:17:33 WARN Utils: Your hostname, Sujeets-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.29.122 instead (on interface en0)
23/02/15 01:17:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/15 01:17:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Evaluate the session object as the cell's last expression so the notebook displays it.
spark

In [7]:
# Read the dataset with the reader's default options.
# As the output below shows, the header row is loaded as a data row and the
# columns receive the generated names _c0, _c1, _c2.
df_spark = spark.read.csv('test2.csv')
df_spark.show()

                                                                                

+------+---+----------+
|   _c0|_c1|       _c2|
+------+---+----------+
|  Name|Age|Experience|
|Sujeet| 22|        10|
| Ajeet| 18|         8|
|   Cam| 14|         4|
+------+---+----------+



In [8]:
# Re-read the file with header=True so the first row supplies the column names
# (Name, Age, Experience) instead of being treated as data.
df_spark = spark.read.csv(
    'test2.csv',
    header=True,
)
df_spark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|Sujeet| 22|        10|
| Ajeet| 18|         8|
|   Cam| 14|         4|
+------+---+----------+



In [9]:
# Print the schema: with no schema inference requested, every column
# (including Age and Experience) is reported as string.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [10]:
# Re-read once more, additionally asking Spark to infer column types from the data.
df_spark = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [11]:
# Show at most the first 5 rows (this file only contains 3).
df_spark.show(5)

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|Sujeet| 22|        10|
| Ajeet| 18|         8|
|   Cam| 14|         4|
+------+---+----------+



In [12]:
# With inferSchema=True, Age and Experience are now reported as integer.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [13]:
# Another way of reading the data: configure the reader through chained options,
# then load the file. inferSchema makes Spark detect each column's datatype
# instead of reading every column as a string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', True)
    .csv('test2.csv')
)
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|Sujeet| 22|        10|
| Ajeet| 18|         8|
|   Cam| 14|         4|
+------+---+----------+



In [14]:
# Confirm that the reader returned a PySpark DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [15]:
# List the column names as a plain Python list.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [16]:
# Fetch the first 3 rows; head() returns a list of Row objects rather than printing a table.
df_pyspark.head(3)

[Row(Name='Sujeet', Age=22, Experience=10),
 Row(Name='Ajeet', Age=18, Experience=8),
 Row(Name='Cam', Age=14, Experience=4)]

In [17]:
# Select a single column by name and print it as a table.
df_pyspark.select('Name').show()

+------+
|  Name|
+------+
|Sujeet|
| Ajeet|
|   Cam|
+------+



In [18]:
# Select several columns at once and print the resulting two-column table.
df_pyspark.select('Name', 'Age').show()

+------+---+
|  Name|Age|
+------+---+
|Sujeet| 22|
| Ajeet| 18|
|   Cam| 14|
+------+---+



In [19]:
# Indexing by column name yields a Column object, not a DataFrame,
# so no table of values is displayed.
df_pyspark['Name']

Column<'Name'>

In [20]:
# select() on a column name, by comparison, returns a DataFrame.
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

In [21]:
# Column names paired with their datatypes, as a list of (name, type) tuples.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [22]:
# Summary statistics (count, mean, stddev, min, max) per column, similar to pandas' describe().
# The string column Name has no mean/stddev, so those cells show null.
df_pyspark.describe().show()

[Stage 13:>                                                         (0 + 1) / 1]

+-------+------+----+-----------------+
|summary|  Name| Age|       Experience|
+-------+------+----+-----------------+
|  count|     3|   3|                3|
|   mean|  null|18.0|7.333333333333333|
| stddev|  null| 4.0|3.055050463303893|
|    min| Ajeet|  14|                4|
|    max|Sujeet|  22|               10|
+-------+------+----+-----------------+



                                                                                

In [23]:
# Adding columns to the DataFrame

In [24]:
# Add a derived column holding each row's Experience value plus 2,
# and rebind df_pyspark to the resulting DataFrame.
df_pyspark = df_pyspark.withColumn(
    'Experience+2',
    df_pyspark['Experience'] + 2,
)

In [25]:
# Display the DataFrame, which now includes the Experience+2 column.
df_pyspark.show()

+------+---+----------+------------+
|  Name|Age|Experience|Experience+2|
+------+---+----------+------------+
|Sujeet| 22|        10|          12|
| Ajeet| 18|         8|          10|
|   Cam| 14|         4|           6|
+------+---+----------+------------+



In [26]:
# Drop the 'Experience+2' column added earlier; the result of drop() is
# assigned back to df_pyspark.
df_pyspark = df_pyspark.drop('Experience+2')

In [27]:
# Display the DataFrame to confirm the column has been removed.
df_pyspark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|Sujeet| 22|        10|
| Ajeet| 18|         8|
|   Cam| 14|         4|
+------+---+----------+



In [28]:
# Rename the 'Name' column to 'New Name', rebinding df_pyspark to the result.
df_pyspark = df_pyspark.withColumnRenamed(
    existing='Name',
    new='New Name',
)

In [29]:
# Display the DataFrame to confirm the column now appears as 'New Name'.
df_pyspark.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|  Sujeet| 22|        10|
|   Ajeet| 18|         8|
|     Cam| 14|         4|
+--------+---+----------+

