### PySpark DataFrames - Part 1

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the SparkSession used by the rest of the notebook (or reuse one that
# already exists); 'DataFrame' is the application name.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [5]:
# Display the SparkSession object as the cell output.
spark

In [45]:
# Read the dataset with default options. No header option is given, so the
# file's first line (Name, Age, Experience) is loaded as an ordinary data row
# and every column comes back as a string.
df_pyspark = spark.read.csv('Data1.csv')

In [46]:
# Without a header, Spark assigns positional default column names
# (_c0, _c1, _c2) -- they are not random.
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string]

In [47]:

# Print the rows. Note that the real header line (Name, Age, Experience)
# appears as the first data row because no header option was set.
df_pyspark.show()

+-------+---+----------+
|    _c0|_c1|       _c2|
+-------+---+----------+
|   Name|Age|Experience|
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [48]:
### Next, use the 'header' option so the first row of the file supplies the original column names

In [49]:
# Re-read the CSV with the 'header' option turned on, so the first line of the
# file becomes the column names instead of a data row.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Data1.csv')
)

In [50]:
# Column names now come from the file's header row; every column is still
# typed as string.
df_pyspark

DataFrame[Name: string, Age: string, Experience: string]

In [51]:
# Confirm the object is a pyspark.sql DataFrame.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [52]:
# Print the schema as a tree: column name, type and nullability.
# At this point all three columns are strings.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [53]:
# Show the DataFrame's repr again (column names and types only, no rows).
df_pyspark

DataFrame[Name: string, Age: string, Experience: string]

In [54]:
# Print the rows; the header line is now used for column names and is no
# longer part of the data.
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [55]:
# Schema check before enabling inferSchema: Age and Experience are still
# read as strings.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [56]:
# Read the CSV once more: header=True takes the column names from the first
# line, and inferSchema=True lets Spark derive column types from the data
# instead of defaulting everything to string.
df_pyspark = spark.read.csv(
    'Data1.csv',
    header=True,
    inferSchema=True,
)

In [57]:
# The displayed rows look the same as before; inferSchema affects the column
# types, not the printed values.
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [58]:
# With inferSchema=True, Age and Experience are now integer columns;
# Name remains a string.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [59]:
# Column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [22]:
# The column list can be sliced like any list; [0:3] covers all three
# columns of this DataFrame.
df_pyspark.columns[0:3]

['Name', 'Age', 'Experience']

In [23]:
# Display all rows of the DataFrame.
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [24]:
# select returns a new DataFrame; without .show() only its repr
# (column name and type) is displayed, not the values.
df_pyspark.select('Age')

DataFrame[Age: int]

In [60]:
# Chain .show() to print the values of the selected column.
df_pyspark.select('Age').show()

+---+
|Age|
+---+
| 24|
| 23|
| 22|
| 21|
| 20|
+---+



In [61]:
# Data type of each column, as a list of (column name, type string) pairs.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [62]:
# describe() returns another DataFrame: a 'summary' column plus one string
# column per original column. Nothing is printed until .show() is called.
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [63]:
# Summary statistics per column: count, mean, stddev, min and max.
# mean and stddev are null for the string column Name.
df_pyspark.describe().show()

+-------+-----+------------------+------------------+
|summary| Name|               Age|        Experience|
+-------+-----+------------------+------------------+
|  count|    5|                 5|                 5|
|   mean| null|              22.0|               5.4|
| stddev| null|1.5811388300841898|3.5071355833500366|
|    min| aman|                20|                 1|
|    max|shiva|                24|                10|
+-------+-----+------------------+------------------+



In [64]:
# To work with particular columns, pass their names to select (a list works);
# the result is a new DataFrame. Here all three columns are selected.
df_pyspark.select(['Name','Age','Experience'])

DataFrame[Name: string, Age: int, Experience: int]

In [78]:
# Add a column with withColumn(new column name, column expression).
# NOTE(review): 'Company_Name' is filled with a copy of 'Experience', and the
# result is only displayed, not assigned, so df_pyspark itself is unchanged.
(
    df_pyspark
    .withColumn('Company_Name', df_pyspark['Experience'])
    .show()
)

+-------+---+----------+------------+
|   Name|Age|Experience|Company_Name|
+-------+---+----------+------------+
|  shiva| 24|         3|           3|
|   aman| 23|         6|           6|
|  hydra| 22|         7|           7|
|kakashi| 21|        10|          10|
| naruto| 20|         1|           1|
+-------+---+----------+------------+



In [66]:
# Drop a column: drop returns a new DataFrame without the named column.
# NOTE(review): df_pyspark has no 'Company_Name' column (the withColumn result
# was never assigned), and this result is not assigned either, so the cell
# leaves df_pyspark as it was -- confirm whether an assignment was intended.
df_pyspark.drop('Company_Name')

DataFrame[Name: string, Age: int, Experience: int]

In [69]:
# df_pyspark still holds the original three columns.
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [73]:
# Rename a column: withColumnRenamed(existing name, new name).
# The renamed DataFrame is only displayed here; it is not assigned back.
(
    df_pyspark
    .withColumnRenamed('Experience', 'Job_Experience')
    .show()
)

+-------+---+--------------+
|   Name|Age|Job_Experience|
+-------+---+--------------+
|  shiva| 24|             3|
|   aman| 23|             6|
|  hydra| 22|             7|
|kakashi| 21|            10|
| naruto| 20|             1|
+-------+---+--------------+



In [77]:
# The column is still called 'Experience' in df_pyspark, because the renamed
# DataFrame was not assigned to it.
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [82]:
# Another withColumn example: the new 'salary' column is filled with the
# values of 'Age'. The result is displayed only; df_pyspark is not reassigned.
(
    df_pyspark
    .withColumn('salary', df_pyspark['Age'])
    .show()
)

+-------+---+----------+------+
|   Name|Age|Experience|salary|
+-------+---+----------+------+
|  shiva| 24|         3|    24|
|   aman| 23|         6|    23|
|  hydra| 22|         7|    22|
|kakashi| 21|        10|    21|
| naruto| 20|         1|    20|
+-------+---+----------+------+

