### PySpark DataFrames - Part 1

In [66]:
import pyspark

In [67]:
from pyspark.sql import SparkSession

In [68]:
# Create the Spark session for this notebook (or reuse one that already exists),
# registered under the application name 'DataFrame'.
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)

In [69]:
# Display the session object to confirm it was created
spark

In [70]:
# Reading the dataset with default options: no header row is assumed and no types
# are inferred, so columns are auto-named (_c0, _c1, ...) and all read as string.
df_pyspark = spark.read.csv('Data1.csv')

In [71]:
# Without the header option the column names are not random: Spark auto-generates
# positional names _c0, _c1, _c2, ...
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string]

In [72]:

# Print the rows; note the real header line (Name, Age, Experience) shows up as a
# data row because the file was read without the header option.
df_pyspark.show()

+-------+---+----------+
|    _c0|_c1|       _c2|
+-------+---+----------+
|   Name|Age|Experience|
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [73]:
### Now replace the auto-generated headings with the original column names by using the 'header' option

In [74]:
# Treat the first line of the CSV as column names instead of data
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Data1.csv')
)

In [75]:
# Column names now come from the CSV header; every column is still typed as string
df_pyspark

DataFrame[Name: string, Age: string, Experience: string]

In [76]:
# Check the object's class: this is a Spark DataFrame (pyspark.sql.dataframe.DataFrame)
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [77]:
# Print the schema as a tree: column name, data type and nullability
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [78]:
# Re-read with the header row AND schema inference, so numeric columns are typed
# as numbers rather than left as strings.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv('Data1.csv', inferSchema=True)
)

In [79]:
# With inferSchema enabled, Age and Experience are now int instead of string
df_pyspark

DataFrame[Name: string, Age: int, Experience: int]

In [80]:
# The printed rows look the same as before; only the column types have changed
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [81]:
# The schema now reports integer for Age and Experience
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [82]:
# Same read, expressed entirely through csv() keyword arguments
df_pyspark = spark.read.csv(
    'Data1.csv',
    header=True,
    inferSchema=True,
)

In [83]:
# Verify the keyword-argument form of the read produces the same rows
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [84]:
# ...and the same inferred schema (string, integer, integer)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [85]:
# Column names as a plain Python list of strings
df_pyspark.columns

['Name', 'Age', 'Experience']

In [86]:
# The column list supports ordinary list slicing; take the first three names
df_pyspark.columns[:3]

['Name', 'Age', 'Experience']

In [87]:
# Full table, for reference before selecting individual columns
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [88]:
# select() returns a new DataFrame holding only the requested column;
# the bare expression displays its schema, not its rows
df_pyspark.select('Age')

DataFrame[Age: int]

In [89]:
# Call show() on the selection to actually print the values
df_pyspark.select('Age').show()

+---+
|Age|
+---+
| 24|
| 23|
| 22|
| 21|
| 20|
+---+



In [90]:
# Data type of each column, as a list of (column name, type name) pairs
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [91]:
# describe() returns a DataFrame of summary statistics; note that every column
# of the result (including the numeric ones) is typed as string
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [92]:
# Summary statistics for the dataset: count, mean, stddev, min and max per column.
# mean/stddev are null for the string column Name.
df_pyspark.describe().show()

+-------+-----+------------------+------------------+
|summary| Name|               Age|        Experience|
+-------+-----+------------------+------------------+
|  count|    5|                 5|                 5|
|   mean| null|              22.0|               5.4|
| stddev| null|1.5811388300841898|3.5071355833500366|
|    min| aman|                20|                 1|
|    max|shiva|                24|                10|
+-------+-----+------------------+------------------+



In [93]:
# Use select() to pick specific columns; several names can be requested at once
df_pyspark.select('Name', 'Age', 'Experience').show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [94]:
# Adding a column to the DataFrame.
# withColumn() returns a NEW DataFrame; the result is not assigned here, so
# df_pyspark itself is left unchanged. For demonstration the new 'Company_Name'
# column is simply a copy of 'Experience'.
(
    df_pyspark
    .withColumn('Company_Name', df_pyspark['Experience'])
    .show()
)

+-------+---+----------+------------+
|   Name|Age|Experience|Company_Name|
+-------+---+----------+------------+
|  shiva| 24|         3|           3|
|   aman| 23|         6|           6|
|  hydra| 22|         7|           7|
|kakashi| 21|        10|          10|
| naruto| 20|         1|           1|
+-------+---+----------+------------+



In [95]:
# Dropping a column.
# DataFrames are immutable: the earlier withColumn(...) result was never assigned,
# so df_pyspark has no 'Company_Name' column and dropping it directly would be a
# silent no-op. Add the column first so that drop() demonstrably removes it.
# drop() also returns a new DataFrame rather than modifying the original.
df_pyspark.withColumn('Company_Name', df_pyspark['Experience']).drop('Company_Name')

DataFrame[Name: string, Age: int, Experience: int]

In [96]:
# df_pyspark still has its original three columns: the add/drop calls above
# returned new DataFrames and were never assigned back
df_pyspark.show()

+-------+---+----------+
|   Name|Age|Experience|
+-------+---+----------+
|  shiva| 24|         3|
|   aman| 23|         6|
|  hydra| 22|         7|
|kakashi| 21|        10|
| naruto| 20|         1|
+-------+---+----------+



In [100]:
# Rename a column.
# withColumnRenamed(existing, new) takes two plain strings. Passing
# df_pyspark['Job_Experience'] as the new name fails because it tries to look up
# a column that does not exist yet.
# Returns a new DataFrame; df_pyspark itself is not modified.
df_pyspark.withColumnRenamed('Experience', 'Job_Experience')

DataFrame[Name: string, Age: int, Job_Experience: int]

In [64]:
# withColumn() needs a Column expression as its second argument, not a str.
# df_pyspark has no 'Job_Experience' column, so apply the rename first and then
# reference the column on the renamed DataFrame.
df_renamed = df_pyspark.withColumnRenamed('Experience', 'Job_Experience')
df_renamed.withColumn('Salary', df_renamed['Job_Experience'])

DataFrame[Name: string, Age: int, Job_Experience: int, Salary: int]