In [76]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, split

In [2]:
spark = SparkSession.builder.appName('Dataframes').master('local[*]').getOrCreate()

In [14]:
schema = StructType(
    [
        StructField('id',IntegerType(),False),
        StructField('name',StringType(),True),
        StructField('dept',StringType(),True),
    ]
)

In [15]:
create table employee (
    id int NOT NULL,
    name string,
    dept string
)

SyntaxError: invalid syntax (4195466400.py, line 1)

In [20]:
d = [
    {'id':1,'name':'anna','dept':'cse'},
    {'id':2,'name':'john','dept':'cse'},
    {'id':3,'name':'bob','dept':'cse'},
    {'id':4,'name':'alice','dept':'cse'}
]

In [24]:
spark.createDataFrame(d).show()

+----+---+-----+
|dept| id| name|
+----+---+-----+
| cse|  1| anna|
| cse|  2| john|
| cse|  3|  bob|
| cse|  4|alice|
+----+---+-----+



In [29]:
spark.createDataFrame([1,2,3]).show()

TypeError: SparkSession.createDataFrame() got an unexpected keyword argument 'columns'

# reading

In [31]:
spark.read.csv('HR-Dataset/core_dataset.csv',header=True).show()

+--------------------+---------------+-----+----------+----------+---+------+-----------+-------------------+---------------+--------------------+------------+-------------------+--------------------+--------------------+----------------+--------------------+--------+------------------+--------------------+--------------------+
|       Employee Name|Employee Number|State|       Zip|       DOB|Age|   Sex|MaritalDesc|        CitizenDesc|Hispanic/Latino|            RaceDesc|Date of Hire|Date of Termination|     Reason For Term|   Employment Status|      Department|            Position|Pay Rate|      Manager Name|     Employee Source|   Performance Score|
+--------------------+---------------+-----+----------+----------+---+------+-----------+-------------------+---------------+--------------------+------------+-------------------+--------------------+--------------------+----------------+--------------------+--------+------------------+--------------------+--------------------+
|         

In [43]:
spark.read.option('header','true').csv('HR-Dataset/core_dataset.csv',inferSchema=True).dtypes

[('Employee Name', 'string'),
 ('Employee Number', 'int'),
 ('State', 'string'),
 ('Zip', 'double'),
 ('DOB', 'string'),
 ('Age', 'int'),
 ('Sex', 'string'),
 ('MaritalDesc', 'string'),
 ('CitizenDesc', 'string'),
 ('Hispanic/Latino', 'string'),
 ('RaceDesc', 'string'),
 ('Date of Hire', 'string'),
 ('Date of Termination', 'string'),
 ('Reason For Term', 'string'),
 ('Employment Status', 'string'),
 ('Department', 'string'),
 ('Position', 'string'),
 ('Pay Rate', 'double'),
 ('Manager Name', 'string'),
 ('Employee Source', 'string'),
 ('Performance Score', 'string')]

In [35]:
spark.read.option('header','true').format('csv').load('./HR-Dataset/core_dataset.csv').show()

+--------------------+---------------+-----+----------+----------+---+------+-----------+-------------------+---------------+--------------------+------------+-------------------+--------------------+--------------------+----------------+--------------------+--------+------------------+--------------------+--------------------+
|       Employee Name|Employee Number|State|       Zip|       DOB|Age|   Sex|MaritalDesc|        CitizenDesc|Hispanic/Latino|            RaceDesc|Date of Hire|Date of Termination|     Reason For Term|   Employment Status|      Department|            Position|Pay Rate|      Manager Name|     Employee Source|   Performance Score|
+--------------------+---------------+-----+----------+----------+---+------+-----------+-------------------+---------------+--------------------+------------+-------------------+--------------------+--------------------+----------------+--------------------+--------+------------------+--------------------+--------------------+
|         

In [44]:
spark.read.option('header','true').option('inferSchema','true').format('csv').load('./HR-Dataset/core_dataset.csv').dtypes

[('Employee Name', 'string'),
 ('Employee Number', 'int'),
 ('State', 'string'),
 ('Zip', 'double'),
 ('DOB', 'string'),
 ('Age', 'int'),
 ('Sex', 'string'),
 ('MaritalDesc', 'string'),
 ('CitizenDesc', 'string'),
 ('Hispanic/Latino', 'string'),
 ('RaceDesc', 'string'),
 ('Date of Hire', 'string'),
 ('Date of Termination', 'string'),
 ('Reason For Term', 'string'),
 ('Employment Status', 'string'),
 ('Department', 'string'),
 ('Position', 'string'),
 ('Pay Rate', 'double'),
 ('Manager Name', 'string'),
 ('Employee Source', 'string'),
 ('Performance Score', 'string')]

# DataFrame Operations

In [85]:
emp_df = spark.read.option('header','true').option('inferSchema','true').format('csv').load('./HR-Dataset/core_dataset.csv')

In [55]:
emp_df.count()

302

In [59]:
# where & filter
emp_df.where(col('State')=='TX').count()

3

In [58]:
emp_df.where(col('State')!='TX').count()

298

In [62]:
emp_df.filter(col('State')=='TX').show()

+---------------+---------------+-----+-------+---------+---+----+-----------+-------------------+---------------+--------------------+------------+-------------------+--------------------+-----------------+----------+--------------------+--------+-------------+------------------+--------------------+
|  Employee Name|Employee Number|State|    Zip|      DOB|Age| Sex|MaritalDesc|        CitizenDesc|Hispanic/Latino|            RaceDesc|Date of Hire|Date of Termination|     Reason For Term|Employment Status|Department|            Position|Pay Rate| Manager Name|   Employee Source|   Performance Score|
+---------------+---------------+-----+-------+---------+---+----+-----------+-------------------+---------------+--------------------+------------+-------------------+--------------------+-----------------+----------+--------------------+--------+-------------+------------------+--------------------+
| Murray, Thomas|     1406068403|   TX|78230.0| 7/4/1988| 29|Male|   Divorced|         US C

In [75]:
# select count(*) from emp where Sex='Male' and Age between 25 and 30 and MaritalDesc in ('Single', 'Seperated', 'Divorced')
emp_df.where( 
    (col('Sex')=='Male') & 
    (col('Age')<=30) & 
    (col('Age')>=25) & 
    ((col('MaritalDesc')=='Single') | (col('MaritalDesc')=='Seperated') | (col('MaritalDesc')=='Divorced'))
).count()

11

### withColumn

In [87]:
emp_df.withColumn('First_name', split(col('Employee Name'),', ').getItem(0)).\
withColumn('Last_name', split(col('Employee Name'),', ').getItem(1)).\
withColumn('new_pay_rate', col('Pay Rate')*1.1 ).\
select('First_name','Last_name','new_pay_rate','Department').show()

+------------+----------+------------------+----------------+
|  First_name| Last_name|      new_pay_rate|      Department|
+------------+----------+------------------+----------------+
|       Brown|       Mia|             31.35|   Admin Offices|
|   LaRotonda| William  |              25.3|   Admin Offices|
|      Steans|  Tyrone  |31.900000000000002|   Admin Offices|
|      Howard|   Estelle|23.650000000000002|   Admin Offices|
|       Singh|      Nan |            18.216|   Admin Offices|
|       Smith| Leigh Ann|             22.55|   Admin Offices|
|     LeBlanc|Brandon  R| 60.50000000000001|   Admin Offices|
|       Quinn|      Sean| 60.50000000000001|   Admin Offices|
|    Boutwell|   Bonalyn| 38.44500000000001|   Admin Offices|
|Foster-Baker|       Amy| 38.44500000000001|   Admin Offices|
|        King|     Janet|              88.0|Executive Office|
|      Zamora|  Jennifer|              71.5|           IT/IS|
|      Becker|     Renee|47.300000000000004|           IT/IS|
|       

In [88]:
emp_df = emp_df.withColumn('First_name', split(col('Employee Name'),', ').getItem(0)).\
withColumn('Last_name', split(col('Employee Name'),', ').getItem(1)).\
withColumn('new_pay_rate', col('Pay Rate')*1.1 ).\
select('First_name','Last_name','new_pay_rate','Department')

In [92]:
emp_df.write.format('csv').mode('append').save('emp_df.csv')

In [93]:
spark.read.csv('emp_df.csv').count()

604

In [96]:
emp_df.write.mode('overwrite').csv('emp_df.csv')

In [95]:
spark.read.csv('emp_df.csv').count()

302