### Inferred and explicit schemas

In [1]:
from pyspark import SparkContext

In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# constructing a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate()
sc

In [3]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or pick up the existing one.
spark = (
    SparkSession.builder
    .appName("Inferred and explicit schemas")
    .getOrCreate()
)

In [4]:
from pyspark.sql.types import Row

#### Inferring schema

#### Load the text dataset

Each line of the dataset has the format: employee name, employee id, age, year of joining (`date_of_joining`), gender.

In [5]:
# Read the text file into an RDD; each element is one raw line of the file.
records = sc.textFile("../datasets/employees.txt")

# Bring all lines back to the driver to inspect them.
records.collect()

['William,1103024456,32,2000,Male',
 'Leonara,1102024115,33,2004,Female',
 'Thomas,1406068403,29,2009,Male',
 'Elisa,1011022863,31,1999,Female',
 'Michael,1501072311,49,1994,Male']

In [6]:
# Evaluating the RDD itself displays only its description (source path and
# lineage), not the data it contains.
records

../datasets/employees.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [6]:
# Break every comma-separated line into a list of its string fields.
records_formatted = records.map(lambda line: line.split(","))

records_formatted.collect()

[['William', '1103024456', '32', '2000', 'Male'],
 ['Leonara', '1102024115', '33', '2004', 'Female'],
 ['Thomas', '1406068403', '29', '2009', 'Male'],
 ['Elisa', '1011022863', '31', '1999', 'Female'],
 ['Michael', '1501072311', '49', '1994', 'Male']]

In [7]:
# Wrap each list of string fields in a Row with named columns, casting the
# numeric fields (id, age, date_of_joining) to int so they are not kept as text.
employees = records_formatted.map(
    lambda rec: Row(
        employee_name=rec[0],
        id=int(rec[1]),
        age=int(rec[2]),
        date_of_joining=int(rec[3]),
        gender=rec[4],
    )
)

In [8]:
# In the recorded output the Row fields are listed alphabetically rather than
# in the order the keyword arguments were written.
employees.collect()

[Row(age=32, date_of_joining=2000, employee_name='William', gender='Male', id=1103024456),
 Row(age=33, date_of_joining=2004, employee_name='Leonara', gender='Female', id=1102024115),
 Row(age=29, date_of_joining=2009, employee_name='Thomas', gender='Male', id=1406068403),
 Row(age=31, date_of_joining=1999, employee_name='Elisa', gender='Female', id=1011022863),
 Row(age=49, date_of_joining=1994, employee_name='Michael', gender='Male', id=1501072311)]

In [9]:
# No schema is passed here, so the column names and types are inferred
# from the Row objects.
employees_df_imp = spark.createDataFrame(employees)

# Register the DataFrame under a view name so it can be queried via spark.sql.
employees_df_imp.createOrReplaceTempView("employees_view")

In [10]:
# Column names of the inferred-schema DataFrame.
employees_df_imp.columns

['age', 'date_of_joining', 'employee_name', 'gender', 'id']

In [11]:
# Inspect the inferred types: the int fields came out as LongType and the
# text fields as StringType (see the output below).
employees_df_imp.schema

StructType(List(StructField(age,LongType,true),StructField(date_of_joining,LongType,true),StructField(employee_name,StringType,true),StructField(gender,StringType,true),StructField(id,LongType,true)))

In [12]:
# Query the temp view registered from employees_df_imp and print the rows.
spark.sql("SELECT * FROM employees_view").show()

+---+---------------+-------------+------+----------+
|age|date_of_joining|employee_name|gender|        id|
+---+---------------+-------------+------+----------+
| 32|           2000|      William|  Male|1103024456|
| 33|           2004|      Leonara|Female|1102024115|
| 29|           2009|       Thomas|  Male|1406068403|
| 31|           1999|        Elisa|Female|1011022863|
| 49|           1994|      Michael|  Male|1501072311|
+---+---------------+-------------+------+----------+



#### Explicit schema

In [13]:
# Reminder of the starting point: every field is still a string, in the order
# (employee_name, id, age, date_of_joining, gender).
records_formatted.collect()

[['William', '1103024456', '32', '2000', 'Male'],
 ['Leonara', '1102024115', '33', '2004', 'Female'],
 ['Thomas', '1406068403', '29', '2009', 'Male'],
 ['Elisa', '1011022863', '31', '1999', 'Female'],
 ['Michael', '1501072311', '49', '1994', 'Male']]

In [14]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [15]:
# Column definitions for the explicit schema: (name, type) pairs, all nullable.
# NOTE(review): this column order differs from the field order inside
# records_formatted (employee_name, id, age, date_of_joining, gender), so the
# data must be reordered to match before it is paired with this schema.
fields = [
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('id', StringType()),
        ('employee_name', StringType()),
        ('gender', StringType()),
        ('age', IntegerType()),
        ('date_of_joining', IntegerType()),
    ]
]

In [16]:
# Combine the field definitions into the StructType used as the DataFrame schema.
explicit_schema = StructType(fields)

In [17]:
# records_formatted holds lists of strings ordered as
# (employee_name, id, age, date_of_joining, gender), whereas explicit_schema
# expects (id, employee_name, gender, age, date_of_joining) with integer
# age and date_of_joining. Reorder and cast each record so it lines up with
# the schema; passed as-is, the values land in the wrong columns and the rows
# fail schema verification as soon as an action runs on the DataFrame.
records_typed = records_formatted.map(
    lambda rec: (rec[1], rec[0], rec[4], int(rec[2]), int(rec[3]))
)

employees_df_exp = spark.createDataFrame(records_typed,
                                         explicit_schema)

In [18]:
# Column names follow the order given in explicit_schema.
employees_df_exp.columns

['id', 'employee_name', 'gender', 'age', 'date_of_joining']

In [19]:
# The schema is exactly what was declared: IntegerType for age and
# date_of_joining, StringType for the rest (see the output below).
employees_df_exp.schema

StructType(List(StructField(id,StringType,true),StructField(employee_name,StringType,true),StructField(gender,StringType,true),StructField(age,IntegerType,true),StructField(date_of_joining,IntegerType,true)))

In [20]:
# NOTE(review): employees_view was registered from employees_df_imp (the
# inferred-schema DataFrame), so this repeats the earlier query and does not
# display employees_df_exp. TODO: show the explicit-schema DataFrame here instead.
spark.sql("SELECT * FROM employees_view").show()

+---+---------------+-------------+------+----------+
|age|date_of_joining|employee_name|gender|        id|
+---+---------------+-------------+------+----------+
| 32|           2000|      William|  Male|1103024456|
| 33|           2004|      Leonara|Female|1102024115|
| 29|           2009|       Thomas|  Male|1406068403|
| 31|           1999|        Elisa|Female|1011022863|
| 49|           1994|      Michael|  Male|1501072311|
+---+---------------+-------------+------+----------+

