
## Overview

Here is in this notebook you can find how to read a **CSV** file.<br>
Using **inferSchema** and using a **structType**<br>
explore **pyspark.sql.Types** libraries to import while using **structType** and **structField**<br>
This notebook is written in **Python** so the default cell type is Python.

In [0]:
dbutils.fs.ls("/FileStore/tables/")

[FileInfo(path='dbfs:/FileStore/tables/addresses.csv', name='addresses.csv', size=397, modificationTime=1727409570000),
 FileInfo(path='dbfs:/FileStore/tables/baby_names.csv', name='baby_names.csv', size=7447879, modificationTime=1726971369000),
 FileInfo(path='dbfs:/FileStore/tables/basketball.csv', name='basketball.csv', size=1352, modificationTime=1727410075000),
 FileInfo(path='dbfs:/FileStore/tables/biostats1.csv', name='biostats1.csv', size=331, modificationTime=1727467750000),
 FileInfo(path='dbfs:/FileStore/tables/fs_test/', name='fs_test/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/mv_files/', name='mv_files/', size=0, modificationTime=0)]

In [0]:
df=spark.read.format("csv")\
	.option("inferschema", True)\
	.option("sep", ",")\
	.option("header", True)\
	.load("/FileStore/tables/biostats1.csv")

df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)



In [0]:
df1 = spark.read.format("csv").options(inferschema="True", sep=",", header ="True").load("/FileStore/tables/biostats1.csv")

#display(df1)
print(df1.count())
#df1.printSchema()
df1.createOrReplaceTempView("biostats1_infer")
#spark.sql("select * from biostats1_infer").display()
df1.write.mode("overwrite").format("parquet").saveAsTable("biostats1_infer")

18


In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType,DecimalType

schema_defined = StructType([StructField('Name', StringType(), True),
                             StructField('Sex', StringType(), True),
                             StructField('Age', DecimalType(), True),
                             StructField('Height', DecimalType(), True),
                             StructField('Weight', DecimalType(), True)
                             ])

In [0]:
df2 = spark.read.format("csv").schema(schema_defined).options(header="True",sep=",").load("/FileStore/tables/biostats1.csv")

df2.write.mode("overwrite").format("parquet").saveAsTable("biostats1_schemadef")

#df2.printSchema()  
#display(df2)

In [0]:
spark.sql("select * from default.biostats1_schemadef").show()

+----+---+---+------+------+
|Name|Sex|Age|Height|Weight|
+----+---+---+------+------+
|Alex|  M| 41|    74|   170|
|Bert|  M| 42|    68|   166|
|Carl|  M| 32|    70|   155|
|Dave|  M| 39|    72|   167|
|Elly|  F| 30|    66|   124|
|Fran|  F| 33|    66|   115|
|Gwen|  F| 26|    64|   121|
|Hank|  M| 30|    71|   158|
|Ivan|  M| 53|    72|   175|
|Jake|  M| 32|    69|   143|
|Kate|  F| 47|    69|   139|
|Luke|  M| 34|    72|   163|
|Myra|  F| 23|    62|    98|
|Neil|  M| 36|    75|   160|
|Omar|  M| 38|    70|   145|
|Page|  F| 31|    67|   135|
|Quin|  M| 29|    71|   176|
|Ruth|  F| 28|    65|   131|
+----+---+---+------+------+



In [0]:
schema_new = 'Name STRING, Sex STRING, Age INTEGER, Height INTEGER, Weight INTEGER'

In [0]:
df3 = spark.read.format("csv").schema(schema_new).options(header="True",sep=",").load("/FileStore/tables/biostats1.csv")

df3.createOrReplaceTempView("biostats1_schemanew")

In [0]:
%sql
drop table if exists default.biostats1_schemanew

org.apache.spark.sql.catalyst.parser.ParseException: 
[PARSE_SYNTAX_ERROR] Syntax error at or near '%'. SQLSTATE: 42601 (line 2, pos 0)

== SQL ==
drop table if exists default.biostats1_schemanew
%run _sqldf
^^^

	at org.apache.spark.sql.catalyst.parser.ParseException.withCommand(parsers.scala:308)
	at org.apache.spark.sql.catalyst.parser.AbstractParser.parse(parsers.scala:114)
	at org.apache.spark.sql.execution.SparkSqlParser.parse(SparkSqlParser.scala:137)
	at org.apache.spark.sql.catalyst.parser.AbstractSqlParser.parsePlan(AbstractSqlParser.scala:106)
	at com.databricks.sql.parser.DatabricksSqlParser.$anonfun$parsePlan$1(DatabricksSqlParser.scala:80)
	at com.databricks.sql.parser.DatabricksSqlParser.parse(DatabricksSqlParser.scala:101)
	at com.databricks.sql.parser.DatabricksSqlParser.parsePlan(DatabricksSqlParser.scala:77)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$7(SparkSession.scala:952)
	at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:94)
	at org.

In [0]:
spark.sql("create table default.biostats1_schemanew using parquet as select * from biostats1_schemanew")
spark.sql("select * from default.biostats1_schemanew").show()

+----+---+---+------+------+
|Name|Sex|Age|Height|Weight|
+----+---+---+------+------+
|Alex|  M| 41|    74|   170|
|Bert|  M| 42|    68|   166|
|Carl|  M| 32|    70|   155|
|Dave|  M| 39|    72|   167|
|Elly|  F| 30|    66|   124|
|Fran|  F| 33|    66|   115|
|Gwen|  F| 26|    64|   121|
|Hank|  M| 30|    71|   158|
|Ivan|  M| 53|    72|   175|
|Jake|  M| 32|    69|   143|
|Kate|  F| 47|    69|   139|
|Luke|  M| 34|    72|   163|
|Myra|  F| 23|    62|    98|
|Neil|  M| 36|    75|   160|
|Omar|  M| 38|    70|   145|
|Page|  F| 31|    67|   135|
|Quin|  M| 29|    71|   176|
|Ruth|  F| 28|    65|   131|
+----+---+---+------+------+

