In [0]:
# Read the employee CSV from DBFS, using the first row as the column names,
# then render the resulting DataFrame with the notebook's display helper.
df = spark.read.option("header", "true").csv("dbfs:/FileStore/employee.csv")
display(df)

## printSchema()

In [0]:
# Print the schema of `df` (one line per column) to the cell output.
df.printSchema()


In [0]:
# List of column names, in schema order. The attribute is `columns` (plural);
# `df.column` is resolved as a lookup of a column literally named "column"
# and raises AttributeError when no such column exists.
df.columns

In [0]:
# Column names paired with their type names, as a list of (name, type) tuples.
# The CSV was read with only the "header" option (no schema / inferSchema), so
# every type here is expected to be "string" — confirm against the output.
df.dtypes

In [0]:
# The complete schema of `df` as a single StructType object.
df.schema


### Getting the list of columns as `StructField` objects using the `fields` attribute of a `StructType` object


In [0]:
# The schema's columns as a list of StructField objects.
df.schema.fields


In [0]:
# Only the column names held by the schema, as a list of strings.
df.schema.names



### Returning the list of columns and data types as JSON using either `json()` or `jsonValue()`


In [0]:
# The schema serialized to a JSON string.
df.schema.json()


In [0]:
# The schema as a JSON-compatible Python dict (not serialized to a string).
df.schema.jsonValue()

## Create Custom Schema
- approach-1

In [0]:
# The type classes are not predefined in the notebook namespace; import them
# here so this cell does not raise NameError on a fresh kernel (Run All).
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# approach-1: hand the complete list of StructField objects to the constructor.
# Each StructField is (column name, data type, nullable).
schema = StructType([
    StructField("firstname", StringType(), True),
    StructField("middlename", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("id", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", IntegerType(), True),
])

# # approach-2: start from an empty StructType and add() one column at a time,
# # passing DataType instances.
# schema_def = StructType()
#
# schema_def.add("firstname", StringType(), True)
# schema_def.add("middlename", StringType(), True)
# schema_def.add("lastname", StringType(), True)
# schema_def.add("id", StringType(), True)
# schema_def.add("gender", StringType(), True)
# schema_def.add("salary", IntegerType(), True)

# # approach-3: same as approach-2, but naming each type with a string.
# # Start from a fresh StructType: adding to the object built in approach-2
# # would append the same six columns a second time.
# schema_def = StructType()
#
# schema_def.add("firstname", "string", True)
# schema_def.add("middlename", "string", True)
# schema_def.add("lastname", "string", True)
# schema_def.add("id", "string", True)
# schema_def.add("gender", "string", True)
# schema_def.add("salary", "integer", True)



## Nested structure schema

In [0]:
# The type classes are not predefined in the notebook namespace; import them
# here so this cell does not raise NameError on a fresh kernel (Run All).
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Each row is ((firstname, middlename, lastname), id, gender, salary).
# The inner tuple becomes the nested `name` struct column.
structureData = [
    (("James", "", "Smith"), "36636", "M", 3100),
    (("Michael", "Rose", ""), "40288", "M", 4300),
    (("Robert", "", "Williams"), "42114", "M", 1400),
    (("Maria", "Anne", "Jones"), "39192", "F", 5500),
    (("Jen", "Mary", "Brown"), "", "F", -1),
]

# A StructField whose data type is itself a StructType yields a nested column.
structureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ])),
    StructField('id', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

df2 = spark.createDataFrame(data=structureData, schema=structureSchema)
df2.printSchema()
df2.show(truncate=False)

In [0]:
# col/struct/when (and IntegerType) are not predefined in the notebook
# namespace; import them here so the cell runs on a fresh kernel.
from pyspark.sql.functions import col, struct, when
from pyspark.sql.types import IntegerType

# Salary band derived from `salary`: below 2000 -> "Low", below 4000 ->
# "Medium", anything else -> "High". Conditions are evaluated in order.
salary_grade = (
    when(col("salary").cast(IntegerType()) < 2000, "Low")
    .when(col("salary").cast(IntegerType()) < 4000, "Medium")
    .otherwise("High")
    .alias("Salary_Grade")
)

# Pack id/gender/salary plus the derived grade into one struct column named
# "OtherInfo", then drop the original top-level columns that were copied in.
updatedDF = df2.withColumn(
    "OtherInfo",
    struct(
        col("id").alias("identifier"),
        col("gender").alias("gender"),
        col("salary").alias("salary"),
        salary_grade,
    ),
).drop("id", "gender", "salary")

updatedDF.printSchema()
updatedDF.show(truncate=False)

In [0]:
""" Array & Map"""
# ArrayType/MapType and the struct types are not predefined in the notebook
# namespace; import them here so the cell runs on a fresh kernel.
from pyspark.sql.types import ArrayType, MapType, StringType, StructField, StructType

# Schema combining a nested struct (`name`), an array of strings (`hobbies`)
# and a string -> string map (`properties`).
arrayStructureSchema = StructType([
    StructField('name', StructType([
        StructField('firstname', StringType(), True),
        StructField('middlename', StringType(), True),
        StructField('lastname', StringType(), True),
    ])),
    StructField('hobbies', ArrayType(StringType()), True),
    StructField('properties', MapType(StringType(), StringType()), True),
])

In [0]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    DateType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)
# Add more types as needed

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.appName("PandasToSpark").getOrCreate()

# Sample Pandas DataFrame (replace with your actual DataFrame).
# NOTE: col_date / col_timestamp hold plain strings here, so pandas stores them
# with a string/object dtype and they map to StringType below. Convert them
# with pd.to_datetime(...) first if real timestamp columns are wanted.
data = {'col_string': ['a', 'b', 'c'],
        'col_int': [1, 2, 3],
        'col_float': [1.1, 2.2, 3.3],
        'col_date': ['2025-03-01', '2025-03-02', '2025-03-03'],
        'col_timestamp': ['2025-03-01 10:00:00', '2025-03-02 11:00:00', '2025-03-03 12:00:00']}
pandas_df = pd.DataFrame(data)

# 1. Get column names and data types (dtype names as plain strings).
column_names = pandas_df.columns.tolist()
column_types_pandas = pandas_df.dtypes.apply(lambda x: x.name).to_dict()

# 2. Define Spark schema.
# Width-preserving mapping from pandas dtype name to Spark type class:
#   int64   -> LongType   (64-bit; IntegerType is 32-bit and cannot hold all values)
#   float64 -> DoubleType (64-bit; FloatType is 32-bit and loses precision)
_PANDAS_TO_SPARK_TYPE = {
    'object': StringType,
    'int64': LongType,
    'float64': DoubleType,
}


def _spark_type_for(pandas_dtype_name):
    """Return a Spark DataType instance for the given pandas dtype name.

    Any datetime64 dtype (whatever its resolution or timezone suffix) maps to
    TimestampType so the time of day is kept; DateType would discard it.
    Unrecognized dtype names fall back to StringType.
    """
    if pandas_dtype_name.startswith('datetime64'):
        return TimestampType()
    return _PANDAS_TO_SPARK_TYPE.get(pandas_dtype_name, StringType)()


schema_list = [
    StructField(col_name, _spark_type_for(col_type), True)
    for col_name, col_type in column_types_pandas.items()
]

schema = StructType(schema_list)

# 3. Create Spark DataFrame
spark_df = spark.createDataFrame(pandas_df, schema=schema)

# Print the schema of the Spark DataFrame
spark_df.printSchema()

# Show the first few rows of the Spark DataFrame
spark_df.show()

# Deliberately no spark.stop(): getOrCreate() returns the notebook's shared
# session, and stopping it would break every other cell (and any re-run of
# this notebook) until the cluster/kernel is restarted.