In [0]:
from pyspark.sql.types import StructField,StructType,StringType,IntegerType
from pyspark.sql.functions import *
# Explicit schema for the sample CSV. Every field is declared nullable.
# Besides the six data columns it carries a string column named
# `_corrupt_record`, which is where a permissive-mode read keeps the raw
# text of rows that failed to parse.
sample_schema = (
    StructType()
    .add('id', IntegerType(), True)
    .add('name', StringType(), True)
    .add('age', IntegerType(), True)
    .add('salary', IntegerType(), True)
    .add('address', StringType(), True)
    .add('nominee', StringType(), True)
    .add('_corrupt_record', StringType(), True)
)

# Load the sample CSV using the explicit `sample_schema` (no inference).
# The first line of the file is treated as a header. In permissive mode
# malformed rows are kept rather than dropped, and their raw text is stored
# in the corrupt-record column.
#
# The chain is wrapped in parentheses instead of trailing backslashes: the
# previous form ended with a dangling `\` that only parsed because the next
# line happened to be blank.
df_sample = (
    spark.read.format('csv')
    .option('header', 'True')
    .option('inferschema', 'False')
    .schema(sample_schema)
    .option('mode', 'permissive')
    # Pin the corrupt-record column to the name declared in `sample_schema`
    # so the read does not depend on the session-level
    # `spark.sql.columnNameOfCorruptRecord` setting.
    .option('columnNameOfCorruptRecord', '_corrupt_record')
    .load('/FileStore/tables/sample_data-2.csv')
)

# Preview up to six rows of the loaded frame.
df_sample.show(6)

+---+--------+---+------+------------+--------+--------------------+
| id|    name|age|salary|     address| nominee|     _corrupt_record|
+---+--------+---+------+------------+--------+--------------------+
|  1|  Manish| 26| 75000|       bihar|nominee1|                null|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|                null|
|  3|  Pritam| 22|150000|   Bangalore|   India|3,Pritam,22,15000...|
|  4|Prantosh| 17|200000|     Kolkata|   India|4,Prantosh,17,200...|
|  5|  Vikash| 31|300000|        null|nominee5|                null|
+---+--------+---+------+------------+--------+--------------------+



In [0]:
# Project only `name` (exposed under the alias `emp_name`) and `age`.
(
    df_sample
    .select(
        col('name').alias('emp_name'),
        col('age'),
    )
    .show()
)

+--------+---+
|emp_name|age|
+--------+---+
|  Manish| 26|
|  Nikita| 23|
|  Pritam| 22|
|Prantosh| 17|
|  Vikash| 31|
+--------+---+



In [0]:
# Keep only the rows whose salary is strictly greater than 150000.
(
    df_sample
    .filter(col('salary') > 150000)
    .show()
)

+---+--------+---+------+-------+--------+--------------------+
| id|    name|age|salary|address| nominee|     _corrupt_record|
+---+--------+---+------+-------+--------+--------------------+
|  4|Prantosh| 17|200000|Kolkata|   India|4,Prantosh,17,200...|
|  5|  Vikash| 31|300000|   null|nominee5|                null|
+---+--------+---+------+-------+--------+--------------------+



In [0]:
# Keep rows that satisfy both conditions: salary above 150000 AND age
# below 18. Each comparison is parenthesised because `&` binds tighter than
# the comparison operators in Python.
(
    df_sample
    .filter(
        (col('salary') > 150000)
        & (col('age') < 18)
    )
    .show()
)

+---+--------+---+------+-------+-------+--------------------+
| id|    name|age|salary|address|nominee|     _corrupt_record|
+---+--------+---+------+-------+-------+--------------------+
|  4|Prantosh| 17|200000|Kolkata|  India|4,Prantosh,17,200...|
+---+--------+---+------+-------+-------+--------------------+



In [0]:
# Return every existing column plus a constant column `last_name` whose
# value is the literal 'kumar' on every row.
(
    df_sample
    .select(
        '*',
        lit('kumar').alias('last_name'),
    )
    .show()
)

+---+--------+---+------+------------+--------+--------------------+---------+
| id|    name|age|salary|     address| nominee|     _corrupt_record|last_name|
+---+--------+---+------+------------+--------+--------------------+---------+
|  1|  Manish| 26| 75000|       bihar|nominee1|                null|    kumar|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|                null|    kumar|
|  3|  Pritam| 22|150000|   Bangalore|   India|3,Pritam,22,15000...|    kumar|
|  4|Prantosh| 17|200000|     Kolkata|   India|4,Prantosh,17,200...|    kumar|
|  5|  Vikash| 31|300000|        null|nominee5|                null|    kumar|
+---+--------+---+------+------------+--------+--------------------+---------+



In [0]:
# Append a constant column holding the literal 'kumar' via withColumn.
# NOTE(review): 'sir_name' looks like a misspelling of 'surname'; the column
# name is kept as-is here because it is part of the displayed output.
(
    df_sample
    .withColumn('sir_name', lit('kumar'))
    .show()
)

+---+--------+---+------+------------+--------+--------------------+--------+
| id|    name|age|salary|     address| nominee|     _corrupt_record|sir_name|
+---+--------+---+------+------------+--------+--------------------+--------+
|  1|  Manish| 26| 75000|       bihar|nominee1|                null|   kumar|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|                null|   kumar|
|  3|  Pritam| 22|150000|   Bangalore|   India|3,Pritam,22,15000...|   kumar|
|  4|Prantosh| 17|200000|     Kolkata|   India|4,Prantosh,17,200...|   kumar|
|  5|  Vikash| 31|300000|        null|nominee5|                null|   kumar|
+---+--------+---+------+------------+--------+--------------------+--------+



In [0]:
# Display the frame with the `id` column renamed to `emp_id`; the result is
# not assigned, so `df_sample` itself keeps its original column name.
(
    df_sample
    .withColumnRenamed('id', 'emp_id')
    .show()
)

+------+--------+---+------+------------+--------+--------------------+
|emp_id|    name|age|salary|     address| nominee|     _corrupt_record|
+------+--------+---+------+------------+--------+--------------------+
|     1|  Manish| 26| 75000|       bihar|nominee1|                null|
|     2|  Nikita| 23|100000|uttarpradesh|nominee2|                null|
|     3|  Pritam| 22|150000|   Bangalore|   India|3,Pritam,22,15000...|
|     4|Prantosh| 17|200000|     Kolkata|   India|4,Prantosh,17,200...|
|     5|  Vikash| 31|300000|        null|nominee5|                null|
+------+--------+---+------+------------+--------+--------------------+



In [0]:
# Re-display `df_sample` with the default show() settings.
df_sample.show()

+---+--------+---+------+------------+--------+--------------------+
| id|    name|age|salary|     address| nominee|     _corrupt_record|
+---+--------+---+------+------------+--------+--------------------+
|  1|  Manish| 26| 75000|       bihar|nominee1|                null|
|  2|  Nikita| 23|100000|uttarpradesh|nominee2|                null|
|  3|  Pritam| 22|150000|   Bangalore|   India|3,Pritam,22,15000...|
|  4|Prantosh| 17|200000|     Kolkata|   India|4,Prantosh,17,200...|
|  5|  Vikash| 31|300000|        null|nominee5|                null|
+---+--------+---+------+------------+--------+--------------------+



In [0]:
# Print the column names, types and nullability of `df_sample` as a tree.
df_sample.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)
 |-- _corrupt_record: string (nullable = true)



In [0]:
# Replace `id` and `salary` with string-typed versions of themselves (reusing
# an existing column name in withColumn overwrites that column), then print
# the resulting schema. The result is not assigned back to `df_sample`.
(
    df_sample
    .withColumn('id', col('id').cast('string'))
    .withColumn('salary', col('salary').cast('string'))
    .printSchema()
)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: string (nullable = true)
 |-- address: string (nullable = true)
 |-- nominee: string (nullable = true)
 |-- _corrupt_record: string (nullable = true)

