## Manipulating Columns in PySpark

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain a SparkSession for this notebook: getOrCreate() returns the already
# running session if there is one, otherwise it starts a new one with the
# application name and master URL configured below.
spark = (
    SparkSession.builder
    .appName('spark_app_3')
    .master('local[*]')
    .getOrCreate()
)

In [3]:
# Bare expression on the last line of the cell: the notebook displays the
# SparkSession object's representation as the cell output.
spark

In [4]:
# Load the car price CSV into a DataFrame. header=True uses the first row for
# column names; inferSchema=True asks Spark to infer each column's type from
# the data rather than reading everything as strings.
df = spark.read.csv(
    'car_price_dataset.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print only the first 5 rows as a quick sanity check of the loaded data.
df.show(n=5)

+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|     Brand| Model|Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
|       Kia|   Rio|2020|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501|
| Chevrolet|Malibu|2012|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092|
|  Mercedes|   GLA|2020|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171|
|      Audi|    Q5|2023|        2.0| Electric|        Manual| 160971|    2|          1|11780|
|Volkswagen|  Golf|2003|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+
only showing top 5 rows



In [6]:
# NOTE: lit and rand are not needed by this cell any more; the import is kept
# unchanged in case later cells rely on these names.
from pyspark.sql.functions import lit, rand

# lit(value) wraps a plain Python literal (e.g. lit(0) or lit('n/a')) in a
# constant Column. It is NOT needed here: arithmetic on an existing Column such
# as df.Price already produces a Column expression, and lit() simply returns a
# Column argument unchanged.
# docs: see `pyspark.sql.functions.lit` and `DataFrame.withColumn` in the
# PySpark API reference.

# Fraction added on top of Price to obtain new_price (18%).
PRICE_INCREASE_RATE = 18 / 100

# withColumn() returns a new DataFrame with 'new_price' added (or replaced if
# it already exists), so re-running this cell yields the same result.
df = df.withColumn('new_price', df.Price + PRICE_INCREASE_RATE * df.Price)
df.show(5)

+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+---------+
|     Brand| Model|Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|Price|new_price|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+---------+
|       Kia|   Rio|2020|        4.2|   Diesel|        Manual| 289944|    3|          5| 8501| 10031.18|
| Chevrolet|Malibu|2012|        2.0|   Hybrid|     Automatic|   5356|    2|          3|12092| 14268.56|
|  Mercedes|   GLA|2020|        4.2|   Diesel|     Automatic| 231440|    4|          2|11171| 13181.78|
|      Audi|    Q5|2023|        2.0| Electric|        Manual| 160971|    2|          1|11780|  13900.4|
|Volkswagen|  Golf|2003|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3| 2867|  3383.06|
+----------+------+----+-----------+---------+--------------+-------+-----+-----------+-----+---------+
only showing top 5 rows



In [7]:
 # Drop a column: preview the data without 'Price'. The result is not assigned back, so `df` is untouched.
(
    df
    .drop('Price')
    .show()
)

+----------+--------+----+-----------+---------+--------------+-------+-----+-----------+---------+
|     Brand|   Model|Year|Engine_Size|Fuel_Type|  Transmission|Mileage|Doors|Owner_Count|new_price|
+----------+--------+----+-----------+---------+--------------+-------+-----+-----------+---------+
|       Kia|     Rio|2020|        4.2|   Diesel|        Manual| 289944|    3|          5| 10031.18|
| Chevrolet|  Malibu|2012|        2.0|   Hybrid|     Automatic|   5356|    2|          3| 14268.56|
|  Mercedes|     GLA|2020|        4.2|   Diesel|     Automatic| 231440|    4|          2| 13181.78|
|      Audi|      Q5|2023|        2.0| Electric|        Manual| 160971|    2|          1|  13900.4|
|Volkswagen|    Golf|2003|        2.6|   Hybrid|Semi-Automatic| 286618|    3|          3|  3383.06|
|    Toyota|   Camry|2007|        2.7|   Petrol|     Automatic| 157889|    4|          4|  8545.56|
|     Honda|   Civic|2010|        3.4| Electric|     Automatic| 139584|    3|          1| 13225.44|
