<a href="https://colab.research.google.com/github/sirishaallarapu/AdvancedPySpark-/blob/main/Day1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Set JAVA_HOME before the session is requested below.
# NOTE(review): this path matches the Debian/Ubuntu OpenJDK 11 package layout;
# adjust it when running somewhere else.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

spark = SparkSession.builder.appName("DataFrameOperations").getOrCreate()

# Everything that uses the session lives inside try/finally so the session is
# stopped even when one of the demonstrations raises part-way through.
try:
    # Sample rows; field order follows `columns`.
    data = [
        (1, "Siri", 25, "New York"),
        (2, "Nani", 30, "Los Angeles"),
        (3, "Jasmine", 28, "Chicago"),
        (4, "Elizabeth", 35, "San Francisco")
    ]
    columns = ["ID", "Name", "Age", "City"]

    df = spark.createDataFrame(data, columns)

    # Full table.
    df.show()

    # Column projection.
    df.select("Name", "Age").show()

    # Row filtering: Age strictly greater than 28.
    df.filter(col("Age") > 28).show()

    # Row filtering on an exact City match.
    df.where(col("City") == "Chicago").show()

    print(f"Total Records: {df.count()}")

    # Derived column; the result is only displayed, `df` is not reassigned.
    df.withColumn("Age_in_5_years", col("Age") + 5).show()

    # Same table without the City column; again only displayed.
    df.drop("City").show()
finally:
    spark.stop()


+---+---------+---+-------------+
| ID|     Name|Age|         City|
+---+---------+---+-------------+
|  1|     Siri| 25|     New York|
|  2|     Nani| 30|  Los Angeles|
|  3|  Jasmine| 28|      Chicago|
|  4|Elizabeth| 35|San Francisco|
+---+---------+---+-------------+

+---------+---+
|     Name|Age|
+---------+---+
|     Siri| 25|
|     Nani| 30|
|  Jasmine| 28|
|Elizabeth| 35|
+---------+---+

+---+---------+---+-------------+
| ID|     Name|Age|         City|
+---+---------+---+-------------+
|  2|     Nani| 30|  Los Angeles|
|  4|Elizabeth| 35|San Francisco|
+---+---------+---+-------------+

+---+-------+---+-------+
| ID|   Name|Age|   City|
+---+-------+---+-------+
|  3|Jasmine| 28|Chicago|
+---+-------+---+-------+

Total Records: 4
+---+---------+---+-------------+--------------+
| ID|     Name|Age|         City|Age_in_5_years|
+---+---------+---+-------------+--------------+
|  1|     Siri| 25|     New York|            30|
|  2|     Nani| 30|  Los Angeles|            35|


In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

# Expose the OpenJDK 11 install through JAVA_HOME before asking for a session.
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64")

# Field names, in the same order as the values inside each row of `data`.
columns = ["Emp_ID", "Name", "Department", "Salary"]

# One tuple per employee.
data = [
    (201, "Amit", "Finance", 60000),
    (202, "Priya", "IT", 75000),
    (203, "Ravi", "Marketing", 58000),
    (204, "Sanya", "HR", 50000),
    (205, "Vikram", "IT", 80000),
    (206, "Neha", "Operations", 62000),
]

# Session used by the remaining cells of this walkthrough.
spark = (
    SparkSession.builder
    .appName("EmployeeDataFrame")
    .getOrCreate()
)

In [None]:
# Turn the row tuples into a DataFrame, using `columns` as the field names.
df = spark.createDataFrame(data, schema=columns)

In [None]:
# Print the table; the arguments spell out show()'s defaults
# (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)


+------+------+----------+------+
|Emp_ID|  Name|Department|Salary|
+------+------+----------+------+
|   201|  Amit|   Finance| 60000|
|   202| Priya|        IT| 75000|
|   203|  Ravi| Marketing| 58000|
|   204| Sanya|        HR| 50000|
|   205|Vikram|        IT| 80000|
|   206|  Neha|Operations| 62000|
+------+------+----------+------+



In [None]:
# Project down to the two fields of interest, then print them.
name_and_salary = df.select(col("Name"), col("Salary"))
name_and_salary.show()

+------+------+
|  Name|Salary|
+------+------+
|  Amit| 60000|
| Priya| 75000|
|  Ravi| 58000|
| Sanya| 50000|
|Vikram| 80000|
|  Neha| 62000|
+------+------+



In [None]:
# Keep only rows whose Salary is strictly greater than 60000.
earns_over_60k = df["Salary"] > 60000
df.where(earns_over_60k).show()

+------+------+----------+------+
|Emp_ID|  Name|Department|Salary|
+------+------+----------+------+
|   202| Priya|        IT| 75000|
|   205|Vikram|        IT| 80000|
|   206|  Neha|Operations| 62000|
+------+------+----------+------+



In [None]:
# Keep only rows whose Department is exactly "IT".
it_staff = df.filter(df["Department"] == "IT")
it_staff.show()

+------+------+----------+------+
|Emp_ID|  Name|Department|Salary|
+------+------+----------+------+
|   202| Priya|        IT| 75000|
|   205|Vikram|        IT| 80000|
+------+------+----------+------+



In [None]:
# Count the rows first, then report the number.
employee_total = df.count()
print(f"Total Employees: {employee_total}")

Total Employees: 6


In [None]:
# Bonus is a fixed fraction of Salary; the rate is named so it can be found and tuned.
BONUS_RATE = 0.12
bonus_amount = col("Salary") * BONUS_RATE
df.withColumn("Bonus", bonus_amount).show()

+------+------+----------+------+------+
|Emp_ID|  Name|Department|Salary| Bonus|
+------+------+----------+------+------+
|   201|  Amit|   Finance| 60000|7200.0|
|   202| Priya|        IT| 75000|9000.0|
|   203|  Ravi| Marketing| 58000|6960.0|
|   204| Sanya|        HR| 50000|6000.0|
|   205|Vikram|        IT| 80000|9600.0|
|   206|  Neha|Operations| 62000|7440.0|
+------+------+----------+------+------+



In [None]:
# Attach the same literal office label to every row and print the result.
office_label = lit("Bangalore Office")
df.withColumn("Location", office_label).show()

+------+------+----------+------+----------------+
|Emp_ID|  Name|Department|Salary|        Location|
+------+------+----------+------+----------------+
|   201|  Amit|   Finance| 60000|Bangalore Office|
|   202| Priya|        IT| 75000|Bangalore Office|
|   203|  Ravi| Marketing| 58000|Bangalore Office|
|   204| Sanya|        HR| 50000|Bangalore Office|
|   205|Vikram|        IT| 80000|Bangalore Office|
|   206|  Neha|Operations| 62000|Bangalore Office|
+------+------+----------+------+----------------+



In [None]:
# Print the table without its Department column; `df` itself is not reassigned.
without_department = df.drop("Department")
without_department.show()

+------+------+------+
|Emp_ID|  Name|Salary|
+------+------+------+
|   201|  Amit| 60000|
|   202| Priya| 75000|
|   203|  Ravi| 58000|
|   204| Sanya| 50000|
|   205|Vikram| 80000|
|   206|  Neha| 62000|
+------+------+------+



In [None]:
# Final step of the notebook: stop the session held in `spark`.
spark.stop()