In [None]:
pip install pyspark



In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Start a Spark session named "PySparkExample", or reuse one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("PySparkExample")
    .getOrCreate()
)

In [None]:
# Create a DataFrame from in-memory sample rows of (Name, Age).
data = [
    ("Sri", 20),
    ("Senthil", 21),
    ("Rahul", 19),
]
df = spark.createDataFrame(data, schema=["Name", "Age"])

In [None]:
# Print the DataFrame's rows as a formatted text table.
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|    Sri| 20|
|Senthil| 21|
|  Rahul| 19|
+-------+---+



In [None]:
# Stop the Spark session; later cells call getOrCreate() again before
# using Spark.
spark.stop()

In [None]:
from pyspark.sql import SparkSession

# Self-contained example: start a session, build a small (Name, Age)
# DataFrame, display it, and always release the session afterwards.
spark = SparkSession.builder.appName("PySparkExample").getOrCreate()
try:
    data = [("blueberry", 16), ("kaly", 18), ("soniya", 21)]
    df = spark.createDataFrame(data, ["Name", "Age"])
    df.show()
finally:
    # Stop the session even when DataFrame creation or show() raises, so a
    # failed run does not leave the session running.
    spark.stop()

+---------+---+
|     Name|Age|
+---------+---+
|blueberry| 16|
|     kaly| 18|
|   soniya| 21|
+---------+---+



In [None]:
from pyspark.sql import SparkSession

In [None]:
# Start (or reuse) the Spark session used by the employee examples below.
spark = (
    SparkSession.builder
    .appName("StudentAssignment")
    .getOrCreate()
)

In [None]:
# Sample employee rows: one tuple per employee with four fields
# (an integer, two strings, an integer).
data = [
    (1, "Alice", "Engineering", 65000),
    (2, "Bob", "Marketing", 58000),
    (3, "Charlie", "Sales", 52000),
    (4, "David", "Engineering", 72000),
    (5, "Eve", "Sales", 54000),
]

In [None]:
# Column names, matched by position to the fields of each tuple in `data`.
schema = ["ID", "Name", "Department", "Salary"]

# Build the employee DataFrame and print it as a text table.
df = spark.createDataFrame(data, schema)
df.show()

+---+-------+-----------+------+
| ID|   Name| Department|Salary|
+---+-------+-----------+------+
|  1|  Alice|Engineering| 65000|
|  2|    Bob|  Marketing| 58000|
|  3|Charlie|      Sales| 52000|
|  4|  David|Engineering| 72000|
|  5|    Eve|      Sales| 54000|
+---+-------+-----------+------+



In [None]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



In [None]:
# Keep only the employees whose Salary is above 60000, then display them.
high_earners = df.where(df["Salary"] > 60000)
high_earners.show()

+---+-----+-----------+------+
| ID| Name| Department|Salary|
+---+-----+-----------+------+
|  1|Alice|Engineering| 65000|
|  4|David|Engineering| 72000|
+---+-----+-----------+------+



In [None]:
# Count how many employees belong to each Department.
department_counts = df.groupBy("Department").count()
department_counts.show()

+-----------+-----+
| Department|count|
+-----------+-----+
|Engineering|    2|
|  Marketing|    1|
|      Sales|    2|
+-----------+-----+



In [None]:
# Compute the mean Salary within each Department.
department_avg_salary = df.groupBy("Department").avg("Salary")
department_avg_salary.show()

+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|Engineering|    68500.0|
|  Marketing|    58000.0|
|      Sales|    53000.0|
+-----------+-----------+



In [None]:
from pyspark.sql import SparkSession
from google.colab import files

# Open the Colab upload widget; the result is a dictionary keyed by the
# name of each uploaded file.
uploaded = files.upload()

# Fail with a clear message when nothing was uploaded (for example the
# dialog was cancelled) instead of a bare IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Use the first uploaded file name as the path to read.
filename = next(iter(uploaded))

# Start (or reuse) the Spark session.
spark = SparkSession.builder.appName("Spark DataFrames").getOrCreate()

# Read the CSV with its first row as the header and let Spark infer the
# column types from the data.
df = spark.read.csv(filename, header=True, inferSchema=True)

df.show()
df.printSchema()

Saving employees1 (1).csv to employees1 (1) (2).csv
+-----------+----------+---------+--------+------------+---------+----------+------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|
+-----------+----------+---------+--------+------------+---------+----------+------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-Jun-07|  SH_CLERK|  2600|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-Jan-08|  SH_CLERK|  2600|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-Sep-03|   AD_ASST|  4400|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-Feb-04|    MK_MAN| 13000|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-Aug-05|    MK_REP|  6000|
|        203|     Susan|   Mavris| SMAVRIS|515.123.7777|07-Jun-02|    HR_REP|  6500|
|        204|   Hermann|     Baer|   HBAER|515.123.8888|07-Jun-02|    PR_REP| 10000|
|        205|   Shelley|  Higgins|SHIGGINS|515.123.8080|07-Jun-02|    AC_MGR| 12008|
|        206|

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session for the employee CSV analysis.
spark = (
    SparkSession.builder
    .appName("EmployeeDataAnalysis")
    .getOrCreate()
)

In [None]:
# Reuse the file uploaded through the Colab upload widget in an earlier
# cell: `uploaded` must already exist, so run that cell first.
filename = list(uploaded)[0]

# Read the CSV with its first row as the header and let Spark infer the
# column types, then print the rows as a text table.
df = spark.read.csv(filename, header=True, inferSchema=True)
df.show()

+-----------+----------+---------+--------+------------+---------+----------+------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|
+-----------+----------+---------+--------+------------+---------+----------+------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-Jun-07|  SH_CLERK|  2600|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-Jan-08|  SH_CLERK|  2600|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-Sep-03|   AD_ASST|  4400|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-Feb-04|    MK_MAN| 13000|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-Aug-05|    MK_REP|  6000|
|        203|     Susan|   Mavris| SMAVRIS|515.123.7777|07-Jun-02|    HR_REP|  6500|
|        204|   Hermann|     Baer|   HBAER|515.123.8888|07-Jun-02|    PR_REP| 10000|
|        205|   Shelley|  Higgins|SHIGGINS|515.123.8080|07-Jun-02|    AC_MGR| 12008|
|        206|   William|    Gietz|  WGIETZ|515.123.8181|07-Jun-02

In [None]:
# Print the inferred schema; in the recorded output HIRE_DATE is a string
# column, while EMPLOYEE_ID and SALARY are integers.
df.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)



In [None]:
# Mean salary per JOB_ID. The CSV column is named SALARY; "Salary" is kept
# as written because it determines the `avg(Salary)` header in the output.
job_avg_salary = df.groupBy("JOB_ID").avg("Salary")
job_avg_salary.show()

+----------+-----------+
|    JOB_ID|avg(Salary)|
+----------+-----------+
|FI_ACCOUNT|     7920.0|
|    MK_MAN|    13000.0|
|   IT_PROG|     5760.0|
|    FI_MGR|    12008.0|
|AC_ACCOUNT|     8300.0|
|    HR_REP|     6500.0|
|  PU_CLERK|     2780.0|
|    AC_MGR|    12008.0|
|    PR_REP|    10000.0|
|    ST_MAN|     7280.0|
|    MK_REP|     6000.0|
|    PU_MAN|    11000.0|
|  SH_CLERK|     2600.0|
|   AD_PRES|    24000.0|
|   AD_ASST|     4400.0|
|  ST_CLERK|     2750.0|
|     AD_VP|    17000.0|
+----------+-----------+



In [None]:
# Fraction of the salary paid as a bonus.
BONUS_RATE = 0.10

# Add a "Bonus" column computed from each row's salary, then display the
# widened DataFrame.
df = df.withColumn("Bonus", df["Salary"] * BONUS_RATE)
df.show()

+-----------+----------+---------+--------+------------+---------+----------+------+------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY| Bonus|
+-----------+----------+---------+--------+------------+---------+----------+------+------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-Jun-07|  SH_CLERK|  2600| 260.0|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-Jan-08|  SH_CLERK|  2600| 260.0|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-Sep-03|   AD_ASST|  4400| 440.0|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-Feb-04|    MK_MAN| 13000|1300.0|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-Aug-05|    MK_REP|  6000| 600.0|
|        203|     Susan|   Mavris| SMAVRIS|515.123.7777|07-Jun-02|    HR_REP|  6500| 650.0|
|        204|   Hermann|     Baer|   HBAER|515.123.8888|07-Jun-02|    PR_REP| 10000|1000.0|
|        205|   Shelley|  Higgins|SHIGGINS|515.123.8080|07-Jun-02|    AC_MGR| 12

In [None]:
# Show the employees whose salary exceeds 70000.
# NOTE(review): the recorded output for this cell is an empty table, so the
# threshold may be too high for this dataset -- confirm the intended value.
top_earners = df.where(df["Salary"] > 70000)
top_earners.show()

+-----------+----------+---------+-----+------------+---------+------+------+-----+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|EMAIL|PHONE_NUMBER|HIRE_DATE|JOB_ID|SALARY|Bonus|
+-----------+----------+---------+-----+------------+---------+------+------+-----+
+-----------+----------+---------+-----+------------+---------+------+------+-----+



In [None]:
# Mean salary per JOB_ID, computed on the DataFrame that now also carries
# the Bonus column; the averages only read the salary column.
job_avg_salary = df.groupBy("JOB_ID").avg("Salary")
job_avg_salary.show()

+----------+-----------+
|    JOB_ID|avg(Salary)|
+----------+-----------+
|FI_ACCOUNT|     7920.0|
|    MK_MAN|    13000.0|
|   IT_PROG|     5760.0|
|    FI_MGR|    12008.0|
|AC_ACCOUNT|     8300.0|
|    HR_REP|     6500.0|
|  PU_CLERK|     2780.0|
|    AC_MGR|    12008.0|
|    PR_REP|    10000.0|
|    ST_MAN|     7280.0|
|    MK_REP|     6000.0|
|    PU_MAN|    11000.0|
|  SH_CLERK|     2600.0|
|   AD_PRES|    24000.0|
|   AD_ASST|     4400.0|
|  ST_CLERK|     2750.0|
|     AD_VP|    17000.0|
+----------+-----------+

