In [2]:
# Initialize

#Imports
import pandas as pd
# Do not cap the number of columns shown when a DataFrame is displayed
pd.set_option("display.max_columns", None)

# Load Data: read each CSV into its own DataFrame in a single unpacking step
employees, departments, sales = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/employees.csv",
        "../data/departments.csv",
        "../data/sales.csv",
    )
)

In [2]:
# Project a subset of columns (SQL: SELECT name, department, salary)
selected_columns = ["name", "department", "salary"]
employees[selected_columns]


Unnamed: 0,name,department,salary
0,Amit,Engineering,80000
1,Neha,Engineering,75000
2,Ravi,HR,50000
3,Pooja,HR,55000
4,Karan,Sales,60000
5,Suman,Sales,58000
6,Rahul,Engineering,90000
7,Anita,Finance,70000
8,Vikram,Finance,72000
9,Meena,HR,52000


In [3]:
# Keep only the rows whose salary exceeds 80000 (SQL: WHERE salary > 80000)
high_salary_mask = employees["salary"] > 80000
employees[high_salary_mask]


Unnamed: 0,emp_id,name,department,salary,joining_date
6,7,Rahul,Engineering,90000,2019-09-12
10,11,Arjun,Engineering,85000,2023-01-10
16,17,Sunita,Engineering,82000,2021-12-01
19,20,Isha,Engineering,88000,2019-05-05


In [4]:
# Keep only the rows whose department is Engineering
# (SQL: WHERE department = 'Engineering')
is_engineering = employees["department"] == "Engineering"
employees[is_engineering]


Unnamed: 0,emp_id,name,department,salary,joining_date
0,1,Amit,Engineering,80000,2021-06-15
1,2,Neha,Engineering,75000,2022-03-10
6,7,Rahul,Engineering,90000,2019-09-12
10,11,Arjun,Engineering,85000,2023-01-10
12,13,Nikhil,Engineering,78000,2020-07-07
16,17,Sunita,Engineering,82000,2021-12-01
19,20,Isha,Engineering,88000,2019-05-05


In [10]:
# Combine two conditions with AND (&): Engineering staff earning above 80000.
# Each condition is built as its own boolean mask, then the masks are combined.
in_engineering = employees["department"] == "Engineering"
above_80k = employees["salary"] > 80000
employees[in_engineering & above_80k]


Unnamed: 0,emp_id,name,department,salary,joining_date
6,7,Rahul,Engineering,90000,2019-09-12
10,11,Arjun,Engineering,85000,2023-01-10
16,17,Sunita,Engineering,82000,2021-12-01
19,20,Isha,Engineering,88000,2019-05-05


In [11]:
# Combine two conditions with OR (|): HR staff, or anyone earning above 80000
in_hr = employees["department"] == "HR"
above_80k = employees["salary"] > 80000
employees[in_hr | above_80k]

Unnamed: 0,emp_id,name,department,salary,joining_date
2,3,Ravi,HR,50000,2020-01-20
3,4,Pooja,HR,55000,2021-11-05
6,7,Rahul,Engineering,90000,2019-09-12
9,10,Meena,HR,52000,2022-06-20
10,11,Arjun,Engineering,85000,2023-01-10
13,14,Kavita,HR,54000,2019-02-14
16,17,Sunita,Engineering,82000,2021-12-01
17,18,Manoj,HR,51000,2020-10-10
19,20,Isha,Engineering,88000,2019-05-05


In [3]:
# Match a column against several categorical values with isin()
# (SQL: WHERE department IN ('Engineering', 'Finance'))
target_departments = ["Engineering", "Finance"]
in_target_departments = employees["department"].isin(target_departments)
employees[in_target_departments]

Unnamed: 0,emp_id,name,department,salary,joining_date
0,1,Amit,Engineering,80000,2021-06-15
1,2,Neha,Engineering,75000,2022-03-10
6,7,Rahul,Engineering,90000,2019-09-12
7,8,Anita,Finance,70000,2021-04-01
8,9,Vikram,Finance,72000,2020-12-15
10,11,Arjun,Engineering,85000,2023-01-10
12,13,Nikhil,Engineering,78000,2020-07-07
15,16,Deepak,Finance,75000,2022-09-09
16,17,Sunita,Engineering,82000,2021-12-01
19,20,Isha,Engineering,88000,2019-05-05


In [4]:
# Order rows by salary, lowest first (SQL: ORDER BY salary ASC)
employees.sort_values("salary", ascending=True)

Unnamed: 0,emp_id,name,department,salary,joining_date
2,3,Ravi,HR,50000,2020-01-20
17,18,Manoj,HR,51000,2020-10-10
9,10,Meena,HR,52000,2022-06-20
13,14,Kavita,HR,54000,2019-02-14
3,4,Pooja,HR,55000,2021-11-05
5,6,Suman,Sales,58000,2022-08-18
14,15,Rohan,Sales,59000,2023-05-30
4,5,Karan,Sales,60000,2023-02-01
18,19,Pankaj,Sales,61000,2022-01-15
11,12,Priya,Sales,62000,2021-03-25


In [5]:
# Order rows by salary, highest first (SQL: ORDER BY salary DESC)
salary_descending = employees.sort_values("salary", ascending=False)
salary_descending

Unnamed: 0,emp_id,name,department,salary,joining_date
6,7,Rahul,Engineering,90000,2019-09-12
19,20,Isha,Engineering,88000,2019-05-05
10,11,Arjun,Engineering,85000,2023-01-10
16,17,Sunita,Engineering,82000,2021-12-01
0,1,Amit,Engineering,80000,2021-06-15
12,13,Nikhil,Engineering,78000,2020-07-07
1,2,Neha,Engineering,75000,2022-03-10
15,16,Deepak,Finance,75000,2022-09-09
8,9,Vikram,Finance,72000,2020-12-15
7,8,Anita,Finance,70000,2021-04-01


In [6]:
# Order by department (A-Z), then by salary highest-first within each department
# (SQL: ORDER BY department ASC, salary DESC)
sort_keys = ["department", "salary"]
sort_ascending = [True, False]  # one flag per key, in the same order as sort_keys
employees.sort_values(by=sort_keys, ascending=sort_ascending)

Unnamed: 0,emp_id,name,department,salary,joining_date
6,7,Rahul,Engineering,90000,2019-09-12
19,20,Isha,Engineering,88000,2019-05-05
10,11,Arjun,Engineering,85000,2023-01-10
16,17,Sunita,Engineering,82000,2021-12-01
0,1,Amit,Engineering,80000,2021-06-15
12,13,Nikhil,Engineering,78000,2020-07-07
1,2,Neha,Engineering,75000,2022-03-10
15,16,Deepak,Finance,75000,2022-09-09
8,9,Vikram,Finance,72000,2020-12-15
7,8,Anita,Finance,70000,2021-04-01


In [7]:
# Create a new derived column (salary category from salary bands).
# Bands are right-inclusive, matching the original thresholds:
#   Low: salary <= 60000, Medium: 60000 < salary <= 80000, High: salary > 80000
SALARY_BAND_EDGES = [float("-inf"), 60000, 80000, float("inf")]
SALARY_BAND_LABELS = ["Low", "Medium", "High"]

# pd.cut buckets the whole column at once instead of calling a Python lambda
# per row. A missing salary stays missing rather than being labelled "Low".
# astype(object) keeps plain labels (not a categorical dtype) in the column.
employees["salary_category"] = pd.cut(
    employees["salary"],
    bins=SALARY_BAND_EDGES,
    labels=SALARY_BAND_LABELS,
).astype(object)

employees.head()

Unnamed: 0,emp_id,name,department,salary,joining_date,salary_category
0,1,Amit,Engineering,80000,2021-06-15,Medium
1,2,Neha,Engineering,75000,2022-03-10,Medium
2,3,Ravi,HR,50000,2020-01-20,Low
3,4,Pooja,HR,55000,2021-11-05,Low
4,5,Karan,Sales,60000,2023-02-01,Low


In [9]:
# Filter on joining date (employees who joined after 2021-01-01, i.e. this
# includes people who joined later in 2021).
# read_csv is called without parse_dates, so joining_date may still hold plain
# strings here. Parse it locally and compare against a Timestamp instead of
# relying on lexicographic string ordering; this gives the same rows whether
# or not the column has already been converted to datetime.
joined_on = pd.to_datetime(employees["joining_date"])
employees[joined_on > pd.Timestamp("2021-01-01")]

Unnamed: 0,emp_id,name,department,salary,joining_date,salary_category
0,1,Amit,Engineering,80000,2021-06-15,Medium
1,2,Neha,Engineering,75000,2022-03-10,Medium
3,4,Pooja,HR,55000,2021-11-05,Low
4,5,Karan,Sales,60000,2023-02-01,Low
5,6,Suman,Sales,58000,2022-08-18,Low
7,8,Anita,Finance,70000,2021-04-01,Medium
9,10,Meena,HR,52000,2022-06-20,Low
10,11,Arjun,Engineering,85000,2023-01-10,High
11,12,Priya,Sales,62000,2021-03-25,Medium
14,15,Rohan,Sales,59000,2023-05-30,Low


In [11]:
# Convert joining_date to datetime, then derive the calendar year from it.
# The parsed Series is kept in a local so it is computed once and used for
# both column assignments.
joined_on = pd.to_datetime(employees["joining_date"])
employees["joining_date"] = joined_on
employees["joining_year"] = joined_on.dt.year

employees.head()

Unnamed: 0,emp_id,name,department,salary,joining_date,salary_category,joining_year
0,1,Amit,Engineering,80000,2021-06-15,Medium,2021
1,2,Neha,Engineering,75000,2022-03-10,Medium,2022
2,3,Ravi,HR,50000,2020-01-20,Low,2020
3,4,Pooja,HR,55000,2021-11-05,Low,2021
4,5,Karan,Sales,60000,2023-02-01,Low,2023
