In [3]:
# First, let's import pandas - think of this as 'starting psql'
import pandas as pd

In [4]:
# Create the same employees data you've been using
employees_data = {
    'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'first_name': ['Ana', 'Ben', 'Cara', 'Dave', 'Ella', 'Frank', 'Grace', 'Henry', 'Iris', 'Jack'],
    'last_name': ['Gonzalez', 'Nguyen', 'Ivanova', 'Okafor', 'Schmidt', 'Chen', 'Williams', 'Johnson', 'Patel', 'Brown'],
    'dept': ['Sales', 'Marketing', 'Tech', 'Tech', 'HR', None, None, 'Finance', 'Tech', 'Sales'],
    'salary': [68000, 72000, 98000, 105000, 60000, 55000, 62000, 75000, 88000, None],
    'manager_id': [None, 1, 4, 4, 1, 1, 1, 10, 4, None],
    'hire_date': ['2015-01-10', '2018-06-22', '2022-03-15', '2017-11-05', '2019-09-01', 
                  '2020-12-12', '2023-04-18', '2016-07-03', '2023-01-09', '2021-05-20']
}

In [5]:
# Create DataFrame - this is like CREATE TABLE + INSERT in SQL
employees = pd.DataFrame(employees_data)

# Let's see what we created - like SELECT * FROM employees
employees  # In Jupyter, just typing the variable name displays it nicely

Unnamed: 0,id,first_name,last_name,dept,salary,manager_id,hire_date
0,1,Ana,Gonzalez,Sales,68000.0,,2015-01-10
1,2,Ben,Nguyen,Marketing,72000.0,1.0,2018-06-22
2,3,Cara,Ivanova,Tech,98000.0,4.0,2022-03-15
3,4,Dave,Okafor,Tech,105000.0,4.0,2017-11-05
4,5,Ella,Schmidt,HR,60000.0,1.0,2019-09-01
5,6,Frank,Chen,,55000.0,1.0,2020-12-12
6,7,Grace,Williams,,62000.0,1.0,2023-04-18
7,8,Henry,Johnson,Finance,75000.0,10.0,2016-07-03
8,9,Iris,Patel,Tech,88000.0,4.0,2023-01-09
9,10,Jack,Brown,Sales,,,2021-05-20


In [6]:
employees.group(dept).head(3)

AttributeError: 'DataFrame' object has no attribute 'group'

In [9]:
# Basic info about your DataFrame
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          10 non-null     int64  
 1   first_name  10 non-null     object 
 2   last_name   10 non-null     object 
 3   dept        8 non-null      object 
 4   salary      9 non-null      float64
 5   manager_id  8 non-null      float64
 6   hire_date   10 non-null     object 
dtypes: float64(2), int64(1), object(4)
memory usage: 692.0+ bytes


In [10]:
# SQL: SELECT * FROM employees WHERE dept = 'Tech'
tech_employees = employees[employees['dept'] == 'Tech']
tech_employees

Unnamed: 0,id,first_name,last_name,dept,salary,manager_id,hire_date
2,3,Cara,Ivanova,Tech,98000.0,4.0,2022-03-15
3,4,Dave,Okafor,Tech,105000.0,4.0,2017-11-05
8,9,Iris,Patel,Tech,88000.0,4.0,2023-01-09


In [14]:
# SQL: SELECT first_name, salary FROM employees WHERE salary > 80000
high_earners = employees[employees['salary'] > 80000][['first_name', 'salary']]
high_earners.sort_values('salary', ascending=False)

Unnamed: 0,first_name,salary
3,Dave,105000.0
2,Cara,98000.0
8,Iris,88000.0


In [19]:
emp = employees[employees['dept'].isin(['Tech', 'Sales'])][['first_name', 'dept','salary']]
emp.sort_values('salary', ascending=False)

Unnamed: 0,first_name,dept,salary
3,Dave,Tech,105000.0
2,Cara,Tech,98000.0
8,Iris,Tech,88000.0
0,Ana,Sales,68000.0
9,Jack,Sales,


In [34]:
yummi = employees[employees['salary'].notna()]
result = yummi.groupby('dept')['salary'].agg(['count', 'mean'])
result.sort_values('mean', ascending=False)
# Reset the index to make dept a regular column
result.reset_index()
# Rename the columns to be more readable
result.columns = ['employee_count', 'avg_salary']
result

Unnamed: 0_level_0,employee_count,avg_salary
dept,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,1,75000.0
HR,1,60000.0
Marketing,1,72000.0
Sales,1,68000.0
Tech,3,97000.0


In [57]:
no_empty_depts = employees[employees['dept'].notna()]
result = no_empty_depts.groupby('dept')['salary'].agg(['max', 'min'])
result

Unnamed: 0_level_0,max,min
dept,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,75000.0,75000.0
HR,60000.0,60000.0
Marketing,72000.0,72000.0
Sales,68000.0,68000.0
Tech,105000.0,88000.0


In [59]:
import numpy as np

In [65]:
employees['ranged'] = np.where(employees['salary'] > 90000, 'High', np.where(employees['salary'] < 70000, 'Low', 'Medium'))
employees
employees.groupby('ranged').size()

ranged
High      2
Low       4
Medium    4
dtype: int64

In [112]:
salary_by_dept = employees.groupby('dept')[['salary']].agg('sum') # get the grouped salaries
totalSalary = employees['salary'].sum() # get total salary
salary_by_dept['dept_pct'] = ((salary_by_dept['salary'] / totalSalary)* 100).round() #add column of pct
salary_by_dept #print

Unnamed: 0_level_0,salary,dept_pct
dept,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,75000.0,11.0
HR,60000.0,9.0
Marketing,72000.0,11.0
Sales,68000.0,10.0
Tech,291000.0,43.0


In [119]:
employees['new_salary'] = np.where(employees['salary'].isna(), 65000, employees['salary'] * 1.1)
employees

Unnamed: 0,id,first_name,last_name,dept,salary,manager_id,hire_date,ranged,new_salary
0,1,Ana,Gonzalez,Sales,68000.0,,2015-01-10,Low,74800.0
1,2,Ben,Nguyen,Marketing,72000.0,1.0,2018-06-22,Medium,79200.0
2,3,Cara,Ivanova,Tech,98000.0,4.0,2022-03-15,High,107800.0
3,4,Dave,Okafor,Tech,105000.0,4.0,2017-11-05,High,115500.0
4,5,Ella,Schmidt,HR,60000.0,1.0,2019-09-01,Low,66000.0
5,6,Frank,Chen,,55000.0,1.0,2020-12-12,Low,60500.0
6,7,Grace,Williams,,62000.0,1.0,2023-04-18,Low,68200.0
7,8,Henry,Johnson,Finance,75000.0,10.0,2016-07-03,Medium,82500.0
8,9,Iris,Patel,Tech,88000.0,4.0,2023-01-09,Medium,96800.0
9,10,Jack,Brown,Sales,,,2021-05-20,Medium,65000.0


In [131]:
new_table = employees.groupby('dept')['salary'].agg(count='count', avg_salary='mean', max_salary='max', min_salary='min')
new_table

Unnamed: 0_level_0,count,avg_salary,max_salary,min_salary
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finance,1,75000.0,75000.0,75000.0
HR,1,60000.0,60000.0,60000.0
Marketing,1,72000.0,72000.0,72000.0
Sales,1,68000.0,68000.0,68000.0
Tech,3,97000.0,105000.0,88000.0


In [4]:
import pandas as pd
import numpy as np

# Recreate the employees data
employees_data = {
    'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'first_name': ['Ana', 'Ben', 'Cara', 'Dave', 'Ella', 'Frank', 'Grace', 'Henry', 'Iris', 'Jack'],
    'last_name': ['Gonzalez', 'Nguyen', 'Ivanova', 'Okafor', 'Schmidt', 'Chen', 'Williams', 'Johnson', 'Patel', 'Brown'],
    'dept': ['Sales', 'Marketing', 'Tech', 'Tech', 'HR', None, None, 'Finance', 'Tech', 'Sales'],
    'salary': [68000, 72000, 98000, 105000, 60000, 55000, 62000, 75000, 88000, None],
    'manager_id': [None, 1, 4, 4, 1, 1, 1, 10, 4, None],
    'hire_date': ['2015-01-10', '2018-06-22', '2022-03-15', '2017-11-05', '2019-09-01', 
                  '2020-12-12', '2023-04-18', '2016-07-03', '2023-01-09', '2021-05-20']
}

employees = pd.DataFrame(employees_data)

In [8]:
# Create departments data
departments_data = {
    'dname': ['Tech', 'Sales', 'HR', 'Marketing', 'Legal', 'R&D'],
    'loc': ['Building A', 'Building B', 'Building C', 'Building B', 'Building D', 'Building A'],
    'budget': [500000, 300000, 150000, 250000, 200000, 450000]
}

departments = pd.DataFrame(departments_data)

In [17]:
new_table = employees.groupby('dept')['salary'].agg(count='count', avg_salary='mean', max_salary='max', min_salary='min')
new_table

Unnamed: 0_level_0,count,avg_salary,max_salary,min_salary
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Finance,1,75000.0,75000.0,75000.0
HR,1,60000.0,60000.0,60000.0
Marketing,1,72000.0,72000.0,72000.0
Sales,1,68000.0,68000.0,68000.0
Tech,3,97000.0,105000.0,88000.0


In [21]:
merged = pd.merge(employees, new_table[['avg_salary']], on='dept')
merged

Unnamed: 0,id,first_name,last_name,dept,salary,manager_id,hire_date,avg_salary
0,1,Ana,Gonzalez,Sales,68000.0,,2015-01-10,68000.0
1,2,Ben,Nguyen,Marketing,72000.0,1.0,2018-06-22,72000.0
2,3,Cara,Ivanova,Tech,98000.0,4.0,2022-03-15,97000.0
3,4,Dave,Okafor,Tech,105000.0,4.0,2017-11-05,97000.0
4,5,Ella,Schmidt,HR,60000.0,1.0,2019-09-01,60000.0
5,8,Henry,Johnson,Finance,75000.0,10.0,2016-07-03,75000.0
6,9,Iris,Patel,Tech,88000.0,4.0,2023-01-09,97000.0
7,10,Jack,Brown,Sales,,,2021-05-20,68000.0


In [24]:
merged['salary_status'] = np.where(merged['salary'] > 1.2 * merged['avg_salary'], 'High Outlier', np.where(merged['salary'] < 0.8 * merged['avg_salary'], 'Low Outlier','Normal'))
merged

Unnamed: 0,id,first_name,last_name,dept,salary,manager_id,hire_date,avg_salary,salary_status
0,1,Ana,Gonzalez,Sales,68000.0,,2015-01-10,68000.0,Normal
1,2,Ben,Nguyen,Marketing,72000.0,1.0,2018-06-22,72000.0,Normal
2,3,Cara,Ivanova,Tech,98000.0,4.0,2022-03-15,97000.0,Normal
3,4,Dave,Okafor,Tech,105000.0,4.0,2017-11-05,97000.0,Normal
4,5,Ella,Schmidt,HR,60000.0,1.0,2019-09-01,60000.0,Normal
5,8,Henry,Johnson,Finance,75000.0,10.0,2016-07-03,75000.0,Normal
6,9,Iris,Patel,Tech,88000.0,4.0,2023-01-09,97000.0,Normal
7,10,Jack,Brown,Sales,,,2021-05-20,68000.0,Normal


In [39]:
import datetime
merged['hire_date'] = pd.to_datetime(merged['hire_date'])
merged['years_employed'] = round((datetime.datetime.today() - merged['hire_date']).dt.days / 365.25)
merged['hire_quarter'] = merged['hire_date'].dt.quarter
merged

Unnamed: 0,id,first_name,last_name,dept,salary,manager_id,hire_date,avg_salary,salary_status,years_employed,hire_quarter
0,1,Ana,Gonzalez,Sales,68000.0,,2015-01-10,68000.0,Normal,11.0,1
1,2,Ben,Nguyen,Marketing,72000.0,1.0,2018-06-22,72000.0,Normal,7.0,2
2,3,Cara,Ivanova,Tech,98000.0,4.0,2022-03-15,97000.0,Normal,3.0,1
3,4,Dave,Okafor,Tech,105000.0,4.0,2017-11-05,97000.0,Normal,8.0,4
4,5,Ella,Schmidt,HR,60000.0,1.0,2019-09-01,60000.0,Normal,6.0,3
5,8,Henry,Johnson,Finance,75000.0,10.0,2016-07-03,75000.0,Normal,9.0,3
6,9,Iris,Patel,Tech,88000.0,4.0,2023-01-09,97000.0,Normal,3.0,1
7,10,Jack,Brown,Sales,,,2021-05-20,68000.0,Normal,4.0,2
