In [1]:
import pandas as pd

In [2]:
# Sample employee table; Sally Turner's salary is intentionally missing (None -> NaN).
df_nans = pd.DataFrame(
    {
        'employee': ['Bob Jenkins', 'Jane Willis', 'Sally Turner', 'William Jones', 'Steven Jobson'],
        'department': ['Marketing', 'HR', 'IT', 'Marketing', 'Marketing'],
        'salary': [44000, 48000, None, 38000, 38000],
    }
)

# Fill the missing salary with a default of 30000 so the aggregations below have no NaNs.
df_nans = df_nans.assign(salary=df_nans['salary'].fillna(30000))
df_nans

Unnamed: 0,employee,department,salary
0,Bob Jenkins,Marketing,44000.0
1,Jane Willis,HR,48000.0
2,Sally Turner,IT,30000.0
3,William Jones,Marketing,38000.0
4,Steven Jobson,Marketing,38000.0


# 1. Group By - counting columns

- Similar to SQL `COUNT(*)` combined with `GROUP BY`: one row count per distinct value

In [3]:
# Number of rows for each distinct department, most frequent first.
department_counts = df_nans['department'].value_counts()
department_counts

Marketing    3
HR           1
IT           1
Name: department, dtype: int64

In [4]:
# Number of rows for each distinct salary value, most frequent first.
salary_counts = df_nans['salary'].value_counts()
salary_counts

38000.0    2
44000.0    1
48000.0    1
30000.0    1
Name: salary, dtype: int64

# 2. Group By - counting by percentage

- In this example, `normalize=True` returns each count as a fraction of the total (e.g. 0.6 for 60%)
- We then multiply by 100, round, and format the result as a percentage string to tidy the results

In [5]:
# normalize=True yields each department's share of the rows (a fraction between 0 and 1)
# instead of the raw count.
department_column = df_nans['department']
department_percentage = department_column.value_counts(normalize=True)
department_percentage

Marketing    0.6
HR           0.2
IT           0.2
Name: department, dtype: float64

In [6]:
# Convert raw percentage to a % string....

# Convert to a dataframe, get percentage as string and delete original percentage figure
def convert_percentage_string(x):
    """Format a fraction as a whole-number percentage string.

    Parameters
    ----------
    x : number
        Fraction to format, e.g. 0.6.

    Returns
    -------
    str
        ``x * 100`` rounded to the nearest integer with a trailing ``%``
        (e.g. ``'60%'``). If ``x`` cannot be scaled and rounded (None,
        text, NaN, infinity), ``x`` is returned unchanged.
    """
    try:
        return str(round(x * 100)) + '%'
    except (TypeError, ValueError, OverflowError):
        # TypeError: non-numeric input; ValueError: NaN; OverflowError: infinity.
        # Only these are swallowed so that interrupts and unrelated failures
        # still propagate instead of being silently hidden.
        return x
    
# Format each proportion as a percentage string and keep only that column.
# Operating on the Series directly (instead of looking the values up under a
# 'department' column) avoids depending on the name value_counts() gives its
# result, which differs between pandas versions ('department' before 2.0,
# 'proportion' from 2.0 onwards).
department_percentage = (
    department_percentage
    .apply(convert_percentage_string)
    .to_frame(name='Percentage')
)

department_percentage

Unnamed: 0,Percentage
Marketing,60%
HR,20%
IT,20%


# 3. Group By - SUM of a group

In [7]:
# Group by department and total the salaries.
# 'salary' is selected explicitly so the non-numeric 'employee' column stays
# out of the aggregation (newer pandas would otherwise concatenate the names).
# reset_index() turns the 'department' group labels back into a regular column.
df_salary_totals = (
    df_nans
    .groupby('department')['salary']
    .sum()
    .reset_index()
)

df_salary_totals

Unnamed: 0,department,salary
0,HR,48000.0
1,IT,30000.0
2,Marketing,120000.0


# 4. Group By - AVERAGE of a group

In [8]:
# Group by department and average the salaries.
# 'salary' is selected explicitly so the non-numeric 'employee' column stays
# out of the aggregation (newer pandas raises a TypeError when asked to
# average strings).
# reset_index() turns the 'department' group labels back into a regular column.
df_salary_averages = (
    df_nans
    .groupby('department')['salary']
    .mean()
    .reset_index()
)

df_salary_averages

Unnamed: 0,department,salary
0,HR,48000.0
1,IT,30000.0
2,Marketing,40000.0


# 5. Joins - Inner Join dataframes

- We have 3 dataframes
- A salespeople dataframe, a products dataframe and a sales dataframe.
- The sales dataframe records the salesperson_id and product_id for each sale.
- We can use **pd.merge()** to perform joins in much the same way as we would with SQL.

In [9]:
# Lookup table of salespeople, keyed by salesperson_id.
salespeople = dict(
    salesperson_id=[1, 2, 3, 4],
    name=["Bob", "Janet", "Claire", "Steve"],
)
df_salespeople = pd.DataFrame(data=salespeople)

df_salespeople

Unnamed: 0,salesperson_id,name
0,1,Bob
1,2,Janet
2,3,Claire
3,4,Steve


In [10]:
# Lookup table of products, keyed by product_id.
products = dict(
    product_id=[1, 2],
    name=["Laptop", "Desktop PC"],
)
df_products = pd.DataFrame(data=products)

df_products

Unnamed: 0,product_id,name
0,1,Laptop
1,2,Desktop PC


In [11]:
# One row per sale, referencing the product sold and the salesperson who sold it.
sales = dict(
    sales_id=[1, 2, 3],
    product_id=[2, 2, 1],
    salesperson_id=[3, 4, 2],
)
df_sales = pd.DataFrame(data=sales)

df_sales

Unnamed: 0,sales_id,product_id,salesperson_id
0,1,2,3
1,2,2,4
2,3,1,2


## Table merging (inner join)

In [12]:
# Inner join: attach each sale's salesperson name by matching on salesperson_id.
df_sales_with_salesperson_names = df_sales.merge(
    df_salespeople,
    on="salesperson_id",
    how="inner",
)
df_sales_with_salesperson_names

Unnamed: 0,sales_id,product_id,salesperson_id,name
0,1,2,3,Claire
1,2,2,4,Steve
2,3,1,2,Janet


In [13]:
# Inner join: attach the product details by matching on product_id.
# Both inputs carry a 'name' column, so pandas disambiguates them with its
# default suffixes: name_x (salesperson, left side) and name_y (product, right side).
df_sales_with_information = df_sales_with_salesperson_names.merge(
    df_products,
    on="product_id",
    how="inner",
)
df_sales_with_information

Unnamed: 0,sales_id,product_id,salesperson_id,name_x,name_y
0,1,2,3,Claire,Desktop PC
1,2,2,4,Steve,Desktop PC
2,3,1,2,Janet,Laptop


# 6. Joins - Left Join  dataframes

In [14]:
# Salespeople contact details dataframe
df_salespeople_emails = pd.DataFrame({
    "salesperson_id":[1,2,3],
    "email":["bob@team.org","janet@team.org","claire@team.org"]
})
df_salespeople_emails

Unnamed: 0,salesperson_id,email
0,1,bob@team.org
1,2,janet@team.org
2,3,claire@team.org


In [15]:
# Left join: keep every salesperson and add an email where one exists.
# Salespeople without a matching email row (Steve, in this data) get a missing value.
df_salespeople_info = df_salespeople.merge(
    df_salespeople_emails,
    on="salesperson_id",
    how="left",
)
df_salespeople_info

Unnamed: 0,salesperson_id,name,email
0,1,Bob,bob@team.org
1,2,Janet,janet@team.org
2,3,Claire,claire@team.org
3,4,Steve,
