In [1]:
import pandas as pd
from datetime import datetime
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

## CONTENTS:
1. Data Frame - Creation with a Dictionary
2. Data Frame - Creation from a list of lists
3. Data Frame - Creation from a list of lists - with unique Index
4. Data Frame - Creation from a CSV or Excel spreadsheet
5. Data Types - exploring column data types
6. Columns - Remove an unnecessary column
7. Sort - apply a sort to an existing dataframe
8. Index - reset index numbers after a sort
9. Index - Drop an Index from a dataframe when exporting
10. Find a specific row - based on a search string
11. Find rows based on str.contains (similar to SQL like)
12. Find rows based on filtering numbers (greater than)
13. Find a specific row - based on Index Number
14. Find specific cell data - based on Index Number
15. Find specific cell data - based on a search
16. Series - convert a series to Dataframe and a List
17. Add a row
18. Add a row - based on previous row data
19. Update a cell - based on Index Number
20. Columns - adding a new Column
21. Columns - deleting a column
22. Columns - create a new column from a sum
23. Columns - create a new column with the APPLY() method
24. Columns - change data type - FLOAT to INTEGER
25. Columns - create a new string column based on logic in APPLY() method
26. Columns and Dates - create a year and month column from a date
27. Columns - change the names of your columns
28. Data Frame - use the APPLY() method with a LAMBDA function on an entire dataframe
29. NaN - use ISNULL() to find null values
30. NaN - use na=False to find NaN AND non-string values
31. NaN - convert missing numerical data to 0 with FILLNA()
32. Group By - counting columns
33. Group By - counting by percentage
34. Pivot - PIVOT() example
35. Pivot 2 - PIVOT() example with grouped dates sales
36. Head and Tail
37. Pivot - PIVOT_TABLE() - aggregated functions
38. Pivot table percentages - FLOAT to 2 decimal places
39. openpyxl - save dataframes to different Excel workbook sheets

# 1. Data Frame - Creation with a Dictionary
When using a dictionary to create a dataframe, each key becomes a column name. The list assigned to each key is that column's data.

In [2]:
# Each dictionary key becomes a column label; each list holds that column's values
european_cities = dict(
    cities=["Istanbul", "Moscow", "Paris", "London", "Madrid"],
    population=[15_000_000, 12_000_000, 11_000_000, 9_000_000, 6_000_000],
)
df = pd.DataFrame(data=european_cities)
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# 2. Data Frame - Creation from a list of lists

In [3]:
# Each inner list is one row of the dataframe: [city name, population]
european_cities_list = [
    ["Istanbul", 15_000_000],
    ["Moscow", 12_000_000],
    ["Paris", 11_000_000],
    ["London", 9_000_000],
    ["Madrid", 6_000_000],
]
# A list of lists carries no column names, so they are supplied separately
df = pd.DataFrame(data=european_cities_list, columns=["cities", "population"])
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# 3. Data Frame - Creation from a list of lists - with unique Index
You can set up the dataframe to have a unique index instead of an auto-incremented number.

In [4]:
# Use the city names as row labels in place of the default 0..n-1 integer index
df_unq_index = pd.DataFrame(
    data={'population': [15000000, 12000000, 11000000]},
    index=['Istanbul', 'Moscow', 'Paris'],
)
df_unq_index

Unnamed: 0,population
Istanbul,15000000
Moscow,12000000
Paris,11000000


# 4. Data Frame - Creation from a CSV or Excel spreadsheet

- See below for more options working with Excel using openpyxl.
- See 'Drop an Index from a dataframe when exporting' to drop an index when exporting to CSV or XLSX files.

In [5]:
# Save the population dataframe to a csv
df.to_csv('populations.csv')
# OPEN from a csv file
df = pd.read_csv('populations.csv')
df

Unnamed: 0.1,Unnamed: 0,cities,population
0,0,Istanbul,15000000
1,1,Moscow,12000000
2,2,Paris,11000000
3,3,London,9000000
4,4,Madrid,6000000


# 5. Data Types - exploring column data types

- Strings are classed as 'object'

In [6]:
# One row per product; each column holds a different Python type so that
# .dtypes shows how pandas classifies strings, ints, floats, bools and dates
delivery_date = datetime(2022, 2, 15)
food_rows = [
    ["Chicken", 3, 14.5, False, delivery_date],
    ["Carrots", 2, 23.5, True, delivery_date],
    ["Bread", 1, 18, True, delivery_date],
]
food_columns = ["Product", "Price", "Discount", "In stock", "Next delivery date"]
df_food = pd.DataFrame(food_rows, columns=food_columns)
df_food.dtypes

Product                       object
Price                          int64
Discount                     float64
In stock                        bool
Next delivery date    datetime64[ns]
dtype: object

# 6. Columns - Remove an unnecessary column

- See 'Index' below for how to drop an index when exporting to csv or xlsx files.

In [7]:
del df['Unnamed: 0']
df

Unnamed: 0,cities,population
0,Istanbul,15000000
1,Moscow,12000000
2,Paris,11000000
3,London,9000000
4,Madrid,6000000


# 7. Sort - apply a sort to existing dataframe

In [8]:
df.sort_values(by=["population"], ascending=True, inplace=True)
df

Unnamed: 0,cities,population
4,Madrid,6000000
3,London,9000000
2,Paris,11000000
1,Moscow,12000000
0,Istanbul,15000000


# 8. Index - reset index numbers after a sort (or any function)

In [9]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000


# 9. Index - Drop an Index from a dataframe when exporting
- Before saving a dataframe to another format such as an Excel spreadsheet, you may want to remove the index from the data frame.

In [10]:
df.to_csv('populations2.csv', index=False)

# 10. Find a specific row - based on a search string

We want to find a row/ rows where the cities column is "Paris".
- This will return a DataFrame

In [11]:
df[df["cities"] == "Paris"]

Unnamed: 0,cities,population
2,Paris,11000000


# 11. Find rows based on str.contains (similar to SQL like)
- Here we want to find companies that have profit losses in billions (marked with a B).
- We can use str.contains to filter strings.

In [12]:
df_global_sales = pd.DataFrame(
    {
        "company":["CompA","CompB"],
        "sales":["$12.2 B","$94.2 M"],
        "profit":["$3.2 B","$-2.5 B"]
    }
)
df_global_sales[df_global_sales['profit'].str.contains('-') & df_global_sales['profit'].str.contains('B')]

Unnamed: 0,company,sales,profit
1,CompB,$94.2 M,$-2.5 B


# 12. Find rows based on filtering numbers (greater than)

In [13]:
df[df['population'] > 10000000]

Unnamed: 0,cities,population
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000


In [14]:
# Multiple conditions - wrap each condition in parentheses and combine them with & (and) or | (or)
df[(df['population'] > 10000000) & (df['cities'] != 'Istanbul')]

Unnamed: 0,cities,population
2,Paris,11000000
3,Moscow,12000000


# 13. Find a specific row - based on Index Number

Here you can find row data based on index number.

In [15]:
# Returns a <class 'pandas.core.series.Series'>
df.loc[2]

cities           Paris
population    11000000
Name: 2, dtype: object

# 14. Find specific cell data - based on Index Number

In [16]:
# Select the row label and the column in a single .loc call rather than chaining two lookups
df.loc[2, "population"]

11000000

# 15. Find specific cell data - based on a search

- Here we find only the 'Sales Total' results for the 'Salesperson' named 'Paul'.
- The steps are broken down to show the type returned at each stage.

In [17]:
df_sp = pd.DataFrame([["Paul",50],["Stacy",100],["Paul",80]], columns=["Salesperson","Sales Total"])
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


In [18]:
# Get all index numbers where "Salesperson" == "Paul"
paul_rows = df_sp[df_sp["Salesperson"] == "Paul"].index
print(type(paul_rows))
print(paul_rows)

# Pass the "paul_rows" index numbers and the column name to a single 'df.loc[]' call to return a series
# (one .loc[rows, column] lookup instead of chaining .loc[rows][column])
paul_sales_totals = df_sp.loc[paul_rows, "Sales Total"]
print(type(paul_sales_totals))
print(paul_sales_totals)

# Convert series to a data frame
pd.DataFrame(paul_sales_totals)

<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([0, 2], dtype='int64')
<class 'pandas.core.series.Series'>
0    50
2    80
Name: Sales Total, dtype: int64


Unnamed: 0,Sales Total
0,50
2,80


# 16. Series - convert a series to Dataframe and a List

- Selecting a single column returns a Series rather than a DataFrame
- You can also convert a Series to a tuple, set or dictionary

In [19]:
# Rows 0 and 1 of the single 'population' column, selected in one .loc call -> a Series
series_population = df.loc[[0, 1], 'population']
type(series_population)

pandas.core.series.Series

In [20]:
# Convert series to a dataframe
pd.DataFrame(series_population)

Unnamed: 0,population
0,6000000
1,9000000


In [21]:
# Convert a series to a list - returns list of series values
list(series_population)

[6000000, 9000000]

# 17. Add a row

In [22]:
# First get latest row number
next_row = len(df) 
print(next_row)
# Create a new row and add data
df.loc[next_row] = ['Berlin',3700000]
df

5


Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000
5,Berlin,3700000


# 18. Add a row - based on previous row data

- Here we take the previous 'Investment Total' figure.
- We round it down to the nearest hundred.
- We calculate 2% of the rounded number.
- We then add the 2% to the previous 'Investment Total' in a newly created row:

In [23]:
import math

# Function to round figure down to the nearest hundred
def round_down_to_hundred(total):
    """Round ``total`` down to the nearest multiple of 100.

    Args:
        total: The number (int or float) to round down.

    Returns:
        int: The largest multiple of 100 that does not exceed ``total``.
        Any value below 100 (including negative values) returns 0.
    """
    if total < 100:
        return 0
    # Plain integer arithmetic instead of building the result by string concatenation
    return math.floor(total / 100) * 100

#Test function works
round_down_to_hundred(23706.0)

23700

In [24]:
# New data frame
df_investment = pd.DataFrame([[45625], [45735]], columns=["Investment Total"])

# Get 'Investment Total' from the most recent (last) row.
# .iloc[-1] selects by position, so it works whatever the index labels are
# and avoids chaining two lookups (.loc[row][column]).
current_total = df_investment["Investment Total"].iloc[-1]

# Get next row number
next_row = len(df_investment)

# Create the next row adding 2% of the current total (rounded down to the nearest hundred) to the current total
df_investment.loc[next_row] = [current_total + (round_down_to_hundred(current_total) * 0.02)]

df_investment

Unnamed: 0,Investment Total
0,45625.0
1,45735.0
2,46649.0


# 19. Update a cell - based on Index Number

In [25]:
# Update Madrid population and change it back.
df.at[0,'population'] = 5000000
print(df)
df.at[0,'population'] = 6000000
df

     cities  population
0    Madrid     5000000
1    London     9000000
2     Paris    11000000
3    Moscow    12000000
4  Istanbul    15000000
5    Berlin     3700000


Unnamed: 0,cities,population
0,Madrid,6000000
1,London,9000000
2,Paris,11000000
3,Moscow,12000000
4,Istanbul,15000000
5,Berlin,3700000


# 20. Columns - adding a new Column

In [26]:
# Using df_sp (Salesperson sales totals from above)
# All columns will have the same data
df_sp['Example Column'] = 10
df_sp

Unnamed: 0,Salesperson,Sales Total,Example Column
0,Paul,50,10
1,Stacy,100,10
2,Paul,80,10


# 21. Columns - Deleting a column

In [27]:
del df_sp['Example Column']
df_sp

Unnamed: 0,Salesperson,Sales Total
0,Paul,50
1,Stacy,100
2,Paul,80


# 22. Columns - create a new column from a sum

- Here we are going to create a 'Sales Quantity' column based on an assumption that the items being sold are valued at 5 each.

In [28]:
df_sp['Sales Quantity'] = df_sp['Sales Total'] / 5
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity
0,Paul,50,10.0
1,Stacy,100,20.0
2,Paul,80,16.0


# 23. Columns - create a new column with the APPLY() method

- Here we create a new column based on applying a function to existing column data.

In [29]:
def divide_by_10(sales_total):
    """Return ``sales_total`` divided by 10 (true division)."""
    return sales_total / 10
df_sp['New Sales Quantity'] = df_sp['Sales Total'].apply(divide_by_10)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity
0,Paul,50,10.0,5.0
1,Stacy,100,20.0,10.0
2,Paul,80,16.0,8.0


# 24. Columns - change data type - FLOAT to INTEGER

- We have some columns containing floats and some containing ints after previous column creation with calculations.
- In this case, we know the sum result will be an int and we want to convert from float to an int

In [30]:
df_sp.dtypes

Salesperson            object
Sales Total             int64
Sales Quantity        float64
New Sales Quantity    float64
dtype: object

- We can change the type of columns.

In [31]:
df_sp['Sales Quantity'] = df_sp['Sales Quantity'].astype(int)
df_sp['New Sales Quantity'] = df_sp['New Sales Quantity'].astype(int)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity
0,Paul,50,10,5
1,Stacy,100,20,10
2,Paul,80,16,8


# 25. Columns - create a new string column based on logic in APPLY() method

- Here we create a new string column based on sales performance.

In [32]:
def good_bad(sales_total):
    """Classify a sales total as 'Good' or 'Bad'.

    Args:
        sales_total: The value to classify.

    Returns:
        str: 'Good' when ``sales_total`` is greater than 60, 'Bad' otherwise,
        or 'N/A' when the value cannot be compared with a number
        (for example a string or None).
    """
    try:
        if sales_total > 60:
            return 'Good'
        else:
            return 'Bad'
    # Only catch the errors a failed comparison raises; a bare `except:` would
    # also swallow KeyboardInterrupt and hide genuine bugs.
    except (TypeError, ValueError):
        return 'N/A'
    
df_sp['Performance'] = df_sp['Sales Total'].apply(good_bad)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance
0,Paul,50,10,5,Bad
1,Stacy,100,20,10,Good
2,Paul,80,16,8,Good


# 26. Columns and Dates - create a year and month column from a date

In [33]:
df_sp['Date'] = datetime(2022,1,7)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance,Date
0,Paul,50,10,5,Bad,2022-01-07
1,Stacy,100,20,10,Good,2022-01-07
2,Paul,80,16,8,Good,2022-01-07


In [34]:
def return_month(x):
    """Return the month name of ``x`` (e.g. 'January').

    Args:
        x: A value exposing ``month_name()``, such as a pandas Timestamp.

    Returns:
        The result of ``x.month_name()``, or ``x`` unchanged when it has
        no ``month_name`` method.
    """
    try:
        return x.month_name()
    # Only a missing method is expected here; a bare `except:` would hide other errors
    except AttributeError:
        return x
    
def return_year(x):
    """Return the year of ``x``.

    Args:
        x: A value exposing a ``year`` attribute, such as a pandas Timestamp
            or a ``datetime``.

    Returns:
        ``x.year``, or ``x`` unchanged when it has no ``year`` attribute.
    """
    try:
        return x.year
    # Only a missing attribute is expected here; a bare `except:` would hide other errors
    except AttributeError:
        return x

df_sp['Month'] = df_sp['Date'].apply(return_month)
df_sp['Year'] = df_sp['Date'].apply(return_year)
df_sp

Unnamed: 0,Salesperson,Sales Total,Sales Quantity,New Sales Quantity,Performance,Date,Month,Year
0,Paul,50,10,5,Bad,2022-01-07,January,2022
1,Stacy,100,20,10,Good,2022-01-07,January,2022
2,Paul,80,16,8,Good,2022-01-07,January,2022


# 27. Columns - change the names of your columns

- Often, if you've pulled the data from somewhere else to create a dataframe, you might want to rename your columns.

In [35]:
df_change_cols = pd.DataFrame([["Steve",100],["Jenny",120]], columns=["A","B"])
df_change_cols

Unnamed: 0,A,B
0,Steve,100
1,Jenny,120


In [36]:
df_change_cols.columns = ["Name", "Test Score"]
df_change_cols

Unnamed: 0,Name,Test Score
0,Steve,100
1,Jenny,120


# 28. Data Frame - use the APPLY() method with a LAMBDA function on an entire dataframe

- It's more common to use the apply method on individual columns but you can apply functions to an entire dataframe.
- NOTE! This doesn't affect the existing dataframe - so you must ASSIGN to a dataframe.

In [37]:
df_nums = pd.DataFrame([[1,2],[3,4]], columns=['A','B'])
print(df_nums)
df_nums = df_nums.apply(lambda x: x + 5)
df_nums

   A  B
0  1  2
1  3  4


Unnamed: 0,A,B
0,6,7
1,8,9


# 29. NaN - use ISNULL() to find null values
- This will only find NaN values BUT see below to find NaN values and non-string values.

In [38]:
import numpy as np
df_find_nans = pd.DataFrame({"test_col":['$26 M','$32 B',np.nan,0]})
df_find_nans

Unnamed: 0,test_col
0,$26 M
1,$32 B
2,
3,0


In [39]:
# Find null values in a column
df_find_nans[df_find_nans['test_col'].isnull()]

Unnamed: 0,test_col
2,


# 30. NaN - use na=False to find NaN AND non-strings
- Here we want to test that all strings contain a dollar sign plus M or B.
- We have to set na=False in the str.contains method so that NaN and any non-string values evaluate to False and are returned by the filter.

In [40]:
# regex=False matches a literal "$"; as a regular expression "$" means end-of-string and matches every string.
# na=False makes NaN and non-string values evaluate to False, so inverting the mask (~) returns them.
df_find_nans[~df_find_nans['test_col'].str.contains('$', regex=False, na=False)]

Unnamed: 0,test_col
2,
3,0.0


# 31. NaN - convert missing numerical data to 0 with FILLNA()
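- NaN means missing data
- Any blank cells from a spreadsheet will be returned as NaN
- fillna() accepts any replacement value: pass 0 to zero-fill, or a default such as the 30000 used in the example below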

In [41]:
df_nans = pd.DataFrame({
    'employee':['Bob Jenkins','Jane Willis','Sally Turner','William Jones'],
    'department':['Marketing','HR','IT','Marketing'],
    'salary':[39000,48000,None,39000]
})
df_nans

Unnamed: 0,employee,department,salary
0,Bob Jenkins,Marketing,39000.0
1,Jane Willis,HR,48000.0
2,Sally Turner,IT,
3,William Jones,Marketing,39000.0


In [42]:
df_nans['salary'] = df_nans['salary'].fillna(30000)
df_nans

Unnamed: 0,employee,department,salary
0,Bob Jenkins,Marketing,39000.0
1,Jane Willis,HR,48000.0
2,Sally Turner,IT,30000.0
3,William Jones,Marketing,39000.0


# 32. Group By - counting columns

- Here we can group instances and count them, similar to SQL grouping.

In [43]:
df_nans['department'].value_counts()

Marketing    2
HR           1
IT           1
Name: department, dtype: int64

In [44]:
df_nans['salary'].value_counts()

39000.0    2
48000.0    1
30000.0    1
Name: salary, dtype: int64

# 33. Group By - counting by percentage

- In this example, normalize=True returns each count as a proportion of the total (e.g. 0.25 for 25%)
- We then multiply by 100 and create a percentage string to tidy the results

In [45]:
department_percentage = df_nans['department'].value_counts(normalize=True)
department_percentage

Marketing    0.50
HR           0.25
IT           0.25
Name: department, dtype: float64

In [46]:
# Convert to a dataframe, get percentage as string and delete original percentage figure
def convert_percentage_string(x):
    """Convert a proportion such as 0.25 into a percentage string such as '25%'.

    Args:
        x: A proportion (e.g. 0.5 for 50%).

    Returns:
        str: ``x * 100`` rounded to the nearest whole number, followed by '%'.
        Values that cannot be scaled and rounded (strings, None, NaN,
        infinity) are returned unchanged.
    """
    try:
        x = round(x * 100)
        return str(x) + '%'
    # TypeError: non-numeric input; ValueError: NaN; OverflowError: infinity.
    # A bare `except:` would also swallow KeyboardInterrupt and hide real bugs.
    except (TypeError, ValueError, OverflowError):
        return x
    
# Name the value column explicitly: value_counts(normalize=True) calls it
# 'department' in pandas < 2.0 but 'proportion' in pandas >= 2.0, so relying
# on the automatic name breaks when pandas is upgraded.
department_percentage = department_percentage.to_frame(name='proportion')
department_percentage['Percentage'] = department_percentage['proportion'].apply(convert_percentage_string)

# Drop the raw proportion now that the formatted string column exists
del department_percentage['proportion']

department_percentage

Unnamed: 0,Percentage
Marketing,50%
HR,25%
IT,25%


# 34. Pivot - PIVOT() example

- Here we have a 'pivotable' table with shop and item values repeating.
- We want to group together shops and items to see how each is performing.

In [47]:
shop_sales = [["New York","Shoes",3000],["New York","Jackets",4000],["London","Shoes",2000],["London","Jackets",5000]]
df_shop = pd.DataFrame(shop_sales, columns=["Shop","Item","Sales"])
df_shop

Unnamed: 0,Shop,Item,Sales
0,New York,Shoes,3000
1,New York,Jackets,4000
2,London,Shoes,2000
3,London,Jackets,5000


In [48]:
# See how many items are sold in each shop
df_shop.pivot(index="Shop", columns="Item", values="Sales")

Item,Jackets,Shoes
Shop,Unnamed: 1_level_1,Unnamed: 2_level_1
London,5000,2000
New York,4000,3000


In [49]:
# Here we pivot the other way
df_shop.pivot(index="Item", columns="Shop", values="Sales")

Shop,London,New York
Item,Unnamed: 1_level_1,Unnamed: 2_level_1
Jackets,5000,4000
Shoes,2000,3000


# 35. Pivot 2 - PIVOT() example with grouped dates sales

- Here we look at how many sales were performed per salesperson per date

In [50]:
df_dates_arr = [
    ["Dave",'2021-11-17',40],
    ["Sally",'2021-11-17',0],
    ["Bob",'2021-11-17',200],
    ["Dave",'2021-11-18',30],
    ["Sally",'2021-11-18',10],
    ["Bob",'2021-11-18',210],
    ["Dave",'2021-11-19',40],
    ["Sally",'2021-11-19',390],
    ["Bob",'2021-11-19',140]
]
df_dates = pd.DataFrame(df_dates_arr, columns=["Salesperson","Date","Sales"])
df_dates

Unnamed: 0,Salesperson,Date,Sales
0,Dave,2021-11-17,40
1,Sally,2021-11-17,0
2,Bob,2021-11-17,200
3,Dave,2021-11-18,30
4,Sally,2021-11-18,10
5,Bob,2021-11-18,210
6,Dave,2021-11-19,40
7,Sally,2021-11-19,390
8,Bob,2021-11-19,140


In [51]:
df_dates.pivot(index="Date", columns="Salesperson", values="Sales")

Salesperson,Bob,Dave,Sally
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-11-17,200,40,0
2021-11-18,210,30,10
2021-11-19,140,40,390


In [52]:
# We can also pivot the other way with salesperson as the index
df_dates.pivot(index="Salesperson", columns="Date", values="Sales")

Date,2021-11-17,2021-11-18,2021-11-19
Salesperson,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bob,200,210,140
Dave,40,30,40
Sally,0,10,390


# 36. Head and Tail

- head(n) and tail(n) return the first n and last n rows; once the data is sorted by 'Sales Total' in descending order, these are the n highest and n lowest totals

In [53]:
sales_data = [
    ["Credit Card","Member","Technology",200,],
    ["Gift Card","Normal","Health and Beauty",100],
    ["Cash","Normal","Technology",120],
    ["Credit Card","Normal","Technology",300],
    ["Credit Card","Member","Health and Beauty",200],
    ["Gift Card","Member","Health and Beauty",100],
    ["Credit Card","Normal","Garden",100],
    ["Cash","Member","Health and Beauty",50],
    ["Credit Card","Normal","Garden",60],
    ["Gift Card","Member","Technology",120],
    ["Cash","Normal","Technology",30],
    ["Gift Card","Deluxe","Health and Beauty",300]
]
df_sales_data = pd.DataFrame(sales_data, columns=["Payment Type","Membership Level","Department","Sales Total"])
df_sales_data

Unnamed: 0,Payment Type,Membership Level,Department,Sales Total
0,Credit Card,Member,Technology,200
1,Gift Card,Normal,Health and Beauty,100
2,Cash,Normal,Technology,120
3,Credit Card,Normal,Technology,300
4,Credit Card,Member,Health and Beauty,200
5,Gift Card,Member,Health and Beauty,100
6,Credit Card,Normal,Garden,100
7,Cash,Member,Health and Beauty,50
8,Credit Card,Normal,Garden,60
9,Gift Card,Member,Technology,120


In [54]:
# Sort data
df_sales_data.sort_values(by="Sales Total", ascending=False, inplace=True)
# 3 highest transactions
df_sales_data.head(3)

Unnamed: 0,Payment Type,Membership Level,Department,Sales Total
3,Credit Card,Normal,Technology,300
11,Gift Card,Deluxe,Health and Beauty,300
0,Credit Card,Member,Technology,200


In [55]:
# 3 lowest transactions
df_sales_data.tail(3)

Unnamed: 0,Payment Type,Membership Level,Department,Sales Total
8,Credit Card,Normal,Garden,60
7,Cash,Member,Health and Beauty,50
10,Cash,Normal,Technology,30


# 37. Pivot - PIVOT_TABLE() - aggregated functions

- You can specify an aggregate function easily in pandas pivot tables for analysis.

In [56]:
# Payment type totals sorted
df_sales_data.pivot_table(index=["Payment Type"],values=["Sales Total"],aggfunc="sum").sort_values(by="Sales Total", ascending=False)

Unnamed: 0_level_0,Sales Total
Payment Type,Unnamed: 1_level_1
Credit Card,860
Gift Card,620
Cash,200


In [57]:
# Departments by sales - get top performing department
df_top_dept = df_sales_data.pivot_table(index=["Department"], values=["Sales Total"], aggfunc="sum").sort_values(by="Sales Total",ascending=False)
df_top_dept.head(1)

Unnamed: 0_level_0,Sales Total
Department,Unnamed: 1_level_1
Technology,770


In [58]:
# Combining groups of results
# Here we get the results based on department and payment type
# aggfunc is given as the string "sum" (as in the cells above) rather than the Python builtin `sum`
by_dept_payment_type = df_sales_data.pivot_table(
    index=["Department"],
    columns=["Payment Type"],
    values=["Sales Total"],
    aggfunc="sum",
)
by_dept_payment_type

Unnamed: 0_level_0,Sales Total,Sales Total,Sales Total
Payment Type,Cash,Credit Card,Gift Card
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Garden,,160.0,
Health and Beauty,50.0,200.0,500.0
Technology,150.0,500.0,120.0


- The above result has NaN where 0 sales were made in a department... so we can use fillna() again to tidy the results.
- We also want to return the totals as ints instead of floats...

In [59]:
# Replace NaN (no sales) with 0, then cast the totals from float to int.
# Every value column sits under "Sales Total", so the whole frame can be converted
# in one chained step instead of mutating in place and assigning a sub-frame back.
by_dept_payment_type = by_dept_payment_type.fillna(0).astype(int)
by_dept_payment_type

Unnamed: 0_level_0,Sales Total,Sales Total,Sales Total
Payment Type,Cash,Credit Card,Gift Card
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Garden,0,160,0
Health and Beauty,50,200,500
Technology,150,500,120


# 38. Pivot table percentages - FLOAT to 2 decimal places

- Here we create a pivot table to see how much each membership level accounted for.
- Then we create a column calculating the percentage of sales, rounding it to 2 decimal places and converting it to a string.

In [60]:
df_membership_level = df_sales_data.pivot_table(index=["Membership Level"],values=["Sales Total"],aggfunc="sum").sort_values(by="Sales Total", ascending=False)
df_membership_level

Unnamed: 0_level_0,Sales Total
Membership Level,Unnamed: 1_level_1
Normal,710
Member,670
Deluxe,300


In [61]:
# Create a new column with a percentage of sales
sales_share = df_membership_level['Sales Total'] / df_membership_level['Sales Total'].sum() * 100
# "{:.2f}" always renders exactly 2 decimal places (e.g. 50.0 -> "50.00 %"),
# whereas round(...).astype(str) would drop trailing zeros ("50.0 %")
df_membership_level['Percentage of sales'] = sales_share.map("{:.2f} %".format)
df_membership_level

Unnamed: 0_level_0,Sales Total,Percentage of sales
Membership Level,Unnamed: 1_level_1,Unnamed: 2_level_1
Normal,710,42.26 %
Member,670,39.88 %
Deluxe,300,17.86 %


# 39. openpyxl - save dataframes to different Excel workbook sheets

- Here we will save 2 dataframes to 2 separate sheets in an Excel workbook.
- With openpyxl you can use the dataframe_to_rows function to copy a dataframe to a new sheet

In [62]:
# Create a workbook
wb = Workbook()

In [63]:
# Create a new sheet
department_and_payment_type = wb.create_sheet(title="Department and Payment Type")

In [64]:
# Here we append rows from the 'by_dept_payment_type' dataframe to the excel sheet
# index=True keeps the row index values in the output; header=False leaves out the column header rows
for r in dataframe_to_rows(by_dept_payment_type, index=True, header=False):
    department_and_payment_type.append(r)

In [65]:
# Create a new sheet
membership_level_sales_percentage = wb.create_sheet(title="Membership Sales Percentage")

In [66]:
# Here we append rows from the 'df_membership_level' dataframe to the excel sheet
# index=True keeps the row index values in the output; header=False leaves out the column header rows
for r in dataframe_to_rows(df_membership_level, index=True, header=False):
    membership_level_sales_percentage.append(r)

- Finally, save the workbook as a workbook named 'sales_analysis.xlsx'.  The workbook should have the 2 dataframes above saved into the 2 sheets named above.

In [67]:
wb.save("sales_analysis.xlsx")